In [1]:
import pandas as pd
import nltk
import re

from itertools import chain
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import spacy
# Load the small English spaCy model with the 'parser' and 'ner' components disabled.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
# NLTK's English stop-word list, held as a set; the functions below test membership against it.
stopwords = set(nltk.corpus.stopwords.words('english'))
# Extra stop words shared through `global`. Starts as an empty dict; frameY rebinds it
# (to {} at the start of a run, and to a set of words when common words are excluded).
newStopWords = {}

from nltk.stem import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

# Stemmer instances created once at module level and reused by the stemming* helpers.
sball = SnowballStemmer('english')
spor = PorterStemmer()
slan = LancasterStemmer()

import time
import Filter as fil

In [2]:
def reading():
    """Load 'static/revSam.csv' into a DataFrame.

    The first CSV column is read as ``int`` and used as the index; the file is
    decoded as latin-1.

    Returns
    -------
    pandas.DataFrame
    """
    csv_path = 'static/revSam.csv'
    loaded = pd.read_csv(csv_path, dtype={0: int}, index_col=0, encoding='latin-1')
    return pd.DataFrame(loaded)

In [3]:
def randomize(dataset, number):
    """Return ``number`` rows sampled from ``dataset`` with a fixed seed.

    ``random_state=1`` makes the same rows come back on every call.
    """
    return dataset.sample(n=number, random_state=1)

# Runs when the cell executes: read the CSV and keep a fixed 100-row sample of it.
dataPart = randomize(reading(), 100)

In [4]:
def prepareData(item, granulation):
    """Reduce a review frame to a time bucket plus the review text.

    Parameters
    ----------
    item : pandas.DataFrame
        Must contain a 'date' column (anything ``pd.to_datetime`` accepts)
        and a 'text' column. The frame itself is not modified.
    granulation : str
        "Year"  -> the 'time' column holds the integer year;
        "Month" -> the 'time' column holds a 'YYYY-MM' string.

    Returns
    -------
    pandas.DataFrame
        Columns ['time', 'text'], rows ordered by ascending 'date'.

    Raises
    ------
    ValueError
        If ``granulation`` is neither "Year" nor "Month". Without this check
        the 'time' column would never be created and the final column
        selection would fail with an unrelated-looking KeyError.
    """
    if granulation not in ("Year", "Month"):
        raise ValueError(
            "granulation must be 'Year' or 'Month', got " + repr(granulation)
        )

    ordered = item.loc[:, ['date', 'text']].sort_values('date', ascending=True)
    # Parse the dates once and derive the bucket with the vectorised .dt accessor.
    parsed = pd.to_datetime(ordered['date'])
    if granulation == "Year":
        buckets = parsed.dt.year
    else:
        buckets = parsed.dt.strftime('%Y-%m')

    # assign() returns a new frame, so no column is written onto a slice.
    return ordered.assign(time=buckets).loc[:, ['time', 'text']]

In [5]:
def filterTime(item,time):
    """Return the 'text' values of the rows whose 'time' equals ``time``.

    Parameters
    ----------
    item : pandas.DataFrame
        Frame with 'time' and 'text' columns; it is not modified.
    time : object
        Bucket value to keep (e.g. a year or a 'YYYY-MM' string).

    Returns
    -------
    pandas.Series
        The matching texts, named ``str(time)``.
    """
    # Select with a boolean mask rather than dropping by index label: label-based
    # dropping also removes matching rows that share a label with a non-matching row.
    matches = item['time'] == time
    return item.loc[matches, 'text'].rename(str(time))

In [49]:
def tokenCount(item, top, getLemma, phrases):
    """Rank the most frequent words (or noun phrases) in a Series of texts.

    Parameters
    ----------
    item : pandas.Series
        Texts to analyse; its ``name`` labels the output columns.
    top : int or str
        Maximum number of rows to return (converted with ``int``).
    getLemma : str
        "True"/"true" lemmatises with spaCy, "stem"/"Stem" uses Snowball,
        "port"/"Port" uses Porter, "lanc"/"Lanc" uses Lancaster; any other
        value leaves the tokens as they are. Ignored when ``phrases`` is on.
    phrases : str
        "True"/"true" counts TextBlob noun phrases instead of single words.

    Returns
    -------
    pandas.DataFrame
        Columns 'Words: <name>' and 'Count: <name>', most frequent first.
        Entries found in ``stopwords`` or ``newStopWords`` are not counted.
    """
    global newStopWords

    if phrases in ("True", "true"):
        # One list of noun phrases per text, flattened into a single list.
        tokens = list(chain.from_iterable(item.apply(phrasesTB).tolist()))
    else:
        splitter = nltk.tokenize.RegexpTokenizer(r'\w+')
        tokens = list(chain.from_iterable(item.apply(splitter.tokenize).tolist()))
        if getLemma in ("True", "true"):
            print("Just keep lemming!")
            tokens = lemmatizingSpacy(tokens, True)
        elif getLemma in ("stem", "Stem"):
            print("Just keep snowing")
            tokens = stemmingSnowball(tokens, True)
        elif getLemma in ("port", "Port"):
            print("Just keep porting")
            tokens = stemmingPorter(tokens, True)
        elif getLemma in ("lanc", "Lanc"):
            print("Just keep lancaster")
            tokens = stemmingLancaster(tokens, True)

    # Lower-case first, then discard anything listed as a stop word.
    lowered = (token.lower() for token in tokens)
    counted = (
        token for token in lowered
        if token not in stopwords and token not in newStopWords
    )
    frequencies = nltk.FreqDist(counted)

    column = item.name
    ranking = pd.Series(frequencies).to_frame(column)
    ranking = ranking.sort_values(by=[column], ascending=False)

    limit = min(int(top), ranking.size)
    result = ranking.head(limit).reset_index()

    label = str(column)
    return result.rename(columns={
        result.columns[0]: 'Words: ' + label,
        result.columns[1]: 'Count: ' + label,
    })

In [27]:
def tf_idf(item, top, getLemma, phrases, common):
    """Rank terms in a Series of texts by their summed TF-IDF weight.

    Parameters
    ----------
    item : pandas.Series
        Texts to analyse; its ``name`` labels the output columns.
    top : int or str
        Maximum number of rows to return (converted with ``int``).
    getLemma : str
        "True"/"true" lemmatises with spaCy, "stem"/"Stem" uses Snowball,
        "port"/"Port" uses Porter, "lanc"/"Lanc" uses Lancaster; any other
        value only removes stop words. The accepted values match tokenCount,
        which receives the same argument from frameY. Ignored when
        ``phrases`` is on.
    phrases : str
        "True"/"true" hands the raw texts to wWeighting, which then weights
        word n-grams instead of single words.
    common : str
        Passed through to wWeighting unchanged.

    Returns
    -------
    pandas.DataFrame
        Columns 'Words: <name>' and 'Count: <name>' (weights rounded to two
        decimals), highest weight first.
    """
    if phrases not in ("True", "true"):
        if getLemma in ("True", "true"):
            print("Just keep lemming!")
            normalise = lemmatizingSpacy
        elif getLemma in ("stem", "Stem"):
            print("Just keep snowing")
            normalise = stemmingSnowball
        elif getLemma in ("port", "Port"):
            print("Just keep porting")
            normalise = stemmingPorter
        elif getLemma in ("lanc", "Lanc"):
            print("Just keep lancaster")
            normalise = stemmingLancaster
        else:
            normalise = None

        if normalise is None:
            # No normalisation: strip stop words and keep each text as a string.
            item = item.apply(removeSWSentance, args=(False,))
        else:
            # Strip stop words into token lists, normalise each token, then
            # join back into one string per text for the vectoriser.
            item = item.apply(removeSWSentance, args=(True,))
            item = item.apply(normalise, args=(False,))

    weights = wWeighting(item, phrases, common)

    column = item.name
    ranking = pd.Series(weights).to_frame(column)
    ranking = ranking.sort_values(by=[column], ascending=False)

    limit = min(int(top), ranking.size)
    result = ranking.head(limit).reset_index()

    label = str(column)
    result = result.rename(columns={
        result.columns[0]: 'Words: ' + label,
        result.columns[1]: 'Count: ' + label,
    })
    return result.round({result.columns[1]: 2})

In [8]:
def frameY(item, common, top, getLemma, frequency, phrases):
    """Build the ranking table: an 'Overall' column pair followed by one pair per time bucket.

    Parameters
    ----------
    item : pandas.DataFrame
        Frame with 'time' and 'text' columns.
    common : str
        "False"/"false" turns the words of the overall ranking into extra
        stop words (``newStopWords``) for the per-bucket rankings.
    top, getLemma, phrases
        Forwarded to tf_idf / tokenCount.
    frequency : str
        "True"/"true" ranks with tf_idf, anything else with tokenCount.

    Returns
    -------
    pandas.DataFrame
        Rankings side by side, with a 1-based index named 'Place'.

    Side effects: rebinds the global ``newStopWords`` and prints the elapsed time.
    """
    global newStopWords

    started = time.time()
    newStopWords = {}

    use_tfidf = frequency in ("True", "true")

    def rank(series):
        # Single place that picks the ranking function for this run.
        if use_tfidf:
            return tf_idf(series, top, getLemma, phrases, common)
        return tokenCount(series, top, getLemma, phrases)

    overall = rank(item.rename(columns={'text': "Overall"})["Overall"])

    if common in ("False", "false"):
        newStopWords = set(overall["Words: Overall"].str.lower())

    per_bucket = [filterTime(item, bucket) for bucket in item.time.unique()]
    combined = pd.concat([rank(series) for series in per_bucket], axis=1)
    combined = pd.concat([overall, combined], axis=1)

    combined.index += 1
    combined.index.names = ['Place']

    finished = time.time()
    print("frameY :" + str(finished - started))

    return combined

In [9]:
def lemmatizingSpacy(item, isList):
    """Return the spaCy lemma of every word in ``item``.

    Parameters
    ----------
    item : sequence of str
        Pre-tokenised words; they are wrapped in a ``Doc`` built on ``nlp.vocab``.
    isList : bool
        Truthy -> return a list of lemmas; falsy -> return them joined by spaces.

    NOTE(review): the Doc is constructed directly from the words and no pipeline
    component is called on it here - confirm that ``lemma_`` is populated under
    the installed spaCy version.
    """
    doc = spacy.tokens.Doc(nlp.vocab, words=item)
    lemmas = [token.lemma_ for token in doc]
    return lemmas if isList else " ".join(lemmas)

In [46]:
def stemmingSnowball(item, isList):
    """Stem every element of ``item`` with the shared Snowball stemmer ``sball``.

    Returns a list of stems when ``isList`` is truthy, otherwise the stems
    joined by single spaces.
    """
    stems = [sball.stem(token) for token in item]
    if not isList:
        return " ".join(stems)
    return stems

In [47]:
def stemmingLancaster(item, isList):
    """Stem every element of ``item`` with the shared Lancaster stemmer ``slan``.

    Returns a list of stems when ``isList`` is truthy, otherwise the stems
    joined by single spaces.
    """
    stems = [slan.stem(token) for token in item]
    if not isList:
        return " ".join(stems)
    return stems

In [48]:
def stemmingPorter(item, isList):
    """Stem every element of ``item`` with the shared Porter stemmer ``spor``.

    Returns a list of stems when ``isList`` is truthy, otherwise the stems
    joined by single spaces.
    """
    stems = [spor.stem(token) for token in item]
    if not isList:
        return " ".join(stems)
    return stems

In [11]:
def removeSWSentance(sentence, isList):
    """Tokenise ``sentence`` on ``\\w+`` and drop stop words.

    A token is removed when its lower-cased form is in ``stopwords`` or in the
    global ``newStopWords``, or when the token is exactly "-PRON-".

    Returns the surviving tokens as a list when ``isList`` is truthy,
    otherwise joined by single spaces.
    """
    global newStopWords

    tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(sentence)

    kept = []
    for token in tokens:
        lowered = token.lower()
        if lowered in stopwords or lowered in newStopWords or token == "-PRON-":
            continue
        kept.append(token)

    return kept if isList else " ".join(kept)

In [12]:
def wWeighting(item, phrases, common):
    """Sum TF-IDF weights per term over all texts in ``item``.

    Parameters
    ----------
    item : pandas.Series
        One text per entry; ``item.name`` is only used in the timing message.
    phrases : str
        "True"/"true" weights word n-grams of length 2-5 with English stop
        words removed; anything else uses a default ``TfidfVectorizer``.
    common : str
        With phrases on, "False"/"false" additionally runs
        ``remove_stop_phrases`` as the vectoriser's preprocessor.

    Returns
    -------
    pandas.Series
        Summed weight per term, indexed by term, highest first.

    Side effects: prints the elapsed time.
    """
    import numpy as np  # local import: the notebook's import cell does not bring in numpy

    started = time.time()

    if phrases in ("True", "true"):
        options = {'ngram_range': (2, 5), 'stop_words': 'english'}
        if common in ("False", "false"):
            options['preprocessor'] = remove_stop_phrases
        vectorizer = TfidfVectorizer(**options)
    else:
        vectorizer = TfidfVectorizer()

    weights = vectorizer.fit_transform(item)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # fall back to it only on releases that predate get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    # Sum each term's column on the sparse matrix directly; building a dense
    # documents x terms table just to add up its columns is unnecessary.
    totals = np.asarray(weights.sum(axis=0)).ravel()
    top_words = pd.Series(totals, index=terms).sort_values(ascending=False)

    finished = time.time()
    # str() so a Series whose name is None or numeric does not break the message.
    print("wWeighting - " + str(item.name) + " :" + str(finished - started))

    return top_words

In [13]:
def remove_stop_phrases(doc):
    """Delete every phrase held in the global ``newStopWords`` from ``doc``.

    Matching is case-insensitive and literal: each phrase is escaped before it
    is used as a pattern, so characters such as '.', '+' or '(' in a phrase
    match themselves instead of acting as regex operators (or raising
    ``re.error``).

    Parameters
    ----------
    doc : str
        Text to clean.

    Returns
    -------
    str
        ``doc`` with all occurrences of the phrases removed.
    """
    global newStopWords

    for phrase in newStopWords:
        doc = re.sub(re.escape(phrase), "", doc, flags=re.IGNORECASE)
    return doc

In [14]:
def phrasesTB(text):
    """Return the ``noun_phrases`` that TextBlob reports for ``text``."""
    return TextBlob(text).noun_phrases

In [15]:
def main(item, boolean, top, granulation, getLemma, frequency, phrases):
    """Entry point: bucket the reviews by time, then build the ranking table.

    ``item`` and ``granulation`` go to prepareData; its result is passed to
    frameY together with ``boolean`` (frameY's ``common`` argument), ``top``,
    ``getLemma``, ``frequency`` and ``phrases``. Returns whatever frameY returns.
    """
    return frameY(prepareData(item, granulation), boolean, top, getLemma, frequency, phrases)

In [21]:
# Keep the first 1000 rows returned by the project-local Filter module.
# NOTE(review): the two date-like strings presumably bound a date range and the empty
# strings presumably leave other filters unset - confirm against Filter.main.
filtered = fil.main('', '', "01.01.2010", "31.12.2016", "", "", "").head(1000)
filtered

Unnamed: 0_level_0,business_id,stars,date,text,name,postal_code,ccur
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
444610,4JNXUYY8wbaaDmk3BPzlWw,4,2015-07-25,Located next to the Eiffel tower and across th...,"""Mon Ami Gabi""",89109,7328
439720,4JNXUYY8wbaaDmk3BPzlWw,5,2015-09-13,My wife made reservations for a party of 10. W...,"""Mon Ami Gabi""",89109,7328
439719,4JNXUYY8wbaaDmk3BPzlWw,5,2013-09-22,This place has over 3000+ yelp reviews so we d...,"""Mon Ami Gabi""",89109,7328
439718,4JNXUYY8wbaaDmk3BPzlWw,1,2015-01-21,We made a reservation for 845 we show up at 84...,"""Mon Ami Gabi""",89109,7328
439716,4JNXUYY8wbaaDmk3BPzlWw,5,2013-06-01,OH MY GOD. \r\r\r\nThis place was so delicious...,"""Mon Ami Gabi""",89109,7328
...,...,...,...,...,...,...,...
444856,4JNXUYY8wbaaDmk3BPzlWw,5,2016-04-16,This is my 2nd time visiting Mon Ami Gabi and ...,"""Mon Ami Gabi""",89109,7328
444855,4JNXUYY8wbaaDmk3BPzlWw,4,2011-01-05,"From Wikipedia, ""Snail is a common name for al...","""Mon Ami Gabi""",89109,7328
444841,4JNXUYY8wbaaDmk3BPzlWw,4,2011-01-07,"As a belated birthday present, I wanted to tak...","""Mon Ami Gabi""",89109,7328
444830,4JNXUYY8wbaaDmk3BPzlWw,4,2014-03-23,"Une brasserie vraiment franÃ§ais, on the Las V...","""Mon Ami Gabi""",89109,7328


In [63]:
# Time one end-to-end run: yearly buckets, top 10 rows, Snowball stemming ("stem"),
# raw token counts (frequency="false"), single words (phrases="false"), and common
# words kept (the "true" argument is frameY's `common`).
start = time.time()
nakraq = main(filtered, "true", "10", "Year", "stem", "false", "false")
end = time.time()
# Wall-clock seconds for the whole call, including frameY's own timing print.
print("Svurshi: " + str(end - start))
nakraq

Just keep snowing
Just keep snowing
Just keep snowing
Just keep snowing
Just keep snowing
Just keep snowing
Just keep snowing
Just keep snowing
frameY :3.3381097316741943
Svurshi: 3.348048210144043


Unnamed: 0_level_0,Words: Overall,Count: Overall,Words: 2010,Count: 2010,Words: 2011,Count: 2011,Words: 2012,Count: 2012,Words: 2013,Count: 2013,Words: 2014,Count: 2014,Words: 2015,Count: 2015,Words: 2016,Count: 2016
Place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,good,715,good,66,good,97,good,88,food,127,food,135,great,129,food,122
2,food,712,steak,55,food,82,great,84,good,127,great,125,food,122,good,108
3,great,678,great,53,french,71,food,79,great,120,place,120,good,117,great,100
4,order,563,order,47,restaur,71,order,70,place,106,good,112,order,117,servic,95
5,place,536,food,45,great,67,steak,67,veri,100,steak,104,servic,110,veri,84
6,steak,532,place,39,order,67,servic,64,steak,92,time,95,veri,93,order,83
7,servic,526,french,36,veri,66,veri,57,vega,88,vega,95,steak,83,place,76
8,veri,507,like,35,like,65,place,55,order,88,order,91,restaur,83,steak,73
9,vega,464,time,34,vega,61,french,53,servic,85,servic,88,place,82,restaur,72
10,time,438,view,33,patio,58,vega,50,french,74,view,78,vega,76,time,70


In [270]:
from nltk.corpus import wordnet
class AntonymReplacer(object):
    """Replace negated words in a token list with a WordNet antonym."""

    # Tokens that replace_negations treats as negations.
    NEGATIONS = ('not', 'no', 'never', 'nor', 'neither')

    def replace(self, word, pos=None):
        """Return an antonym of ``word`` found in WordNet, or None if there is none.

        All antonyms of all lemmas of all synsets of ``word`` (optionally
        restricted to part of speech ``pos``) are collected. When more than one
        exists, the alphabetically first is returned so the result is the same
        on every run; popping from the set would pick an arbitrary element.
        """
        antonyms = {
            antonym.name()
            for syn in wordnet.synsets(word, pos=pos)
            for lemma in syn.lemmas()
            for antonym in lemma.antonyms()
        }
        if not antonyms:
            return None
        return min(antonyms)

    def replace_negations(self, sent):
        """Return a copy of the token list ``sent`` with negations folded into antonyms.

        A negation token followed by a word that has an antonym is replaced,
        together with that word, by the antonym. A negation at the end of the
        list, or one followed by a word without an antonym, is kept as it is.
        """
        words = []
        position, length = 0, len(sent)

        while position < length:
            word = sent[position]

            if word in self.NEGATIONS and position + 1 < length:
                antonym = self.replace(sent[position + 1])
                if antonym:
                    # Consume both the negation and the word it negates.
                    words.append(antonym)
                    position += 2
                    continue

            words.append(word)
            position += 1

        return words
    
# Smoke test: none of these tokens is one of the negation words replace_negations looks
# for, so the recorded output is the input list unchanged.
replacer = AntonymReplacer()
replacer.replace_negations(["musn't", 'antonym', 'uglify', 'our', 'ugly'])
#replacer.replace('rarely')

["musn't", 'antonym', 'uglify', 'our', 'ugly']

In [303]:
from textblob import TextBlob
#testimonial = TextBlob()
#testimonial.sentiment

def detect_polarity(text):
    """Return the ``sentiment.polarity`` value TextBlob computes for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity


# Work on a copy so the new column is not added to `filtered` itself.
filtered2 = filtered.copy()

# One TextBlob polarity score per review text.
filtered2['polarity'] = filtered2.text.apply(detect_polarity)
# Not the last expression of the cell, so this bare name displays nothing.
filtered2

# Last expression of the cell: this value is what the cell displays.
detect_polarity("The most infuriating!")

#davidim = filtered2.apply(TextBlob()).tolist()  

-0.125

In [137]:
def lemmatizingSentance(sentence):
    """Tokenise ``sentence`` on ``\\w+`` and return its lemmas minus stop words.

    The tokens are wrapped in a spaCy ``Doc`` built on ``nlp.vocab``. A lemma is
    dropped when its lower-cased form is in ``stopwords`` or in the global
    ``newStopWords``, or when it is exactly "-PRON-". The remaining lemmas are
    returned joined by single spaces.
    """
    global newStopWords

    words = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(sentence)

    kept = []
    for token in spacy.tokens.Doc(nlp.vocab, words=words):
        lemma = token.lemma_
        lowered = lemma.lower()
        if lowered in stopwords or lowered in newStopWords or lemma == "-PRON-":
            continue
        kept.append(lemma)

    return " ".join(kept)

In [2]:
# Two toy documents used by the TF-IDF demonstration cell below.
documentA = 'The man went out for a walk'
documentB = 'the children sat around the fire'

In [8]:
# Fit TF-IDF on the two toy documents and show the weights as a documents x terms table.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; use
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
df

Unnamed: 0,around,children,fire,for,man,out,sat,the,walk,went
0,0.0,0.0,0.0,0.42616,0.42616,0.42616,0.0,0.303216,0.42616,0.42616
1,0.407401,0.407401,0.407401,0.0,0.0,0.0,0.407401,0.579739,0.0,0.0


In [5]:
# NOTE(review): the recorded output shows documentA starting with a lower-case 'the',
# while the assignment earlier in the notebook uses 'The man ...' - presumably this cell
# was last run against an earlier value of documentA.
[documentA, documentB]

['the man went out for a walk', 'the children sat around the fire']