In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import warnings
import networkx

warnings.filterwarnings('ignore')

import re
import os

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import FreqDist

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from data_preprocesser import preprocessed_data_path, create_file, split_raw_csv_review_file

from textblob import TextBlob
verbs = ['VB','VBD','VBG','VBN','VBP','VBZ']
others = ['RBR','RBS','UH','FW','JJ','JJR','JJS']
tags = ['NN','NNS','NNP','NNPS']

raw_data_path = './data_yelp/raw/'
preprocessed_data_path = './data_yelp/preprocessed/'
reviews_type_path = 'categories_30000/yelp_academic_dataset_review_Auto Repair.csv'

# Processing functions

In [2]:
# Separer en phrases avec ponctuation




def identify_tokens(row):
    sentences = row['sentences']
    token_words = []
    for sentence in sentences :
        tokens = nltk.word_tokenize(sentence)
        # taken only words (not punctuation)
        # token_words = [re.sub("[^\w\s]", " ", w) for w in tokens]
        token_words.append([w for w in tokens if w.isalpha()])
    return token_words


def split_to_sentences(row):
    review = row['text']
    delimiters = ['?','!','.',',',';',':','\n','"','(',')']
    regexPattern = '|'.join(map(re.escape, delimiters))
    sentences = re.split(regexPattern,review)
    sentences = [sentence for sentence in sentences if sentence and sentence != ' ']
    return sentences

### Garder juste les noms, verbes, adjectifs et interjections

def select_tagged(row):
    sentences = row['sentences']
    useful_words = []
    for sentence in sentences :
        list_words = sentence.split(' ')
        list_words = [w for w in list_words if w]
        tagged_words = nltk.pos_tag(list_words)
        useful_words.append([w[0] for w in tagged_words if w[1] in tags and w[0] != 'i' and w[0] != 'u'])
    return useful_words

stop_words = stopwords.words('english')
stops_empiric = ['i','u',"it's","don't","they're",'wa',"didn't",'was','done','were','went','had','got','has','am',"i'm","i've"]
stops_empiric.extend(stop_words)



def remove_stops(row):
    sentences = row['words_useful']
    useful_words = []

    for sentence in sentences :
        new_sentence = []
        for w in sentence :
            if w not in stops_empiric :
                new_sentence.append(w)
        useful_words.append(new_sentence)
    return useful_words

### Enlever les mots trops polarisés

def unpolarized(row):
    sentences = row['words_useful']
    unpolarized_words = []
    for sentence in sentences :
        unpolarized_words.append([w for w in sentence if (abs(TextBlob(w).sentiment.polarity)<0.2 
            and abs(TextBlob(w).sentiment.polarity<0.2))])
    return unpolarized_words

stemming = PorterStemmer()

def stem_list(row):
    sentences = row['words_meaningful']
    stemmed_sentences = []
    for sentence in sentences :
        stemmed_sentences.append([stemming.stem(word) for word in sentence])
    return (stemmed_sentences)

lemming = WordNetLemmatizer()

def lem_list(row):
    sentences = row['words_meaningful']
    lemmed_sentences = []
    for sentence in sentences :
        lemmed_sentences.append([lemming.lemmatize(word) for word in sentence])
    return (lemmed_sentences)

def rejoin_words(row):
    sentences = row['words_meaningful']
    joined_words = []
    for sentence in sentences :
        joined_words.extend(sentence)
    return joined_words

def index_words(row):
    sentences = row['words_meaningful']
    all_words = row['sentences']
    indexes = []
    for j,sentence in enumerate(sentences) :
        indexes.append([(w,all_words[j].index(w)) for w in sentence])
    return indexes

In [3]:
def process(data_df):
    data_df['text'] = data_df['text'].str.lower()
    print(1)
    data_df['sentences'] = data_df.apply(split_to_sentences,axis=1)
    print(2)
    data_df['words'] = data_df.apply(identify_tokens,axis=1)
    print(3)
    data_df['words_useful'] = data_df.apply(select_tagged,axis=1)
    print(4)
    data_df['words_useful'] = data_df.apply(remove_stops,axis=1)
    print('4bis')
    data_df['words_meaningful'] = data_df.apply(unpolarized, axis=1)
    print(5)
    data_df['index_meaningful'] = data_df.apply(index_words,axis=1)
    print(6)
    data_df['words_lemmatized'] = data_df.apply(lem_list, axis=1)
    print(7)
    data_df['joined_words'] = data_df.apply(rejoin_words,axis=1)

    return data_df

In [4]:
df = pd.read_csv('data_yelp/preprocessed/categories_30000/yelp_academic_dataset_review_Auto Repair.csv')
processed_df = process(df)
processed_df.head(5)

1
2
3
4
4bis
5
6
7


Unnamed: 0,review_id,business_id,stars_x,stars_y,date,city,text,categories,sentences,words,words_useful,words_meaningful,index_meaningful,words_lemmatized,joined_words
0,JN0tmjiksWfr4i2NIFuPXw,pLZ9oZM8c6MNbRlg06lBPg,1,3.0,2016-08-03,Las Vegas,they sell wrecked vehicles! \ni purchased a 20...,Auto Repair;Car Dealers;Automotive;Auto Parts ...,"[they sell wrecked vehicles, i purchased a 201...","[[they, sell, wrecked, vehicles], [i, purchase...","[[vehicles], [altima], [miles, planet, nissan]...","[[vehicles], [altima], [miles, planet, nissan]...","[[(vehicles, 18)], [(altima, 26)], [(miles, 4)...","[[vehicle], [altima], [mile, planet, nissan], ...","[vehicles, altima, miles, planet, nissan, warr..."
1,lFmz61DNRzBPnrwH_cF42g,J0KZR0DDnvj73D-xmbh4uA,4,4.0,2017-06-15,Las Vegas,i stopped in here recently as i had to buy a t...,Auto Parts & Supplies;Auto Repair;Automotive,[i stopped in here recently as i had to buy a ...,"[[i, stopped, in, here, recently, as, i, had, ...","[[torque, wrench], [staff], [bit, money, tool]]","[[torque, wrench], [staff], [bit, money, tool]]","[[(torque, 45), (wrench, 52)], [(staff, 5)], [...","[[torque, wrench], [staff], [bit, money, tool]]","[torque, wrench, staff, bit, money, tool]"
2,R9p3ug8zNPvawYX_jsO0UA,Tknj3u-3CZTI-q52saTAIA,1,1.0,2012-09-12,Toronto,"if you want the ""i don't give a shit "" attitud...",Car Dealers;Auto Repair;Automotive,"[if you want the , i don't give a shit , atti...","[[if, you, want, the], [i, do, give, a, shit],...","[[], [shit], [attitude, service, staff, car, s...","[[], [], [attitude, service, staff, car, servi...","[[], [], [(attitude, 1), (service, 27), (staff...","[[], [], [attitude, service, staff, car, servi...","[attitude, service, staff, car, service, exper..."
3,FMXER_6R6URYBpV6TYPn9A,rV1nkTx58v_q7YGDkGzfxQ,5,5.0,2016-06-10,Las Vegas,where to start...\na few months ago i slammed....,Auto Repair;Automotive,"[where to start, a few months ago i slammed, m...","[[where, to, start], [a, few, months, ago, i, ...","[[], [months], [truck, curb], [wheel], [truck,...","[[], [months], [truck, curb], [wheel], [truck,...","[[], [(months, 6)], [(truck, 3), (curb, 16)], ...","[[], [month], [truck, curb], [wheel], [truck, ...","[months, truck, curb, wheel, truck, tire, repl..."
4,bJN2ZPa0jF4xgR1gPKZP3A,hCXknMQalY1QQZrdZGuBHg,3,3.5,2017-01-24,Las Vegas,i have mixed feelings here. i recently purchas...,Car Dealers;Auto Repair;Automotive,"[i have mixed feelings here, i recently purch...","[[i, have, mixed, feelings, here], [i, recentl...","[[feelings], [xj], [one, price, test, man, jag...","[[feelings], [xj], [one, price, test, man, jag...","[[(feelings, 13)], [(xj, 25)], [(one, 10), (pr...","[[feeling], [xj], [one, price, test, man, jag,...","[feelings, xj, one, price, test, man, jag, rid..."


### Save as a csv

In [5]:
path = 'data_yelp/preprocessed/yelp_academic_dataset_review_Auto Repair.csv'

processed_df.to_csv(path)



## Build graph

In [127]:
reviews_with_sentences = processed_df['words_lemmatized']

graph = {}
for review in reviews_with_sentences :
    for sentence in review :
        for w1 in sentence :
            if w1 in graph.keys() :
                for w2 in sentence :
                    if w2 in graph[w1].keys() :
                        graph[w1][w2] += 1
                    else :
                        graph[w1][w2] = 1
            else :
                graph[w1] = {}
                for w2 in sentence :
                    graph[w1][w2] = 1
print(graph['vehicle'])
            

        



## Graph with NetworkX

In [34]:
import networkx as nx
reviews_with_sentences = processed_df['words_lemmatized']
all_words = processed_df['joined_words'].tolist()
all_words = [inner for outer in all_words for inner in outer]

def create_graph(n):
    fdist = FreqDist(all_words)
    frequent_words = fdist.most_common(n)
    words = [word[0] for word in frequent_words]
    frequencies = [word[1] for word in frequent_words]
    fdist = FreqDist(all_words)
    G = nx.Graph()
    for i,review in enumerate(reviews_with_sentences) :
        for sentence in review :
            for w1 in sentence :
                if w1 in G.nodes :
                    for w2 in sentence :
                        if w2 in words :
                            if w2 in G.adj[w1] :
                                try :
                                    index = words.index(w1)
                                    G.edges[w1,w2]['weight'] += 1/frequencies[index]
                                except ValueError :
                                    print('Nope')
                            else :
                                if w2 not in G.nodes :
                                    G.add_node(w2)
                                    index = words.index(w2)
                                    weight = 1/frequencies[index]
                                    G.add_weighted_edges_from([(w1,w2,weight)])
                else :
                    if w1 in words :
                        G.add_node(w1)
                        for w2 in sentence :
                            if w2 not in G.nodes and w2 in words:
                                G.add_node(w2)
                                index = words.index(w2)
                                weight = 1/frequencies[index]
                                G.add_weighted_edges_from([(w1,w2,weight)])
    return G
G = create_graph(50)

#plt.subplot(121)
#nx.draw(G, with_labels=True, font_weight='bold')


## Test Louvain

In [15]:
help(community)

#from networkx.algorithms.community import greedy_modularity_communities

#first compute the best partition
#partition = greedy_modularity_communities(G)
G = community.best_partition(G)
#print(partition)


Help on package community:

NAME
    community

PACKAGE CONTENTS
    app

FILE
    c:\users\victo\appdata\local\programs\python\python37\lib\site-packages\community\__init__.py




AttributeError: module 'community' has no attribute 'best_partition'