In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pylab as plt
import math
from scipy import stats
import geopandas as gpd
%matplotlib inline
from functools import reduce

In [2]:
import importlib
import mr_word_count
importlib.reload(mr_word_count)
from mr_word_count import MRWordFrequencyCount
from mrjob.job import MRJob
import mapreduce as mr

In [3]:
import nltk
from bs4 import BeautifulSoup
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [55]:
df = pd.read_csv('InwoodYelpDescription.csv')

In [59]:
df.head()

Unnamed: 0,zipcode,all_reviews,Neighborhood
0,10034,Great crowd and really good music. I went ther...,Inwood
1,10034,I went there on a Wednesday \nGreat experience...,Inwood
2,10034,This is my second review but wanted to update ...,Inwood
3,10034,Sigh. Another uptown establishment overrun by ...,Inwood
4,10034,This place is nice and enjoyable if you do not...,Inwood


In [61]:
# tokenizer turns each string in the description column to a list of words

tokenizer = RegexpTokenizer('\s+', gaps=True)

In [62]:
df['all_reviews'] = df['all_reviews'].apply(lambda x: tokenizer.tokenize(x.lower()))

In [63]:
df['all_reviews'].head(10)

0    [great, crowd, and, really, good, music., i, w...
1    [i, went, there, on, a, wednesday, great, expe...
2    [this, is, my, second, review, but, wanted, to...
3    [sigh., another, uptown, establishment, overru...
4    [this, place, is, nice, and, enjoyable, if, yo...
5    [inwood, bar, &, grill, is, one, my, favorite,...
6    [after, a, long, walk, in, ft, tryon, park,, w...
7    [i've, been, here, once, and, i, promised, mys...
8    [oh, tenth, avenue!, who, the, hell, would've,...
9    [loved, it, here., they, have, a, simple, and,...
Name: all_reviews, dtype: object

In [64]:
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

In [65]:
df['all_reviews'] = df['all_reviews'].apply(lambda x: remove_stopwords(x))

In [66]:
# lemmatizer not totally sure what it does but it helps with the getting the gist of the word
# gotta read the towards data science post

lemmatizer = WordNetLemmatizer()

def word_lemmatizer(text):
    lem_text = [lemmatizer.lemmatize(i) for i in text]
    return lem_text

In [67]:
# if you'll notice here, we don't set the column equal to the updated applied version - I think that results in errors
# I tried before, it didn't work, and had to rerun code from the beginning, more than once

df['all_reviews'].apply(lambda x: word_lemmatizer(x))

0     [great, crowd, really, good, music., went, bru...
1     [went, wednesday, great, experience!, afterwor...
2     [second, review, wanted, update, ate, mother's...
3     [sigh., another, uptown, establishment, overru...
4     [place, nice, enjoyable, wait, 2, hour, play, ...
5     [inwood, bar, &, grill, one, favorite, restaur...
6     [long, walk, ft, tryon, park,, spied, cañave.,...
7     [i've, promised, first, last, time, dj, spinni...
8     [oh, tenth, avenue!, hell, would've, thought, ...
9     [loved, here., simple, straight, forward, happ...
10    [chill/laid, back, vibe, weekend, day., note-,...
11    [bad, customer, service, ., people, scam, mone...
12    [husband, love, indian, road, cafe!, eat, regu...
13    [beautiful, surprise, neighborhood,, excellent...
14    [would, highly, recommend, restaurant, looking...
15    [delivery, time., order, chicharrones, de, cer...
16    [great, cocktails,, warm, friendly, atmosphere...
17    [staff, amazing, customer, service, point.

In [69]:
# Okay, this is where it gets real clunky, and a for loop could've made life easier, but oh well
# I created brand new df's for each neighborhood (just duplicates essentially)
# each letter for a respective neighborhood

df['zipcode'] = df['zipcode'].astype(int)
df['all_reviews'] = df['all_reviews'].astype(str)

In [89]:
# then here, I created a 8 string var's (one for each neighborhood df)
# concatenate all the description strings for each neighborhood into one string

a = df['all_reviews'].str.cat()

In [90]:
# Here, the words_'letter' are lists of all of all the words
# the wordCount_'letter' are dictionaries of the words and their counts
# again, there's 8 of these, cause 8 df's / neighborhoods


from collections import Counter

words_a = a.split()
wordCount_a = Counter(words_a) #Chelsea

In [91]:
# The First Line
# Here, I turned the dictionaries into df's so based on count


# The Second Line
# rename the columns to word and count

# The Third Line
# so I could get a sorted df based on word count

#Chelsea
df1 = pd.DataFrame.from_dict(wordCount_a, orient='index').reset_index()
df1 = df1.rename(columns={"index": "word", 0:'count'})

In [92]:
# First Line -- replace punctuation with a blank space
# Second Line -- update the df so that if the value in the word column is blank, drop it
# Third & 4th Lines -- drop words from the df that only showed up once or twice


df1['word'] = df1['word'].str.replace(r'[^\w\s]+', '')
df1 = df1[df1.word != '']

In [111]:
#df1 = df1.sort_values('count', ascending=False)

In [93]:
df1 = df1.groupby(['word'], as_index=False)['count'].sum()
df1.drop( df1[ df1['count'] == 1 ].index , inplace=True)
df1.drop( df1[ df1['count'] == 2 ].index , inplace=True)
df1.head()

Unnamed: 0,word,count
8,2,3
15,3,3
21,4,5
24,5,5
53,although,3


In [98]:
a = df1.loc[df1['word'] == 'new', 'count'].reset_index(drop=True)
b = df1.loc[df1['word'] == 'york', 'count'].reset_index(drop=True)
c = a.sub(b)
c = c.values

In [99]:
print(a)

Series([], Name: count, dtype: int64)


In [100]:
df1.loc[df1.word == "new", "count"] = c
df1.head()

Unnamed: 0,word,count
8,2,3
15,3,3
21,4,5
24,5,5
53,although,3


In [101]:
len(df1)

164

In [102]:
#indexNames = dfObj[ dfObj['Age'] == 30 ].index
 
# Delete these row indexes from dataFrame
df1.drop(df1[ df1['word'] == 'york' ].index , inplace=True)

In [103]:
df1.head()

Unnamed: 0,word,count
8,2,3
15,3,3
21,4,5
24,5,5
53,although,3


In [104]:
df1 = df1.sort_values('count', ascending=False)
df1.head()

Unnamed: 0,word,count
600,place,52
372,great,41
327,food,28
366,good,21
525,music,17


In [105]:
len(df1)

164

In [106]:
# created new df's that took the top 150 values for each of the 8 neighborhood's / df's
# df100 corresponds to df1 which corresponds to Chelsea, so on and so forth...

df100 = df1.head(150)

In [107]:
df100 = df100.reset_index(drop=True)
df100.head()

Unnamed: 0,word,count
0,place,52
1,great,41
2,food,28
3,good,21
4,music,17


In [54]:
#df100.to_csv('Yelp LES.csv', index = False, header=True)