In [95]:
import numpy as np
import pandas as pd
from glob import glob
# merge csv files
def merge_files(SOURCE):
    """Read every file in a directory as CSV and stack them into one DataFrame.

    Each file is loaded with ``pd.read_csv``. The columns ``business_id``,
    ``city``, ``date``, ``funny``, ``state``, ``useful`` and ``cool`` are
    dropped and ``stars_y`` is renamed to ``review_stars`` before the
    per-file frames are concatenated row-wise.

    Parameters
    ----------
    SOURCE : str
        Directory whose direct children are all read as CSV files.

    Returns
    -------
    pandas.DataFrame
        Rows of all files in sorted-path order, re-indexed from 0, with the
        union of the per-file columns in sorted order.

    Raises
    ------
    ValueError
        If nothing is found under ``SOURCE``.
    KeyError
        If a file lacks one of the dropped columns (raised by ``drop``).
    """
    files_list = sorted(glob(SOURCE + '/*'))
    if not files_list:
        # pd.concat would fail with the unhelpful "No objects to concatenate".
        raise ValueError('No files found in {!r}; nothing to merge'.format(SOURCE))

    df_list = []
    for path in files_list:
        df = pd.read_csv(path)
        df = df.drop(columns=['business_id', 'city', 'date', 'funny', 'state', 'useful', 'cool'])
        df = df.rename(columns={'stars_y': 'review_stars'})
        df_list.append(df)
    return pd.concat(df_list, join='outer', ignore_index=True, axis=0, sort=True)

# Load every file of the data directory, then keep one star column and call
# it 'rating'.
SOURCE = 'AZ_top20_categories_csv'
yelp = (
    merge_files(SOURCE)
    .drop(columns=['stars', 'stars_x'])
    .rename(columns={'review_stars': 'rating'})
)
print(yelp.shape)
yelp.head()

(3010486, 3)


Unnamed: 0,categories,rating,text
0,Active Life,5,Douglas W. has it right. You're basically on ...
1,Active Life,2,"Nice place, but they think nothing of running ..."
2,Active Life,5,Absolutely beautiful! I think I could stay her...
3,Active Life,5,I see 4 star ratings and can't figure out how ...
4,Active Life,1,"In the 8 years I have owned my mountain bike, ..."


In [154]:
# Inspect the dtype of each column of the merged frame.
yelp.dtypes

categories    object
rating         int64
text          object
dtype: object

In [28]:
# Count missing values per column.
yelp.isnull().sum()

categories    0
rating        0
text          0
dtype: int64

In [34]:
# Collapse the 1-5 star scale into two classes: 4 -> 5 and 2, 3 -> 1.
# `replace` leaves the unlisted values (1 and 5) as they are, whereas `map`
# with the same dict would turn them into NaN.
# errors='ignore' keeps the cell runnable when the leftover 'new_rating'
# column is absent, as it is in the frame produced by the loading cell.
yelp = yelp.drop(columns=['new_rating'], errors='ignore')
yelp['rating'] = yelp['rating'].replace({4: 5, 2: 1, 3: 1})
yelp['rating'].value_counts()

5    2044942
1     965544
Name: rating, dtype: int64

In [36]:
# Show the first five rows of yelp.
yelp.head()

Unnamed: 0,categories,rating,text
0,Active Life,5,Douglas W. has it right. You're basically on ...
1,Active Life,1,"Nice place, but they think nothing of running ..."
2,Active Life,5,Absolutely beautiful! I think I could stay her...
3,Active Life,5,I see 4 star ratings and can't figure out how ...
4,Active Life,1,"In the 8 years I have owned my mountain bike, ..."


In [31]:
import pandas as pd
import spacy
import scattertext

In [17]:
# List the distinct category labels, ordered from most to least frequent.
yelp['categories'].value_counts().index

Index(['Restaurants', 'Food', 'Nightlife', 'Shopping', 'Beauty & Spas',
       'Home Services', 'Event Planning & Services', 'Automotive',
       'Health & Medical', 'Local Services', 'Fast Food', 'Active Life',
       'Auto Repair', 'Doctors', 'Hair Salons', 'Home & Garden', 'Fashion',
       'Professional Services', 'Real Estate', 'Contractors'],
      dtype='object')

In [54]:
import pandas as pd
# Widen pandas' display limits so that all columns and up to 200 rows are shown.
for option_name, option_value in (('display.max_columns', 500),
                                  ('display.max_rows', 200)):
    pd.set_option(option_name, option_value)

In [96]:
# Work on an independent copy of the Shopping reviews: assigning to a column
# of the filtered slice would trigger pandas' SettingWithCopyWarning.
Shopping_yelp = yelp[yelp['categories'] == 'Shopping'].copy()
# number -> float -> str turns 5 into the label '5.0', the form used for the
# category names passed to get_scaled_f_scores further down.
Shopping_yelp['rating'] = Shopping_yelp['rating'].astype('float').astype('str')
print(Shopping_yelp.shape)
Shopping_yelp.head()

(166680, 3)


Unnamed: 0,categories,rating,text
2843806,Shopping,1.0,Went out of my way to get paint at this store ...
2843807,Shopping,1.0,This company tried deliver flowers to our home...
2843808,Shopping,1.0,This the worst flower place that I've ever enc...
2843809,Shopping,5.0,A great spot to grab a shake right at 51 Freew...
2843810,Shopping,5.0,We were just in Arizona visiting family and st...


In [111]:
# Collapse the string labels into two classes ('4.0' -> '5.0'; '2.0' and
# '3.0' -> '1.0'). `assign` builds a new frame, so nothing is written into a
# filtered slice of `yelp` (no SettingWithCopyWarning).
Shopping_yelp = Shopping_yelp.assign(
    rating=Shopping_yelp['rating'].replace({'4.0': '5.0', '2.0': '1.0', '3.0': '1.0'})
)
# Share of reviews carrying each label.
Shopping_yelp['rating'].value_counts()/len(Shopping_yelp['rating'])

5.0    0.67146
1.0    0.32854
Name: rating, dtype: float64

In [101]:
# Share of Shopping reviews carrying each rating label.
Shopping_yelp['rating'].value_counts()/len(Shopping_yelp['rating'])

5.0    0.542945
1.0    0.211225
4.0    0.128516
3.0    0.059209
2.0    0.058105
Name: rating, dtype: float64

In [105]:
# Number of Shopping reviews carrying each rating label.
Shopping_yelp['rating'].value_counts()

5.0    90498
1.0    35207
4.0    21421
3.0     9869
2.0     9685
Name: rating, dtype: int64

In [104]:
# Draw half of the Shopping reviews. The fixed random_state makes the sample,
# and every result derived from it, identical on each run.
Shopping_yelp_sample = Shopping_yelp.sample(frac=.5, random_state=42)
print(Shopping_yelp_sample.shape)
# Label shares within the sample, for comparison with the full subset.
Shopping_yelp_sample['rating'].value_counts()/len(Shopping_yelp_sample['rating'])

(83340, 3)


5.0    0.540605
1.0    0.212683
4.0    0.128762
3.0    0.059827
2.0    0.058123
Name: rating, dtype: float64

In [108]:
# Inspect the column dtypes of the sample.
Shopping_yelp_sample.dtypes

categories    object
rating        object
text          object
dtype: object

In [109]:
# Merge the string labels into two classes: '4.0' joins '5.0', while '2.0'
# and '3.0' join '1.0'.
Shopping_yelp_sample['rating'] = (
    Shopping_yelp_sample['rating']
    .replace({'4.0':'5.0', '2.0':'1.0', '3.0':'1.0'})
)
Shopping_yelp_sample['rating'].value_counts()

5.0    55785
1.0    27555
Name: rating, dtype: int64

In [110]:
# Share of sampled reviews carrying each rating label.
Shopping_yelp_sample['rating'].value_counts()/len(Shopping_yelp_sample['rating'])

5.0    0.669366
1.0    0.330634
Name: rating, dtype: float64

In [41]:
# Load the spaCy pipeline "en_core_web_lg"; it is handed to scattertext as
# `nlp=` and its Defaults.stop_words set is extended in a later cell.
nlp = spacy.load("en_core_web_lg")

In [112]:
# add stop words
# Extend spaCy's default stop-word set with the entries of stopwords.txt
# (one term per line).
with open('stopwords.txt', 'r') as f:
    # Not named `str`: that would shadow the builtin for the rest of the
    # kernel session.
    stopwords_text = f.read()
# Skip blank lines and surrounding whitespace so that the empty string is not
# registered as a stop word.
set_stopwords = {line.strip() for line in stopwords_text.splitlines() if line.strip()}
nlp.Defaults.stop_words |= set_stopwords

In [None]:
# Build a scattertext corpus from the sampled reviews, with 'rating' as the
# category and 'text' as the document, then remove the stop words from its
# terms (stop words that never occur in the corpus are skipped silently).
corpus_factory = scattertext.CorpusFromPandas(
    Shopping_yelp_sample,
    category_col='rating',
    text_col='text',
    nlp=nlp,
)
corpus = corpus_factory.build().remove_terms(
    nlp.Defaults.stop_words,
    ignore_absences=True,
)

In [None]:
# Per-term frequencies for each rating category, plus the value returned by
# corpus.get_scaled_f_scores for the '5.0' and the '1.0' category.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['highratingscore'] = corpus.get_scaled_f_scores('5.0')
term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores('1.0')

# Terms ordered by their '5.0' score. Both score columns are rounded to two
# decimals after sorting, and the term index becomes a regular column.
df_high = (
    term_freq_df
    .sort_values(by='highratingscore', ascending=False)
    .round({'highratingscore': 2, 'poorratingscore': 2})
    .reset_index(drop=False)
)

# Same table ordered by the '1.0' score.
df_poor = (
    term_freq_df
    .sort_values(by='poorratingscore', ascending=False)
    .round({'highratingscore': 2, 'poorratingscore': 2})
    .reset_index(drop=False)
)

In [None]:
# Keep the two ranked term tables under Shopping-specific names.
Shopping_yelp_sample_high, Shopping_yelp_sample_poor = df_high, df_poor

In [119]:
# The 100 terms with the highest frequency in the '5.0' category.
Shopping_yelp_sample_high.sort_values(by='5.0 freq', ascending = False).head(100)

Unnamed: 0,term,5.0 freq,1.0 freq,highratingscore,poorratingscore
1245604,store,18755,17941,0.12,0.88
1244761,service,15985,13303,0.14,0.86
2483,good,12253,6143,0.91,0.09
2685,new,11188,5889,0.9,0.1
453,friendly,10763,1819,0.97,0.03
1282,staff,10484,3434,0.94,0.06
3255,and the,10252,6134,0.89,0.11
558,best,9815,1912,0.97,0.03
406,helpful,9431,1463,0.97,0.03
452,recommend,9158,1543,0.97,0.03


In [118]:
# The 100 terms with the highest frequency in the '1.0' category.
Shopping_yelp_sample_poor.sort_values(by='1.0 freq', ascending = False).head(100)

Unnamed: 0,term,5.0 freq,1.0 freq,highratingscore,poorratingscore
3886,store,18755,17941,0.12,0.88
4729,service,15985,13303,0.14,0.86
2437,customer,8848,11209,0.09,0.91
408,said,3272,11054,0.03,0.97
1717,$,7074,10771,0.08,0.92
3026,to the,7544,8476,0.1,0.9
3856,went,8079,7766,0.12,0.88
2878,customer service,6603,7624,0.1,0.9
2508,the store,5817,7270,0.09,0.91
3244,on the,6500,7039,0.11,0.89


In [94]:
# The 100 terms with the highest '5.0 freq' for Auto Repair.
# NOTE(review): Auto_Repair_yelp_high is not created in any visible cell —
# presumably built by re-running the Shopping steps for 'Auto Repair'; confirm.
Auto_Repair_yelp_high.sort_values(by='5.0 freq', ascending = False).head(100)

Unnamed: 0,term,5.0 freq,1.0 freq,highratingscore,poorratingscore
2976,car,40357,41815,0.87,0.13
1849,service,26490,19259,0.91,0.09
1219,work,15144,8105,0.93,0.07
2299,my car,14244,12133,0.89,0.11
2116,new,10707,8581,0.9,0.1
1172,experience,10665,5513,0.94,0.06
1108,shop,9506,4745,0.94,0.06
371,recommend,9271,2003,0.98,0.02
2210,i have,8976,7421,0.89,0.11
3098,of the,8714,9344,0.87,0.13


In [93]:
# The 100 terms with the highest '1.0 freq' for Auto Repair.
# NOTE(review): Auto_Repair_yelp_poor is not created in any visible cell —
# presumably built by re-running the Shopping steps for 'Auto Repair'; confirm.
Auto_Repair_yelp_poor.sort_values(by='1.0 freq', ascending = False).head(100)

Unnamed: 0,term,5.0 freq,1.0 freq,highratingscore,poorratingscore
881356,car,40357,41815,0.87,0.13
882483,service,26490,19259,0.91,0.09
463,$,5162,16854,0.05,0.95
372,said,3996,14426,0.04,0.96
1747,the car,7244,12467,0.1,0.9
882033,my car,14244,12133,0.89,0.11
2857,vehicle,8222,10800,0.13,0.87
2537,dealership,7487,10499,0.12,0.88
2932,it was,7809,10111,0.13,0.87
2167,oil,6387,9750,0.11,0.89


In [86]:
# The 100 terms with the highest '5.0 freq' for Hair Salons.
# NOTE(review): Hair_Salons_yelp_high is not created in any visible cell —
# presumably built by re-running the Shopping steps for 'Hair Salons'; confirm.
Hair_Salons_yelp_high.sort_values(by='5.0 freq', ascending = False).head(100)

Unnamed: 0,term,5.0 freq,1.0 freq,highratingscore,poorratingscore
1502,hair,42750,15626,0.92,0.08
1470,my hair,22150,7970,0.92,0.08
1031,salon,16158,4370,0.94,0.06
1565,cut,15273,5798,0.92,0.08
1171,i have,10864,3286,0.93,0.07
1334,color,9242,3090,0.93,0.07
1763,stylist,8204,3432,0.91,0.09
195,best,7913,571,0.99,0.01
766,i 've,7710,1642,0.95,0.05
626,job,7649,1358,0.96,0.04


In [2]:
# The 100 terms with the highest '1.0 freq' for Hair Salons.
# NOTE(review): the recorded run of this cell ended in a NameError;
# Hair_Salons_yelp_poor is not created in any visible cell and must be built
# before this line can run.
Hair_Salons_yelp_poor.sort_values(by='1.0 freq', ascending = False).head(100)

NameError: name 'Hair_Salons_yelp_poor' is not defined

In [None]:
# All Fashion terms ordered by descending '5.0 freq' (no head() limit here).
# NOTE(review): Fashion_yelp_high is not created in any visible cell — confirm
# where it is built.
Fashion_yelp_high.sort_values(by='5.0 freq', ascending = False)

In [None]:
# The 100 terms with the highest '1.0 freq' for Fashion.
# NOTE(review): Fashion_yelp_poor is not created in any visible cell — confirm
# where it is built.
Fashion_yelp_poor.sort_values(by='1.0 freq', ascending = False).head(100)

In [138]:
# Remove hand-picked rows by index label before ranking. errors='ignore'
# lets the cell be re-run: labels that were already dropped are skipped
# instead of raising KeyError.
# NOTE(review): the recorded output lists index 1976 ("did n't") where this
# list has 1979 — confirm which label is meant.
Professional_Services_yelp_poor_json = Professional_Services_yelp_poor_json.drop(
    [136, 3577, 2707, 664626, 1979, 2678],
    errors='ignore',
)
# The 10 remaining terms with the highest '1 freq'.
Professional_Services_yelp_poor_json.sort_values(by='1 freq', ascending = False).head(10)

Unnamed: 0,term,1 freq,5 freq,highratingscore,poorratingscore
2480,service,8908,7581,0.1,0.9
445,$,5996,1606,0.03,0.97
1022,phone,4776,2213,0.06,0.94
1472,called,4287,2560,0.07,0.93
136,cox,3135,371,0.01,0.99
3577,it was,3005,3433,0.14,0.86
2707,to the,2969,2694,0.11,0.89
664626,work,2900,7039,0.93,0.07
1976,did n't,2855,2069,0.09,0.91
2678,that i,2845,2557,0.11,0.89


In [None]:
# The 100 terms with the highest '1 freq' for Professional Services.
# NOTE(review): Professional_Services_yelp_poor is not created in any visible
# cell — confirm where it is built.
Professional_Services_yelp_poor.sort_values(by='1 freq', ascending = False).head(100)

In [130]:
# Remove one hand-picked row by index label. errors='ignore' lets the cell be
# re-run without a KeyError once the label is gone.
Professional_Services_yelp_high_json = Professional_Services_yelp_high_json.drop(
    [1777],
    errors='ignore',
)
# Keep the five terms with the highest '5 freq'.
# NOTE(review): the cell ends in an assignment, so it displays nothing on a
# fresh run even though a table is recorded as its output.
Professional_Services_yelp_best_json = Professional_Services_yelp_high_json.sort_values(by='5 freq', ascending = False).head()

Unnamed: 0,term,1 freq,5 freq,highratingscore,poorratingscore
2721,time,5335,8239,0.89,0.11
302,professional,485,4677,0.98,0.02
2636,new,2438,3861,0.89,0.11
2850,home,2143,3174,0.88,0.12
293,friendly,308,3059,0.98,0.02


In [None]:
# Top five terms by '5 freq' and by '1 freq'.
# NOTE: a cell displays only its last expression, so the result of the first
# line is computed and then discarded.
Professional_Services_yelp_high.sort_values(by='5 freq', ascending = False).head()
Professional_Services_yelp_poor_json.sort_values(by='1 freq', ascending = False).head()

In [153]:
# Keep only the first five rows of `df` and write them as JSON Lines (one
# record per line).
# NOTE(review): no visible cell defines `df` at notebook level (it is only a
# local inside merge_files) — confirm which frame is exported; the file name
# suggests the Fashion high-rating terms.
df = df.head()
df.to_json('Fashion_yelp_high_rating_words.json', orient='records', lines=True)