In [41]:
import pandas as pd
import numpy as np
import nltk

In [42]:
# Load the review data. header=0 consumes the file's own first row as the
# header, and `names` then relabels the data columns as Review / Rating.
df = pd.read_csv(
    "review_min150.csv",
    names=["Review", "Rating"],
    header=0,
)

In [43]:
# Quick sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(15327, 2)

In [44]:
def _nouns_and_adjectives(text):
    """Reduce `text` to the tokens whose NLTK POS tag starts with 'N' or 'J'.

    The text is word-tokenized and POS-tagged with NLTK; the surviving tokens
    are re-joined with single spaces. With NLTK's default tagset these prefixes
    correspond to noun and adjective tags.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    kept = [word for word, tag in tagged_tokens if tag.startswith(("N", "J"))]
    return " ".join(kept)


# Features: the first column of df, filtered down to N*/J*-tagged tokens.
X = df.iloc[:, 0].map(_nouns_and_adjectives)
# Target: the second column of df, taken as-is.
Y = df.iloc[:, 1]

In [45]:
# Concatenate every filtered review into one space-separated string and wrap it
# in a one-row frame (index label 1, column 'vocab') so it can be vectorized as
# a single document.
all_reviews_text = X.str.cat(sep=' ')
vocab_list = pd.DataFrame(data=all_reviews_text, index=[1], columns=['vocab'])


In [46]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigram word counts, lower-cased, with scikit-learn's built-in English
# stop-word list removed.
count_vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    lowercase=True,
)
term_counts = count_vectorizer.fit_transform(vocab_list['vocab'])
# Densify and flatten the document-term matrix into a 1-D array of counts,
# aligned with the vectorizer's feature order.
z = term_counts.toarray().flatten()

In [47]:
# Build the vocabulary: the 600 most frequent terms, most frequent first.

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back to the old method on older releases.
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

vocab = (
    pd.DataFrame({'features': feature_names, 'count': z})
    .sort_values(by=['count'], ascending=False)
    .iloc[0:600]
    # Select the term column by name. Selecting it by position (column 1) only
    # works when pandas sorts dict keys alphabetically; with insertion-ordered
    # dicts column 1 is 'count' and the result would be integers, not words.
    .loc[:, 'features']
    .tolist()
)

In [48]:
# Display the selected vocabulary terms (ordered by descending count).
vocab

['good',
 'place',
 'food',
 'coffee',
 'time',
 'great',
 'cream',
 'chocolate',
 'nice',
 'tea',
 'service',
 'little',
 'ice',
 'menu',
 'location',
 'order',
 'sweet',
 'fresh',
 'area',
 'small',
 'store',
 'cheese',
 'way',
 'day',
 'people',
 'chicken',
 'items',
 'lot',
 'bit',
 'delicious',
 'best',
 'friendly',
 'flavor',
 'staff',
 'new',
 'cake',
 'hot',
 'shop',
 'lunch',
 'bar',
 'sandwich',
 'flavors',
 'sauce',
 'free',
 'sure',
 'selection',
 'beer',
 'bread',
 'better',
 'large',
 'salad',
 'different',
 'restaurant',
 'drink',
 'spot',
 'home',
 'meat',
 'dessert',
 'prices',
 'table',
 'milk',
 'drinks',
 'big',
 'thing',
 'tasty',
 'visit',
 'taste',
 'options',
 'price',
 'meal',
 'yelp',
 'breakfast',
 'right',
 'tables',
 'counter',
 'things',
 'night',
 'line',
 'cup',
 'butter',
 'rice',
 'decent',
 'favorite',
 'special',
 'house',
 'quality',
 'places',
 'experience',
 'happy',
 'bad',
 'red',
 'bakery',
 'parking',
 'friend',
 'green',
 'cafe',
 'times',
 '

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for each filtered review, restricted to the fixed `vocab`
# terms. min_df / max_df are deliberately not passed: scikit-learn ignores both
# whenever an explicit vocabulary is supplied, so they had no effect here.
vectorizer = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 1),
    use_idf=True,
    stop_words='english',
    analyzer='word',
    vocabulary=vocab,
)
tfidf_matrix = vectorizer.fit_transform(X)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back to the old method on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    tfidf_columns = vectorizer.get_feature_names_out()
else:
    tfidf_columns = vectorizer.get_feature_names()

# Dense frame: one row per review (same index as X), one column per term.
temp = pd.DataFrame(tfidf_matrix.toarray(), index=X.index, columns=tfidf_columns)

In [50]:
# Preview the first rows of the TF-IDF feature table.
temp.head()

Unnamed: 0,good,place,food,coffee,time,great,cream,chocolate,nice,tea,...,game,east,types,mexican,heat,cozy,wings,higher,actual,job
1,0.099838,0.0,0.127646,0.0,0.193125,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.082706,0.0,0.0,0.12046,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.063538,0.139594,0.0,0.092542,0.0,0.0821,0.0,0.0,0.0,0.576692,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.06369,0.0,0.0,0.0,0.0,0.286867,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.075771,0.088188,0.0,0.088951,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
# VADER 'compound' score for each raw review (the unfiltered Review text, not X).
# A plain element-wise map avoids building a pandas Series per row, and
# to_frame names the single column directly, giving a one-column DataFrame
# called 'Polarity' that shares df's index.
senti = (
    df['Review']
    .map(lambda text: sid.polarity_scores(text)['compound'])
    .to_frame("Polarity")
)

In [52]:
# Append the sentiment column(s) and then the Y target to the TF-IDF table,
# aligning on the shared index (left joins).
# NOTE(review): this rebinds `temp`, so running the cell a second time without
# rebuilding `temp` will fail on overlapping column names.
temp = temp.join(senti).join(Y)

In [53]:
# Persist the assembled feature table; index=False leaves the row index out of the file.
temp.to_csv(
    path_or_buf='reviewTable150.csv',
    index=False,
)