# <u>Yelp Rating Prediction Using TensorFlow</u>

## **Data Cleaning:**

### *Libraries*

In [1]:
import numpy as np
import pandas as pd
import os
import sklearn.feature_extraction.text as sk_text
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


### *Functions*

In [2]:
# Text-cleaning helper. If NLTK reports missing data, download it once:
#   Anaconda prompt -> ipython -> import nltk -> nltk.download('popular')
#   OR from a command line -> python -m nltk.downloader popular
_NLP_RESOURCES = {}  # filled on first use so the lemmatizer and stop words are built once, not per review


def _get_nlp_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, creating it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _NLP_RESOURCES['lemmatizer'], _NLP_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalize one review into a space-separated string of lemmatized tokens.

    Steps: lowercase, delete digit runs, delete every character that is neither
    a word character nor whitespace, collapse whitespace, tokenize, drop English
    stop words, lemmatize, and join the tokens with single spaces.

    Args:
        text: The raw review text. ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned text, or ``''`` when ``text`` is missing.
    """
    # A missing value has no text to clean; return early instead of failing on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    lemmatizer, stop_words = _get_nlp_resources()
    text = text.lower()                 # lowercase
    text = re.sub(r'\d+', '', text)     # remove numbers
    text = re.sub(r'[^\w\s]', '', text) # remove punctuation (deleted, not replaced by a space)
    text = re.sub(r'\s+', ' ', text)    # remove extra whitespace
    text = text.strip()                 # remove leading/trailing whitespace
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]          # remove stopwords
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens] # lemmatize
    return ' '.join(lemmatized_tokens)                                             # join back into text

### *Get reviews*

In [3]:
# Read the line-delimited JSON in chunks (a smaller chunksize helps with memory issues).
# The `with` block closes the reader's file handle once every chunk has been concatenated.
with pd.read_json('data/yelp_academic_dataset_review.json', lines=True, chunksize=1000) as review_chunks:
    df = pd.concat(review_chunks)
print(df.head())

                review_id                 user_id             business_id  \
0  KU_O5udG6zpxOg-VcAEodg  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw   
1  BiTunyQ73aT9WBnpR9DZGw  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ   
2  saUsX_uimxRlCVr67Z4Jig  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A   
3  AqPFMleE6RsU23_auESxiA  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA   
4  Sx8TMOWLNuJBWer-0pcmoA  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ   

   stars  useful  funny  cool  \
0      3       0      0     0   
1      5       1      0     1   
2      3       0      0     0   
3      5       1      0     1   
4      4       1      0     1   

                                                text                date  
0  If you decide to eat here, just be aware it is... 2018-07-07 22:09:11  
1  I've taken a lot of spin classes over the year... 2012-01-03 15:28:18  
2  Family diner. Had the buffet. Eclectic assortm... 2014-02-05 20:30:30  
3  Wow!  Yummy, different,  delicious.

### *Clean reviews of businesses with 20 or more reviews*

In [4]:
# Replace zeros and missing values in one column with that column's median (adapted from the labs)
def missing_median(df, name):
    """Overwrite zero and NaN entries of ``df[name]`` with the median of its non-zero values.

    Args:
        df: DataFrame to modify; the column is reassigned on this object.
        name: Label of the column to clean.

    Returns:
        None. The result is stored back into ``df[name]``.
    """
    column = df[name]
    fill_value = column[column != 0].median()  # median() skips NaN by default
    df[name] = column.replace(0, fill_value).fillna(fill_value)

# Count the reviews for each business (one row per business_id)
business_review_count = (
    df.groupby('business_id')
      .size()
      .reset_index(name='review_count')
)
# Keep only the businesses with 20 or more reviews
businesses_with_20 = business_review_count.loc[business_review_count['review_count'] >= 20]
# Inner join: keeps just the reviews of those businesses and adds their review_count
df = df.merge(businesses_with_20, on='business_id', how='inner')
# Two separate copies (ids still present) for the later per-business analysis cells
df_businesses = df.copy()
df_businesses2 = df.copy()
# Drop the columns that are not needed from here on
df = df.drop(columns=['business_id', 'review_id', 'user_id', 'funny', 'cool', 'useful', 'date', 'review_count'])
# In case any review has 0 or missing stars, replace it with the median
missing_median(df, 'stars')
print(df.head())

   stars                                               text
0      3  If you decide to eat here, just be aware it is...
1      2  This is the second time we tried turning point...
2      4  The place is cute and the staff was very frien...
3      3  We came on a Saturday morning after waiting a ...
4      2  Mediocre at best. The decor is very nice, and ...


### *Get five random businesses with 20 or more reviews*

In [5]:
path = './business_data/'
filename_read = os.path.join(path, 'df_five_businesses_info.csv')
if not os.path.exists(filename_read):
    # No cached selection yet: pick five businesses and look up their metadata.
    df_businesses = df_businesses[['business_id']]                 # only the ids are needed; selecting (rather than dropping) keeps this cell re-runnable
    unique_business_ids = df_businesses['business_id'].unique()    # get unique business ids
    rng = np.random.default_rng(42)                                # fixed seed so a fresh run picks the same five businesses
    rng.shuffle(unique_business_ids)                               # shuffle the unique business ids in place
    five_unique_business_ids = unique_business_ids[:5]             # get first 5 unique business ids
    five_unique_ids_df = pd.DataFrame(five_unique_business_ids, columns=['business_id'])
    print('five unique business ids:', five_unique_business_ids)
    # Filter the business file chunk by chunk; the `with` block closes the reader afterwards.
    with pd.read_json('data/yelp_academic_dataset_business.json', lines=True, chunksize=1000) as business_info_df:
        five_businesses_df = pd.concat([chunk[chunk['business_id'].isin(five_unique_business_ids)] for chunk in business_info_df])
    five_businesses_df = five_businesses_df.drop(['address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'hours', 'attributes', 'is_open'], axis=1)
else:
    five_businesses_df = pd.read_csv(filename_read)
    # Recover the ids from the cached file so code that filters on five_unique_business_ids
    # also works when the selection was loaded rather than freshly drawn.
    five_unique_business_ids = five_businesses_df['business_id'].to_numpy()
print(five_businesses_df)


              business_id                       name  stars  review_count  \
0  Jv8lYSZPxY0rzkSpgo7BIw  Big Boyz Burgers and More    4.0            22   
1  7k9qGQyytbGxpJTnwxK6Xg         QDOBA Mexican Eats    3.0            40   
2  v_vqna00z6WqKcIJZDkbAw                    C Nails    4.5            61   
3  zsQ1_PNV3KN0EWhAE-WV9g  Tohono Chul Garden Bistro    4.0           165   
4  v2L2HnZzYvHPgFcVBg2TUw             Summerland Inn    2.0            56   

                                          categories  
0  American (Traditional), Diners, Breakfast & Br...  
1  Restaurants, Event Planning & Services, Mexica...  
2   Waxing, Hair Removal, Nail Salons, Beauty & Spas  
3  American (New), Venues & Event Spaces, Restaur...  
4  Hotels, Hotels & Travel, Bed & Breakfast, Even...  


### *Save five businesses' dataframe*

In [6]:
path = "./business_data/"
filename_read = os.path.join(path, "df_five_businesses_info.csv")
# Write only when no cached copy exists, so an earlier random selection is never overwritten.
if not os.path.exists(filename_read):
    os.makedirs(path, exist_ok=True)  # to_csv does not create a missing output directory
    filename_write = filename_read    # the cache check and the write target are the same file
    five_businesses_df.to_csv(filename_write, index=False, encoding='utf-8') # utf-8 set explicitly; the default encoding also worked
    print("Wrote file to {}".format(filename_write))

### *Separate business reviews by business name*

In [4]:
path = './business_data/'
filename_read = os.path.join(path, 'df_business1_reviews.csv')
if not os.path.exists(filename_read):
    # Take the ids from five_businesses_df, which is defined whether the selection was just
    # drawn or loaded from its cached CSV.
    reviews_for_five_businesses = df_businesses2[df_businesses2['business_id'].isin(five_businesses_df['business_id'])]
    reviews_for_five_businesses = reviews_for_five_businesses.drop(['review_id', 'user_id', 'funny', 'cool', 'useful', 'date', 'review_count'], axis=1)
    reviews_with_names_df = reviews_for_five_businesses.merge(five_businesses_df[['business_id', 'name']], on='business_id', how='left')
    # Split on business_id rather than name: two businesses sharing a name (e.g. chain
    # locations) would otherwise be merged into one frame and leave fewer than five groups.
    selected_business_ids = reviews_with_names_df['business_id'].unique()
    # Unpacking raises ValueError unless exactly five businesses have reviews.
    # .copy() makes each frame independent, so later cells can add columns to it safely.
    (business1_reviews_df, business2_reviews_df, business3_reviews_df,
     business4_reviews_df, business5_reviews_df) = [
        reviews_with_names_df[reviews_with_names_df['business_id'] == business_id].copy()
        for business_id in selected_business_ids
    ]
else:
    # Cached path: load the five per-business review files written by an earlier run.
    (business1_reviews_df, business2_reviews_df, business3_reviews_df,
     business4_reviews_df, business5_reviews_df) = [
        pd.read_csv(os.path.join(path, 'df_business{}_reviews.csv'.format(number)))
        for number in range(1, 6)
    ]
print(business1_reviews_df.head())

              business_id  stars  \
0  Jv8lYSZPxY0rzkSpgo7BIw      5   
1  Jv8lYSZPxY0rzkSpgo7BIw      4   
2  Jv8lYSZPxY0rzkSpgo7BIw      5   
3  Jv8lYSZPxY0rzkSpgo7BIw      5   
4  Jv8lYSZPxY0rzkSpgo7BIw      5   

                                                text  \
0  The double cheeseburger is delicious. The serv...   
1  Tried this spot here while in town on business...   
2  I had the pleasure of eating here today for th...   
3  Read a review in the Riverfront Times so I dec...   
4  I called in my ordered of Special Fried Rice o...   

                        name  
0  Big Boyz Burgers and More  
1  Big Boyz Burgers and More  
2  Big Boyz Burgers and More  
3  Big Boyz Burgers and More  
4  Big Boyz Burgers and More  


In [5]:
# Preview the first five reviews of the second business
print(business2_reviews_df.head(n=5))

              business_id  stars  \
0  7k9qGQyytbGxpJTnwxK6Xg      5   
1  7k9qGQyytbGxpJTnwxK6Xg      2   
2  7k9qGQyytbGxpJTnwxK6Xg      5   
3  7k9qGQyytbGxpJTnwxK6Xg      5   
4  7k9qGQyytbGxpJTnwxK6Xg      2   

                                                text                name  
0  Just like Chipotle! Food was great, very filli...  QDOBA Mexican Eats  
1  I love Qdoba but this is kinda dirty, especial...  QDOBA Mexican Eats  
2  Best burrito place in the world. Better than a...  QDOBA Mexican Eats  
3  I like it.  the enviornment is clean and food ...  QDOBA Mexican Eats  
4  I eat here often and i love their tacos, but t...  QDOBA Mexican Eats  


In [6]:
# Preview the first five reviews of the third business
print(business3_reviews_df.head(n=5))

              business_id  stars  \
0  v_vqna00z6WqKcIJZDkbAw      5   
1  v_vqna00z6WqKcIJZDkbAw      1   
2  v_vqna00z6WqKcIJZDkbAw      5   
3  v_vqna00z6WqKcIJZDkbAw      5   
4  v_vqna00z6WqKcIJZDkbAw      5   

                                                text     name  
0  I was looking for a new salon in the Westchase...  C Nails  
1  Horrible service, had to go to another salon t...  C Nails  
2  Great gel manicures! Current one is going on o...  C Nails  
3  I have only gotten their shellac manicure so f...  C Nails  
4  After reading reviews and looking around I fou...  C Nails  


In [7]:
# Preview the first five reviews of the fourth business
print(business4_reviews_df.head(n=5))

              business_id  stars  \
0  zsQ1_PNV3KN0EWhAE-WV9g      3   
1  zsQ1_PNV3KN0EWhAE-WV9g      4   
2  zsQ1_PNV3KN0EWhAE-WV9g      5   
3  zsQ1_PNV3KN0EWhAE-WV9g      3   
4  zsQ1_PNV3KN0EWhAE-WV9g      5   

                                                text  \
0  If you're looking for a restaurant to bring ou...   
1  We decided to try again recently,and things ar...   
2  We came across this brunch place & absolutely ...   
3  Great atmosphere and food was flavorful. But t...   
4  This is my favorite brunch place for sure! The...   

                        name  
0  Tohono Chul Garden Bistro  
1  Tohono Chul Garden Bistro  
2  Tohono Chul Garden Bistro  
3  Tohono Chul Garden Bistro  
4  Tohono Chul Garden Bistro  


In [8]:
# Preview the first five reviews of the fifth business
print(business5_reviews_df.head(n=5))

              business_id  stars  \
0  v2L2HnZzYvHPgFcVBg2TUw      1   
1  v2L2HnZzYvHPgFcVBg2TUw      1   
2  v2L2HnZzYvHPgFcVBg2TUw      5   
3  v2L2HnZzYvHPgFcVBg2TUw      3   
4  v2L2HnZzYvHPgFcVBg2TUw      5   

                                                text            name  
0  I stayed one night here and I will never come ...  Summerland Inn  
1  Things wrong with this room: large spider webs...  Summerland Inn  
2  I like the place. It's got character. It was 1...  Summerland Inn  
3  This place is interesting. It's very cute outs...  Summerland Inn  
4  The Summerland Inn is so beautiful, and the ow...  Summerland Inn  


### *Save individual business review dataframes*

In [9]:
path = './business_data/'
filename_read = os.path.join(path, 'df_business1_reviews.csv')
# The first file acts as the cache marker for the whole set: if it exists, nothing is rewritten.
if not os.path.exists(filename_read):
    os.makedirs(path, exist_ok=True)  # to_csv does not create a missing output directory
    business_review_frames = [
        business1_reviews_df,
        business2_reviews_df,
        business3_reviews_df,
        business4_reviews_df,
        business5_reviews_df,
    ]
    # One file per business, numbered 1-5 in the same order as the frames above.
    for number, reviews_df in enumerate(business_review_frames, start=1):
        filename_write = os.path.join(path, "df_business{}_reviews.csv".format(number))
        reviews_df.to_csv(filename_write, index=False, encoding='utf-8')
        print("Wrote file to {}".format(filename_write))

### *Process text of business1 reviews and save for model*

In [10]:
path = "./business_data/"
filename_read = os.path.join(path, "df_business1_reviews_processed.csv")
if os.path.exists(filename_read):
    # Cached result available: load it instead of re-running the text cleaning
    business1_reviews_df = pd.read_csv(filename_read)
else:
    # Add the cleaned text as `processed_text`, drop the raw `text`, then cache the result
    business1_reviews_df = (
        business1_reviews_df
        .assign(processed_text=business1_reviews_df['text'].apply(preprocess_text))
        .drop(columns='text')
    )
    filename_write = filename_read
    business1_reviews_df.to_csv(filename_write, index=False, encoding='utf-8')
    print("Wrote file to {}".format(filename_write))
print(business1_reviews_df.head())

              business_id  stars                       name  \
0  Jv8lYSZPxY0rzkSpgo7BIw      5  Big Boyz Burgers and More   
1  Jv8lYSZPxY0rzkSpgo7BIw      4  Big Boyz Burgers and More   
2  Jv8lYSZPxY0rzkSpgo7BIw      5  Big Boyz Burgers and More   
3  Jv8lYSZPxY0rzkSpgo7BIw      5  Big Boyz Burgers and More   
4  Jv8lYSZPxY0rzkSpgo7BIw      5  Big Boyz Burgers and More   

                                      processed_text  
0  double cheeseburger delicious service welcomin...  
1  tried spot town business great burger hand for...  
2  pleasure eating today first time honestly one ...  
3  read review riverfront time decided swing cuti...  
4  called ordered special fried rice saturday tol...  


### *Process text of business2 reviews and save for model*

In [11]:
path = "./business_data/"
filename_read = os.path.join(path, "df_business2_reviews_processed.csv")
if os.path.exists(filename_read):
    # Cached result available: load it instead of re-running the text cleaning
    business2_reviews_df = pd.read_csv(filename_read)
else:
    # Add the cleaned text as `processed_text`, drop the raw `text`, then cache the result
    business2_reviews_df = (
        business2_reviews_df
        .assign(processed_text=business2_reviews_df['text'].apply(preprocess_text))
        .drop(columns='text')
    )
    filename_write = filename_read
    business2_reviews_df.to_csv(filename_write, index=False, encoding='utf-8')
    print("Wrote file to {}".format(filename_write))
print(business2_reviews_df.head())

              business_id  stars                name  \
0  7k9qGQyytbGxpJTnwxK6Xg      5  QDOBA Mexican Eats   
1  7k9qGQyytbGxpJTnwxK6Xg      2  QDOBA Mexican Eats   
2  7k9qGQyytbGxpJTnwxK6Xg      5  QDOBA Mexican Eats   
3  7k9qGQyytbGxpJTnwxK6Xg      5  QDOBA Mexican Eats   
4  7k9qGQyytbGxpJTnwxK6Xg      2  QDOBA Mexican Eats   

                                      processed_text  
0  like chipotle food great filling tasty got qui...  
1  love qdoba kinda dirty especially behind count...  
2  best burrito place world better real mexican p...  
3  like enviornment clean food fresh taco salad g...  
4  eat often love taco girl checkout counter toda...  


### *Process text of business3 reviews and save for model*

In [12]:
path = "./business_data/"
filename_read = os.path.join(path, "df_business3_reviews_processed.csv")
if os.path.exists(filename_read):
    # Cached result available: load it instead of re-running the text cleaning
    business3_reviews_df = pd.read_csv(filename_read)
else:
    # Add the cleaned text as `processed_text`, drop the raw `text`, then cache the result
    business3_reviews_df = (
        business3_reviews_df
        .assign(processed_text=business3_reviews_df['text'].apply(preprocess_text))
        .drop(columns='text')
    )
    filename_write = filename_read
    business3_reviews_df.to_csv(filename_write, index=False, encoding='utf-8')
    print("Wrote file to {}".format(filename_write))
print(business3_reviews_df.head())

              business_id  stars     name  \
0  v_vqna00z6WqKcIJZDkbAw      5  C Nails   
1  v_vqna00z6WqKcIJZDkbAw      1  C Nails   
2  v_vqna00z6WqKcIJZDkbAw      5  C Nails   
3  v_vqna00z6WqKcIJZDkbAw      5  C Nails   
4  v_vqna00z6WqKcIJZDkbAw      5  C Nails   

                                      processed_text  
0  looking new salon westchase area reading revie...  
1  horrible service go another salon next day get...  
2  great gel manicure current one going week stil...  
3  gotten shellac manicure far three time though ...  
4  reading review looking around found salon qt a...  


### *Process text of business4 reviews and save for model*

In [13]:
path = "./business_data/"
filename_read = os.path.join(path, "df_business4_reviews_processed.csv")
if os.path.exists(filename_read):
    # Cached result available: load it instead of re-running the text cleaning
    business4_reviews_df = pd.read_csv(filename_read)
else:
    # Add the cleaned text as `processed_text`, drop the raw `text`, then cache the result
    business4_reviews_df = (
        business4_reviews_df
        .assign(processed_text=business4_reviews_df['text'].apply(preprocess_text))
        .drop(columns='text')
    )
    filename_write = filename_read
    business4_reviews_df.to_csv(filename_write, index=False, encoding='utf-8')
    print("Wrote file to {}".format(filename_write))
print(business4_reviews_df.head())

              business_id  stars                       name  \
0  zsQ1_PNV3KN0EWhAE-WV9g      3  Tohono Chul Garden Bistro   
1  zsQ1_PNV3KN0EWhAE-WV9g      4  Tohono Chul Garden Bistro   
2  zsQ1_PNV3KN0EWhAE-WV9g      5  Tohono Chul Garden Bistro   
3  zsQ1_PNV3KN0EWhAE-WV9g      3  Tohono Chul Garden Bistro   
4  zsQ1_PNV3KN0EWhAE-WV9g      5  Tohono Chul Garden Bistro   

                                      processed_text  
0  youre looking restaurant bring town guest good...  
1  decided try recentlyand thing looking kitchen ...  
2  came across brunch place absolutely loved scen...  
3  great atmosphere food flavorful older hostess ...  
4  favorite brunch place sure prickly pear mimosa...  


### *Process text of business5 reviews and save for model*

In [14]:
path = "./business_data/"
filename_read = os.path.join(path, "df_business5_reviews_processed.csv")
if os.path.exists(filename_read):
    # Cached result available: load it instead of re-running the text cleaning
    business5_reviews_df = pd.read_csv(filename_read)
else:
    # Add the cleaned text as `processed_text`, drop the raw `text`, then cache the result
    business5_reviews_df = (
        business5_reviews_df
        .assign(processed_text=business5_reviews_df['text'].apply(preprocess_text))
        .drop(columns='text')
    )
    filename_write = filename_read
    business5_reviews_df.to_csv(filename_write, index=False, encoding='utf-8')
    print("Wrote file to {}".format(filename_write))
print(business5_reviews_df.head())

              business_id  stars            name  \
0  v2L2HnZzYvHPgFcVBg2TUw      1  Summerland Inn   
1  v2L2HnZzYvHPgFcVBg2TUw      1  Summerland Inn   
2  v2L2HnZzYvHPgFcVBg2TUw      5  Summerland Inn   
3  v2L2HnZzYvHPgFcVBg2TUw      3  Summerland Inn   
4  v2L2HnZzYvHPgFcVBg2TUw      5  Summerland Inn   

                                      processed_text  
0  stayed one night never come back toilet didnt ...  
1  thing wrong room large spider web outside room...  
2  like place got character degree outside weeken...  
3  place interesting cute outside cottage looking...  
4  summerland inn beautiful owner mayis sweet lad...  


### *Preprocess reviews*

In [18]:
# Clean every review with preprocess_text, store the result as `preprocessed_text`,
# and drop the original `text` column in the same step
df = (
    df.assign(preprocessed_text=df['text'].apply(preprocess_text))
      .drop(columns='text')
)
print(df.head())

   stars                                  preprocessed_text
0      3  decide eat aware going take hour beginning end...
1      2  second time tried turning point location first...
2      4  place cute staff friendly nice menu good brunc...
3      3  came saturday morning waiting month opening ho...
4      2  mediocre best decor nice like restaurant tryin...


### *Save preprocessed dataframe*

In [19]:
# Persist the preprocessed reviews; utf-8 is requested explicitly (the default encoding also worked)
path = "./data/"
filename_write = os.path.join(path, "df_preprocessed.csv")
df.to_csv(
    filename_write,
    encoding='utf-8',
    index=False,
)
print("Wrote file to {}".format(filename_write))

Wrote file to ./data/df_preprocessed.csv


### *Optional start point*

In [20]:
# Not working yet, so the code below is disabled by wrapping it in a string literal.
# Loading the CSV works, but the next cell then raises an error; trying a different encoding did not help.
# NOTE(review): presumably reviews that became empty strings after preprocessing are read back
# as NaN by pd.read_csv, and the vectorizer rejects NaN documents -- TODO confirm
# (e.g. try pd.read_csv(..., keep_default_na=False) or .fillna('') on the text column).
# Would be a nice starting point for the next step of the project because preprocessing takes a while.
'''path = "./data/"
filename_read = os.path.join(path, "df_preprocessed.csv")
df = pd.read_csv(filename_read)'''

'path = "./data/"\nfilename_read = os.path.join(path, "df_preprocessed.csv")\ndf = pd.read_csv(filename_read)'

### *Vectorize reviews*

In [27]:
# TF-IDF limited to 400 vocabulary terms; lower max_features if memory issues occur.
# dtype=np.float32 reduces memory usage compared with the float64 default.
vectorizer = sk_text.TfidfVectorizer(max_features=400, dtype=np.float32)
# Missing documents (NaN, e.g. empty reviews reloaded from a CSV) are not valid vectorizer
# input, so treat them as empty strings. This is a no-op when every entry is already a string.
corpus = df['preprocessed_text'].fillna('')
matrix = vectorizer.fit_transform(corpus)  # fit the vocabulary and transform the corpus
tfidf_data = matrix.toarray()              # convert the matrix to a dense array
print('shape:', tfidf_data.shape)
print(tfidf_data)


shape: (6146631, 400)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### *Feature names*

In [28]:
# Terms from the fitted vectorizer; used further down as column labels for the TF-IDF features
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

['able' 'absolutely' 'actually' 'almost' 'also' 'always' 'amazing'
 'amount' 'another' 'anyone' 'anything' 'appetizer' 'area' 'around'
 'arrived' 'ask' 'asked' 'atmosphere' 'attentive' 'away' 'awesome' 'back'
 'bacon' 'bad' 'bar' 'bartender' 'bbq' 'bean' 'beautiful' 'beef' 'beer'
 'best' 'better' 'big' 'bit' 'bite' 'bowl' 'bread' 'breakfast' 'bring'
 'brought' 'brunch' 'burger' 'business' 'busy' 'cake' 'call' 'called'
 'came' 'cant' 'car' 'care' 'check' 'cheese' 'chicken' 'chip' 'chocolate'
 'choice' 'city' 'clean' 'close' 'cocktail' 'coffee' 'cold' 'come'
 'coming' 'cooked' 'cool' 'could' 'couldnt' 'couple' 'course' 'crab'
 'cream' 'customer' 'cut' 'day' 'deal' 'decent' 'decided' 'definitely'
 'delicious' 'dessert' 'didnt' 'different' 'dining' 'dinner'
 'disappointed' 'dish' 'doesnt' 'dog' 'done' 'dont' 'door' 'drink' 'eat'
 'eating' 'egg' 'either' 'else' 'employee' 'end' 'enjoy' 'enjoyed'
 'enough' 'entree' 'especially' 'even' 'ever' 'every' 'everyone'
 'everything' 'excellent' 'expe

### *Concatenate stars and matrix into new dataframe*

In [29]:
# Check that the dataframe and the TF-IDF matrix have the same number of rows before combining them
assert len(df) == tfidf_data.shape[0], "Number of rows in dataframe does not match number of rows in matrix."
# concat(axis=1) aligns rows on the index, so give the feature frame df's index; rows then
# pair up by position even if df's index is not a plain 0..n-1 range.
df_data = pd.concat([df[['stars']], pd.DataFrame(tfidf_data, index=df.index)], axis=1)
print(df_data.head())

   stars    0    1    2    3    4    5    6    7         8  ...  390  391  \
0      3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.168147  ...  0.0  0.0   
1      2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...  0.0  0.0   
2      4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.207039  ...  0.0  0.0   
3      3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.128253  ...  0.0  0.0   
4      2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...  0.0  0.0   

   392       393       394  395       396  397  398  399  
0  0.0  0.000000  0.000000  0.0  0.000000  0.0  0.0  0.0  
1  0.0  0.000000  0.000000  0.0  0.000000  0.0  0.0  0.0  
2  0.0  0.000000  0.000000  0.0  0.000000  0.0  0.0  0.0  
3  0.0  0.000000  0.090665  0.0  0.000000  0.0  0.0  0.0  
4  0.0  0.117374  0.000000  0.0  0.139791  0.0  0.0  0.0  

[5 rows x 401 columns]


### *Add feature names to dataframe*

In [30]:
# Label the columns: the target first, then one column per feature name
df_data.columns = ['stars', *feature_names.tolist()]
print(df_data.head())

   stars  able  absolutely  actually  almost  also  always  amazing  amount  \
0      3   0.0         0.0       0.0     0.0   0.0     0.0      0.0     0.0   
1      2   0.0         0.0       0.0     0.0   0.0     0.0      0.0     0.0   
2      4   0.0         0.0       0.0     0.0   0.0     0.0      0.0     0.0   
3      3   0.0         0.0       0.0     0.0   0.0     0.0      0.0     0.0   
4      2   0.0         0.0       0.0     0.0   0.0     0.0      0.0     0.0   

    another  ...  wont  work  working     worth     would  wouldnt     wrong  \
0  0.168147  ...   0.0   0.0      0.0  0.000000  0.000000      0.0  0.000000   
1  0.000000  ...   0.0   0.0      0.0  0.000000  0.000000      0.0  0.000000   
2  0.207039  ...   0.0   0.0      0.0  0.000000  0.000000      0.0  0.000000   
3  0.128253  ...   0.0   0.0      0.0  0.000000  0.090665      0.0  0.000000   
4  0.000000  ...   0.0   0.0      0.0  0.117374  0.000000      0.0  0.139791   

   year  yet  youre  
0   0.0  0.0    0.0  


### *Save dataframe*

In [31]:
# Persist the stars + TF-IDF feature table for the modelling step
path = "./data/"
filename_write = os.path.join(path, "df_data.csv")
df_data.to_csv(
    filename_write,
    index=False,
)
print("Wrote file to {}".format(filename_write))

Wrote file to ./data/df_data.csv
