In [1]:
import string
from datetime import datetime

import nltk
import numpy as np
import pandas as pd
from langdetect import DetectorFactory
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn import preprocessing
from textblob import TextBlob
# Fetch the NLTK data packages before use. The 'stopwords' corpus is read by the
# stopword-removal cell below; NOTE(review): 'punkt' is not referenced directly in
# this notebook -- presumably needed by a dependency, confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/yishi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/yishi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load data
# Raw review table; index_col=False tells pandas not to use the first column as the index.
df_review = pd.read_csv('../Data/yelp_review.csv',index_col=False)

In [3]:
# Attach per-user aggregates to every review.
# Columns are renamed *by name*: read_csv ignores the element order of `usecols`
# and returns columns in file order, so a positional `df.columns = [...]`
# assignment would silently mislabel columns if the file layout ever differed.
df_user_fields = ['user_id', 'review_count', 'friends', 'useful', 'funny', 'cool', 'average_stars']
df_user = pd.read_csv('../Data/yelp_user.csv', index_col=False, usecols=df_user_fields)
df_user = df_user.rename(columns={
    'review_count': 'user_review_count',
    'useful': 'user_total_useful',
    'funny': 'total_funny',
    'cool': 'total_cool',
    'average_stars': 'user_average_stars',
})
# Left join keeps every review even when its user is absent from the user table.
df = pd.merge(df_review, df_user, on='user_id', how='left')

# Attach per-business aggregates to every review (same by-name renaming).
df_business_fields = ['business_id', 'stars', 'review_count']
df_business = pd.read_csv('../Data/yelp_business.csv', index_col=False, usecols=df_business_fields)
df_business = df_business.rename(columns={
    'stars': 'business_stars',
    'review_count': 'business_review_count',
})
df = pd.merge(df, df_business, on='business_id', how='left')

In [4]:
# Remove the current review's own contribution from the author's totals:
# one review from the count, and this review's votes from each vote total.
df['user_review_count'] = df['user_review_count'].sub(1)
for total_col, review_col in (
    ('user_total_useful', 'useful'),
    ('total_funny', 'funny'),
    ('total_cool', 'cool'),
):
    df[total_col] = df[total_col].sub(df[review_col])

In [5]:
# Preview the first rows of the merged review/user/business frame.
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool,user_review_count,friends,user_total_useful,total_funny,total_cool,user_average_stars,business_stars,business_review_count
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0,5,4Zpn3ObC2zexkIhor4LJIQ,0,0,0,4.67,4.0,84
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0,5,4Zpn3ObC2zexkIhor4LJIQ,0,0,0,4.67,4.5,50
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0,5,4Zpn3ObC2zexkIhor4LJIQ,0,0,0,4.67,4.0,70
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0,5,4Zpn3ObC2zexkIhor4LJIQ,0,0,0,4.67,3.5,61
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0,5,4Zpn3ObC2zexkIhor4LJIQ,0,0,0,4.67,4.5,397


In [6]:
# Add a column holding the age of each review in whole days.
# The age is measured against a fixed reference date rather than the current date.
today = datetime.strptime('7/23/2018', "%m/%d/%Y")
df['date'] = pd.to_datetime(df['date'])
df['days'] = (today - df['date']).dt.days

In [7]:
# Use langdetect to label the language of each review and keep only English ones.

# langdetect is non-deterministic on short or ambiguous text unless its seed is
# fixed, so pin it to make the set of retained rows reproducible across runs.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return the langdetect language code for `text`, or 'unknown' if detection fails."""
    try:
        return detect(text)
    except Exception:
        # Detection can fail on input langdetect cannot handle. Catching Exception
        # (not a bare except) keeps KeyboardInterrupt able to stop this long loop.
        return 'unknown'


df['language'] = [_detect_language(text) for text in df['text'].values]
# .copy() gives an independent frame, so the column assignments in later cells
# do not operate on a slice of the unfiltered data (SettingWithCopyWarning).
df = df.loc[df['language'] == 'en'].copy()

In [8]:
# Add a column with the word count of each review: the text is split on
# whitespace and the number of resulting tokens is stored.
df['text_count'] = df['text'].str.split().str.len()

In [9]:
# Add a column with the sentiment polarity of each review (rounded to 2 decimals).
def _polarity(text):
    """Return TextBlob's sentiment polarity for `text` rounded to 2 decimals, or 0 on failure."""
    try:
        return round(TextBlob(text).sentiment.polarity, 2)
    except Exception:
        # Fall back to a neutral score when the text cannot be analysed. Catching
        # Exception (not a bare except) keeps KeyboardInterrupt able to stop the loop.
        return 0


df['pol'] = [_polarity(text) for text in df['text'].values]

In [10]:
# Removing punctuation
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call, since the function runs once per review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    '''Return `text` with every character in string.punctuation removed.

    All other characters, including whitespace, are left unchanged.
    '''
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every review text.
df['text'] = df['text'].map(remove_punctuation)

In [11]:
# Convert every review text to lowercase.
df['text'] = df['text'].map(str.lower)

In [12]:
# Removing stopwords
sw = stopwords.words('english')
# Set copy of the stopword list so each membership test is O(1) instead of a list scan.
_stopword_set = frozenset(sw)


# Named `remove_stopwords` rather than `stopwords`: defining a function called
# `stopwords` would rebind the imported nltk `stopwords` corpus, and re-running
# this cell would then fail at `stopwords.words(...)`.
def remove_stopwords(text):
    '''Return `text` with English stopwords removed.

    The text is split on whitespace, tokens found in the stopword list are
    dropped, and the remaining tokens are re-joined with single spaces.
    Matching is exact, so the text is expected to be lowercased already.
    '''
    return " ".join(word for word in text.split() if word not in _stopword_set)


df['text'] = df['text'].apply(remove_stopwords)

In [13]:
# Replace the raw `friends` field with the number of friends it lists.
def _count_friends(raw):
    """Return the number of comma-separated entries in a raw `friends` value.

    The literal string 'None', a missing value (NaN/None) and a blank string
    all count as 0 friends; otherwise the count is the number of commas plus one.
    """
    # A missing value would otherwise be stringified to 'nan' and counted as one friend.
    if pd.isna(raw):
        return 0
    raw = str(raw)
    if raw == 'None' or not raw.strip():
        return 0
    return raw.count(',') + 1


df['friends'] = [_count_friends(raw) for raw in df['friends']]

In [14]:
# Stemming operations
stemmer = SnowballStemmer("english")


def stemming(text):
    '''Stem every whitespace-separated word of `text` and re-join them with single spaces.'''
    return " ".join(map(stemmer.stem, text.split()))


df['text'] = df['text'].map(stemming)

In [15]:
# df['text'] = df['text'].apply(lambda x: str(TextBlob(x).correct()))

In [16]:
# df.drop(['funny', 'cool', 'language'], axis=1, inplace=True)

In [17]:
# Per-column flag: True if the column contains at least one missing value.
df.isna().any()

review_id                False
user_id                  False
business_id              False
stars                    False
date                     False
text                     False
useful                   False
funny                    False
cool                     False
user_review_count        False
friends                  False
user_total_useful        False
total_funny              False
total_cool               False
user_average_stars       False
business_stars           False
business_review_count    False
days                     False
language                 False
text_count               False
pol                      False
dtype: bool

In [18]:
# Average "useful" votes per review for the author, rounded to 2 decimals.
# `user_review_count` and `user_total_useful` were already reduced earlier in the
# notebook to exclude the current review, so the count is used as-is here;
# subtracting 1 again would drop a second review from the denominator.
_avg_useful = df['user_total_useful'] / df['user_review_count']
# A zero denominator yields inf (x/0) or NaN (0/0); both are treated as 0.
_avg_useful = _avg_useful.replace([np.inf, -np.inf], np.nan).fillna(0)
df['user_avg_useful'] = _avg_useful.round(2)

In [19]:
# Replace missing per-user totals with 0.
for _col in ('user_review_count', 'user_total_useful', 'total_funny', 'total_cool'):
    df[_col] = df[_col].fillna(0)

In [21]:
# Drop rows with no review text or no user average rating.
df = df.dropna(subset=['text', 'user_average_stars'])
# Turn empty strings into NaN across the frame, then drop rows whose text is now missing.
df = df.replace('', np.nan)
df = df[df['text'].notna()]

In [22]:
# Write the cleaned frame to CSV; index=False leaves the DataFrame index out of the file.
df.to_csv('../Cleaned_Data/cleaned_data_v1_full.csv', index=False)

In [None]:
# Preview the first rows of the frame that was written out.
df.head()