In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from sklearn.model_selection import train_test_split
import seaborn as sns
import re

# Load dataset

In [None]:
df_business = pd.read_json("./business.json", lines=True)

#Take 100K records from the dataset
df_business = df_business.iloc[0:100000, :]

In [2]:
df_review = pd.read_json("./review75k.json", lines=True)
#Take 50K records from the dataset
df_review = df_review.iloc[0:50000, :]

In [None]:
print(df_business.shape)
df_business.head()

In [3]:
print(df_review.shape)
df_review.head()

(50000, 9)


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,Q1sbwvVQXV2734tPgoKj4Q,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1,6,1,0,Total bill for this horrible service? Over $8G...,2013-05-07 04:34:36
1,GJXCdrto3ASJOqKeVWPi6Q,yXQM5uF2jS6es16SJzNHfg,NZnhc2sEQy3RmzKTZnqtwQ,5,0,0,0,I *adore* Travis at the Hard Rock's new Kelly ...,2017-01-14 21:30:33
2,2TzJjDVDEuAW6MR5Vuc1ug,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5,3,0,0,I have to say that this office really has it t...,2016-11-09 20:09:03
3,yi0R0Ugj_xUx_Nek0-_Qig,dacAIZ6fTM6mqwW5uxkskg,ikCg8xy5JIg_NGPx-MSIDA,5,0,0,0,Went in for a lunch. Steak sandwich was delici...,2018-01-09 20:56:38
4,11a8sVPMUFtaC7_ABRkmtw,ssoyf2_x0EQMed6fgHeMyQ,b1b1eb3uo-w561D0ZfCEiQ,1,7,0,0,Today was my second out of three sessions I ha...,2018-01-30 23:07:38


# Missing value this shit bugs me so much OMG

In [25]:
df_business.isnull().sum()

business_id        0
name               0
address            0
city               0
state              0
postal_code        0
latitude           0
longitude          0
stars              0
review_count       0
is_open            0
attributes      1121
categories         0
hours           7304
dtype: int64

**Duplicate business_id**

In [34]:
df_business.loc[:,'business_id'].duplicated().sum()

0

# Data Cleaning

## Business

**Remove quotation marks which can cause parsing problem**

In [13]:
df_business['name'] = df_business['name'].str.replace('"', '')
df_business['address'] = df_business['address'].str.replace('"', '')

**For the sake of simplicity, I prefer to analyse datas related to restaurants first**

In [18]:
df_restaurant = df.loc[df['categories'].str.contains('Restaurants') == True]

**Redefinied the category of each restaurant**

In [19]:
df_restaurant['category'] = pd.Series()

In [21]:
df_restaurant.loc[df_restaurant.categories.str.contains('American'), 'category'] = 'American'
df_restaurant.loc[df_restaurant.categories.str.contains('Mexican'), 'category'] = 'Mexican'
df_restaurant.loc[df_restaurant.categories.str.contains('Italian'), 'category'] = 'Italian'
df_restaurant.loc[df_restaurant.categories.str.contains('Japanese'), 'category'] = 'Japanese'
df_restaurant.loc[df_restaurant.categories.str.contains('Chinese'), 'category'] = 'Chinese'
df_restaurant.loc[df_restaurant.categories.str.contains('Vietnamese'), 'category'] = 'Vietnamese'
df_restaurant.loc[df_restaurant.categories.str.contains('Thai'), 'category'] = 'Thai'
df_restaurant.loc[df_restaurant.categories.str.contains('Indian'), 'category'] = 'Indian'
df_restaurant.loc[df_restaurant.categories.str.contains('French'), 'category'] = 'French'
df_restaurant.loc[df_restaurant.categories.str.contains('African'), 'category'] = 'African'
df_restaurant.loc[df_restaurant.categories.str.contains('Spanish'), 'category'] = 'Spanish'
df_restaurant.loc[df_restaurant.categories.str.contains('Greek'), 'category'] = 'Greek'
df_restaurant.loc[df_restaurant.categories.str.contains('Mediterranean'), 'category'] = 'Mediterranean'
df_restaurant.loc[df_restaurant.categories.str.contains('Middle_eastern'), 'category'] = 'Middle_eastern'
df_restaurant.loc[df_restaurant.categories.str.contains('Korean'), 'category'] = 'Korean'
df_restaurant.loc[df_restaurant.categories.str.contains('Hawaiian'), 'category'] = 'Hawaiian'

In [26]:
df_restaurant.isnull().sum()

business_id         0
name                0
address             0
city                0
state               0
postal_code         0
latitude            0
longitude           0
stars               0
review_count        0
is_open             0
attributes       1121
categories          0
hours            7304
category        13435
dtype: int64

In [28]:
#Drop null values in category
df_restaurant = df_restaurant.dropna(axis=0, subset=['category'])

In [30]:
df_restaurant.shape

(17332, 15)

In [31]:
df_restaurant.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,category
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,128,1,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...","{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",Chinese
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,170,1,"{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...","Sushi Bars, Restaurants, Japanese","{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",Japanese
11,1Dfx3zM-rW4n-31KeC8sJg,Taco Bell,2450 E Indian School Rd,Phoenix,AZ,85016,33.495194,-112.028588,3.0,18,1,"{'RestaurantsTakeOut': 'True', 'BusinessParkin...","Restaurants, Breakfast & Brunch, Mexican, Taco...","{'Monday': '7:0-0:0', 'Tuesday': '7:0-0:0', 'W...",Mexican
13,fweCYi8FmbJXHCqLnwuk8w,Marco's Pizza,5981 Andrews Rd,Mentor-on-the-Lake,OH,44060,41.70852,-81.359556,4.0,16,1,"{'RestaurantsPriceRange2': '2', 'BusinessAccep...","Italian, Restaurants, Pizza, Chicken Wings","{'Monday': '10:0-0:0', 'Tuesday': '10:0-0:0', ...",Italian
17,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,"1775 E Tropicana Ave, Ste 29",Las Vegas,NV,89119,36.100016,-115.128529,4.0,40,0,"{'OutdoorSeating': 'False', 'BusinessAcceptsCr...","Restaurants, Italian",,Italian


## Review

In [16]:
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

In [None]:
#REPLACE_NEW_LINE = re.compile("\s*\n+\s*")
#REPLACE_NO_SPACE = re.compile('[.;:!\'?,\"()\[\]]')
#stopword = stopwords.words('english')

## Text Preprocessing

In [None]:
from nltk.corpus import stopwords
import re

In [123]:
#Lower all text
def toLower(text):
    return text.lower()

#Replace all new line, basically text's shown in the same paragraph
def noNewLine(text):
    REPLACE_NEW_LINE = re.compile('\s*\n+\s*')
    text = REPLACE_NEW_LINE.sub(' ', text)
    return text

#Weird character as the title said
def weirdCharacter(text):
    REPLACE_NO_SPACE = re.compile('[!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~]')
    text = REPLACE_NO_SPACE.sub('', text)
    return text

#Word tokenizer
def tokenizing(text):
    return nltk.word_tokenize(text)

#Remove stopword
def removeStopWords(tokens):
    swords = stopwords.words('english')
    tokens = [token for token in tokens if token not in swords]
    return tokens
    
#Remove empty token
#This can happen when parse double whitespace or some unexpected comment
def removeEmptyToken(tokens):
    tokens = [token for token in tokens if len(token) > 0]
    return tokens

#Join token together into text paragraph
def joinToken(tokens):
    return ' '.join(tokens)

**Combine all text preprocessing methods**

In [124]:
def reformat_text(text):
    text = noNewLine(text)
    text = toLower(text)
    text = weirdCharacter(text)
    tokens = tokenizing(text)
    tokens = removeStopWords(tokens)
    tokens = removeEmptyToken(tokens)
    return tokens

In [125]:
#def testReformat(text):
#    text = noNewLine(text)
#    text = toLower(text)
#    text = weirdCharacter(text)
#    tokens = tokenizing(text)
#    return tokens

# Text Normalization

## Lemmatizer

- CC coordinating conjunction
- CD cardinal digit
- DT determiner
- EX existential there (like: “there is” … think of it like “there exists”)
- FW foreign word
- IN preposition/subordinating conjunction
- JJ adjective ‘big’
- JJR adjective, comparative ‘bigger’
- JJS adjective, superlative ‘biggest’
- LS list marker 1
- MD modal could, will
- NN noun, singular ‘desk’
- NNS noun plural ‘desks’
- NNP proper noun, singular ‘Harrison’
- NNPS proper noun, plural ‘Americans’
- PDT predeterminer ‘all the kids’
- POS possessive ending parent’s
- PRP personal pronoun I, he, she
- PRP possessive pronoun my, his, hers
- RB adverb very, silently,
- RBR adverb, comparative better
- RBS adverb, superlative best
- RP particle give up
- TO, to go ‘to’ the store.
- UH interjection, errrrrrrrm
- VB verb, base form take
- VBD verb, past tense took
- VBG verb, gerund/present participle taking
- VBN verb, past participle taken
- VBP verb, sing. present, non-3d take
- VBZ verb, 3rd person sing. present takes
- WDT wh-determiner which
- WP wh-pronoun who, what
- WP possessive wh-pronoun whose
- WRB wh-abverb where, when**

In [222]:
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

**POS (Part Of Speech) tagging** \
Assign a tag to every word to define if it corresponds to a noun, a verb, etc using Wordnet lexical database

In [228]:
def lemmatize_sentence(tokens):
    #Incase some idiots dont parse a tokens in the parameter
    if not isinstance(tokens, (list, tuple)):
        tokens = tokenizing(tokens)
    
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        print(tag)
        if tag.startswith('NN' ):
            pos = wordnet.NOUN 
        elif tag.startswith('VB'):
            pos = wordnet.VERB
        elif tag.startswith('RB'):
            pos = wordnet.ADV
        elif tag.startswith('JJ'):
            pos = wordnet.ADJ
        else:
            pos = wordnet.NOUN
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

In [234]:
#text = 'I missed the moment we spent together with her, there is still flashbacks of her in my life'
#lemmatize_sentence(text)

# Data preparation for training model

In [133]:
def clean_text(text):
    tokens = reformat_text(text)
    tokens = lemmatize_sentence(tokens)
    text = joinToken(tokens)
    return text

**Apply text processing for the review column**

In [134]:
df_review['review_clean'] = df_review['text'].apply(lambda x: clean_text(x))

**Label outcome base on the star column**

In [154]:
df_review['outcome'] = pd.Series()
df_review.loc[df_review['stars'] > 3, 'outcome'] = 2
df_review.loc[df_review['stars'] == 3, 'outcome'] = 1
df_review.loc[df_review['stars'] < 3, 'outcome'] = 0

In [155]:
df_review.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,review_clean,outcome
0,Q1sbwvVQXV2734tPgoKj4Q,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1,6,1,0,Total bill for this horrible service? Over $8G...,2013-05-07 04:34:36,total bill horrible service 8gs crook actually...,0.0
1,GJXCdrto3ASJOqKeVWPi6Q,yXQM5uF2jS6es16SJzNHfg,NZnhc2sEQy3RmzKTZnqtwQ,5,0,0,0,I *adore* Travis at the Hard Rock's new Kelly ...,2017-01-14 21:30:33,adore travis hard rock 's new kelly cardenas s...,2.0
2,2TzJjDVDEuAW6MR5Vuc1ug,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5,3,0,0,I have to say that this office really has it t...,2016-11-09 20:09:03,say office really together organize friendly d...,2.0
3,yi0R0Ugj_xUx_Nek0-_Qig,dacAIZ6fTM6mqwW5uxkskg,ikCg8xy5JIg_NGPx-MSIDA,5,0,0,0,Went in for a lunch. Steak sandwich was delici...,2018-01-09 20:56:38,go lunch steak sandwich delicious caesar salad...,2.0
4,11a8sVPMUFtaC7_ABRkmtw,ssoyf2_x0EQMed6fgHeMyQ,b1b1eb3uo-w561D0ZfCEiQ,1,7,0,0,Today was my second out of three sessions I ha...,2018-01-30 23:07:38,today second three session pay although first ...,0.0


**Features and label**

In [196]:
X = np.array(df_review['review_clean'])
y = np.array(df_review['outcome'])

In [197]:
print(X.shape)
print(y.shape)

(50000,)
(50000,)


# Vectorization

There are 3 forms of vectorization:
- Count Vectorization (Same behavior as one hot encoding)
- n-grams
- TF-IDF

24/03: We test TF-IDF which is the most relevant, keep n-gram for later

## TF-IDF Vectorization

**Notes** \
Practically speaking TF-IDF requires the Bag of Words transformation, luckily for us the implementation of TF-IDF in Scikit-Learn does all of the steps neccessary

In [198]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [239]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(reviewFeature)
X = tfidf_vectorizer.transform(reviewFeature)

**Tunning eventually (For science)**

In [None]:
#max_features: Use n numbers of most frequently occuring words to create a bag of words
#max_df: words that occur in a maximum n(num/%) of the document
#min_df: words that occur in at least n(num/%) of the document

#tfidf_vectorizer = TfidfVectorizer(max_features=3000, max_df=0.8)

In [214]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Prediction Model

## Logistic Regression

In [245]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [254]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    
    # I dont know why the fuck Sklearn wants to do this reshape thing but here we are
    #y_test = y_test.reshape(1, -1)
    #y_pred = y_pred.reshape(1, -1)
    
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_test, y_pred)))
    
    print(confusion_matrix(y_test,y_pred))



Accuracy for C=0.01: 0.66528
[[  42    0 2779]
 [   1    0 1404]
 [   0    0 8274]]
Accuracy for C=0.05: 0.7604
[[1278    0 1543]
 [ 113    1 1291]
 [  48    0 8226]]
Accuracy for C=0.25: 0.82264
[[2068   35  718]
 [ 239  102 1064]
 [ 138   23 8113]]
Accuracy for C=0.5: 0.83704
[[2218   59  544]
 [ 273  174  958]
 [ 153   50 8071]]
Accuracy for C=1: 0.84336
[[2273   80  468]
 [ 301  230  874]
 [ 158   77 8039]]


## Support Vector Machine (Linear)

In [251]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix

In [255]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_test, y_pred)))
    
    print(confusion_matrix(y_test,y_pred))

Accuracy for C=0.01: 0.79288
[[1707    2 1112]
 [ 184   17 1204]
 [  84    3 8187]]
Accuracy for C=0.05: 0.83576
[[2235   43  543]
 [ 298  117  990]
 [ 155   24 8095]]
Accuracy for C=0.25: 0.84624
[[2338   87  396]
 [ 331  243  831]
 [ 184   93 7997]]
Accuracy for C=0.5: 0.84496
[[2335  107  379]
 [ 345  272  788]
 [ 198  121 7955]]
Accuracy for C=1: 0.8408
[[2320  134  367]
 [ 348  310  747]
 [ 214  180 7880]]


## Radom Forest

In [248]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [250]:
rf = RandomForestClassifier(n_estimators=500, random_state=0)
rf.fit(X_train, y_train)
print ("Accuracy: %s" 
           % (accuracy_score(y_test, y_pred)))
    
print(confusion_matrix(y_test,y_pred))

Accuracy: 0.8408
[[2320  134  367]
 [ 348  310  747]
 [ 214  180 7880]]
