# Yelp Data Challenge - NLP

BitTiger DS501

Jun 2017

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#from sklearn.cross_validation import train_test_split
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.model_selection import cross_val_score
#from sklearn.naive_bayes import MultinomialNB
#from sklearn.linear_model import LogisticRegression
#from sklearn.model_selection import GridSearchCV
#from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, roc_auc_score
#from sklearn.cluster import KMeans
#from sklearn.metrics import silhouette_score

In [None]:
plt.style.use("ggplot")

In [None]:
df = pd.read_csv('data/last_2_years_restaurant_reviews.csv')

In [None]:
df.head()

### Define your feature variable: here, the text of the review

In [None]:
# Take the values of the column that contains review text data, save to a variable named "documents"
documents = df.text.values

In [None]:
# Derive the binary label column: 1 for a perfect (5-star) review, 0 otherwise.
# The comparison is vectorized over the whole column instead of calling a
# Python lambda once per row.
df['perfection'] = (df['stars'] == 5).astype(int)

### Define your target variable (any categorical variable that may be meaningful)

#### For example, I am interested in perfect (5 stars) versus imperfect (1-4 stars) ratings

In [None]:
# Take the values of the existing 'perfection' column and save them to a variable named "target"
target = df['perfection'].values

#### You may want to look at the statistics of the target variable

In [None]:
# Print the mean of the target values as a quick summary statistic.
# NOTE(review): if target holds only 0/1 (as built from the 'perfection' column),
# this mean is the share of 5-star reviews — confirm.
print ("mean:{}".format(target.mean()))

## Let's create training dataset and test dataset

In [None]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on current releases.
from sklearn.model_selection import train_test_split

In [None]:
# X is the array of review texts (documents); y is the label array (target).
# test_size=0.3 sets aside 30% of the samples for testing, and the fixed
# random_state makes the shuffle reproducible between runs.
documents_train, documents_test, target_train, target_test = train_test_split(
    documents,
    target,
    test_size=0.3,
    random_state=11,
)

In [None]:
# Split to documents_train, documents_test, target_train, target_test
pass

## Let's get NLP representation of the documents

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Create TfidfVectorizer, and name it vectorizer
# It is configured with the "english" stop-word setting and a vocabulary capped at 5000 features.
vectorizer = TfidfVectorizer(stop_words = "english", max_features = 5000)

In [None]:
# Fit the vectorizer on the training documents only, and transform them in the same call
# NOTE(review): .toarray() turns the result into a dense array; with up to 5000 columns
# this may need a lot of memory on a large corpus — confirm the dense form is required.
vectors_train = vectorizer.fit_transform(documents_train).toarray()

In [None]:
# Get the vocab of your tfidf
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use the new accessor when it exists and
# fall back to the old one on earlier releases. Either way `words` is a list.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

In [None]:
# Use the already-fitted vectorizer (transform only, no refit) to vectorize the test documents
vectors_test = vectorizer.transform(documents_test).toarray()
# Also vectorize the complete document array (training and test reviews together).
# NOTE(review): vectors_all is not referenced by any code visible in this file — confirm it is still needed.
vectors_all = vectorizer.transform(documents).toarray()

## Similar review search engine

In [None]:
# We will need these helper methods pretty soon

def get_top_values(lst, n, labels):
    '''
    Return the labels that belong to the n largest entries of lst.

    INPUT:
        lst    - sequence of numeric values
        n      - how many entries to select
        labels - sequence of labels aligned index-by-index with lst
    OUTPUT:
        list of labels, ordered from the largest value downwards

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    # argsort yields indices in ascending order of value, so flip it to rank
    # the largest value first.
    descending = np.argsort(lst)[::-1]
    selected = []
    for position in descending[:n]:
        selected.append(labels[position])
    return selected

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices.

    The labels are returned in descending order of value, i.e. they form the
    tail of the ranking produced by get_top_values, so the overall minimum
    comes last. A non-positive n yields an empty list.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["mouse", "rabbit"]
    '''
    # Guard first: a negative n would otherwise slice "all but the last |n|"
    # indices instead of selecting nothing.
    if n <= 0:
        return []
    # np.argsort sorts ascending, so the first n indices are the n lowest values.
    lowest = np.argsort(lst)[:n]
    # Reverse so the result reads from the largest selected value down to the minimum.
    return [labels[i] for i in lowest[::-1]]


In [None]:
# Let's use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Draw an arbitrary review from test (unseen in training) documents
pass

In [None]:
# Transform the drawn review(s) to vector(s)
pass

In [None]:
# Calculate the similarity score(s) between vector(s) and training vectors
pass

In [None]:
# Let's find top 5 similar reviews
n = 5
pass

In [None]:
print('Our search query:')
print() # To be added

In [None]:
print('Most %s similar reviews:' % n)
print()  # To be added

#### Q: Does the result make sense to you?

A: I think the similar reviews make sense.

## Classifying positive/negative review

#### Naive-Bayes Classifier

In [None]:
# Build a Naive-Bayes Classifier

# cross_val_score is imported in this cell because the only other active import
# of it appears much later in the notebook (the one near the top is commented
# out), so using it here would otherwise raise NameError.
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

# Score a MultinomialNB with 5-fold cross-validation on the training vectors.
nb = MultinomialNB()
score_nb = cross_val_score(nb, vectors_train, target_train, cv=5)

In [None]:
# Print the per-fold cross-validation scores obtained on the training set
print ('nb cv score:',score_nb)

In [None]:
# Get score for test set
pass

#### Logistic Regression Classifier

In [None]:
# Build a Logistic Regression Classifier

# GridSearchCV is imported here because no active import of it exists elsewhere
# in the visible notebook (the one near the top is commented out).
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Candidate settings: regularisation strength C crossed with the penalty type.
param_grid = [
    {'C': [0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
]
# solver='liblinear' is set explicitly because it supports both 'l1' and 'l2';
# the 'lbfgs' default of scikit-learn >= 0.22 rejects the 'l1' penalty.
# Candidates are compared by ROC AUC using 10-fold cross-validation.
lr_gridsearch = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear', random_state=1),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=10,
)
lr_gridsearch.fit(vectors_train, target_train)

In [None]:
# Print the hyper-parameter combination selected by the grid search
print ('best params:',lr_gridsearch.best_params_)

In [None]:
# Print the best score found by the grid search (scored with 'roc_auc').
# NOTE(review): the search was fitted on the training vectors only, so this is a
# cross-validation score, not a score on the held-out test set.
print ('best score:',lr_gridsearch.best_score_)

#### Q: What are the key features (words) that drive a positive prediction?

In [None]:
# Let's find it out by ranking
# NOTE(review): n is assigned here but not used by the rest of this cell; the
# word ranking itself is not computed in the code below.
n = 20

# Fit an L1-penalised logistic regression on the training vectors and predict
# labels for the test vectors.
# solver='liblinear' is explicit because the 'lbfgs' default of
# scikit-learn >= 0.22 does not support penalty='l1'.
lr = LogisticRegression(C=1, penalty='l1', solver='liblinear')
lr.fit(vectors_train, target_train)
target_test_pred = lr.predict(vectors_test)

def print_results(y_true, y_pred, y_score=None):
    '''
    INPUT: ARRAY-LIKE, ARRAY-LIKE, ARRAY-LIKE (optional)
    OUTPUT: None

    Print accuracy, precision, recall, f1-score and ROC AUC for a set of
    predictions.

    y_true  - true labels
    y_pred  - predicted class labels
    y_score - optional continuous scores (e.g. positive-class probabilities)
              used for the ROC AUC only; when omitted, y_pred is used, which
              keeps the behaviour of existing two-argument calls
    '''
    # Imported locally: the sklearn.metrics import near the top of this file is
    # commented out, so these names would otherwise be undefined at call time.
    from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                                 recall_score, roc_auc_score)

    # ROC AUC measures ranking quality; hard class labels reduce the curve to a
    # single operating point, so prefer real scores when the caller has them.
    auc_input = y_pred if y_score is None else y_score

    print("Accuracy of the Logistic Regression is: {}".format(accuracy_score(y_true, y_pred)))
    print("Precision of the Logistic Regression is: {}".format(precision_score(y_true, y_pred)))
    print("Recall of the Logistic Regression is: {}".format(recall_score(y_true, y_pred)))
    print("f1-score of the Logistic Regression is: {}".format(f1_score(y_true, y_pred)))
    print("auc score of the Logistic Regression is: {}".format(roc_auc_score(y_true, auc_input)))

print("Test set scores:")
print_results(target_test, target_test_pred)

A: (insert your comments here)

#### Q: What are the key features (words) that drive a negative prediction?

In [None]:
# Let's find it out by ranking
n = 20
pass

A: (insert your comments here)

#### Random Forest Classifier

In [None]:
# Build a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

pass

In [None]:
# Get score for training set
pass

In [None]:
# Get score for test set
pass

#### Q: What do you see from the training score and the test score?

A: The results suggest the model is overfitting: it scores better on the training set than on the test set.

#### Q: Can you tell what features (words) are important by inspecting the RFC model?

In [None]:
n = 20
pass

## Extra Credit #1: Use cross validation to evaluate your classifiers

[sklearn cross validation](http://scikit-learn.org/stable/modules/cross_validation.html)

In [None]:
# Evaluate both classifiers with 5-fold cross-validation on the training data
from sklearn.model_selection import cross_val_score

# Logistic regression with default settings (this rebinds the name lr)
lr = LogisticRegression()
score_lr = cross_val_score(lr, vectors_train, target_train, cv=5)
print ('lr cv score:',score_lr)

# Multinomial Naive-Bayes with default settings (this rebinds nb and score_nb)
nb = MultinomialNB()
score_nb = cross_val_score(nb, vectors_train, target_train, cv=5)
print ('nb cv score:',score_nb)

## Extra Credit #2: Use grid search to find the best predictive classifier


[sklearn grid search tutorial (with cross validation)](http://scikit-learn.org/stable/modules/grid_search.html#grid-search)

[sklearn grid search documentation (with cross validation)](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV)

In [None]:
# To be implemented