# Yelp review binary predictions

The task is to predict if review is positive or negative using bag-of-words model on [this dataset](https://www.kaggle.com/c/yelp-reviews)

In [1]:
import pandas as pd
import csv
import numpy as np

In [2]:
filename = 'yelp_reviews_train.csv'

In [3]:
n_lines = sum(1 for row in csv.reader(open(filename)))

In [4]:
n_lines -= 1
n_lines

4084562

In [5]:
n_needed_lines = int(n_lines * 0.05)
n_needed_lines

204228

In [6]:
import random

In [7]:
# skip = sorted(random.sample(range(1, n_lines+1), n_lines-n_needed_lines))

In [8]:
skip = n_lines - n_needed_lines
skip

3880334

In [9]:
reviews_df = pd.read_csv(filename, sep=",",
                         engine="c",
                         header=None,
                         names=['id', 'text', 'is_positive'],
                         dtype={
                             'id': np.int32,
                             'text': str,
                             'is_positive': np.int8
                         },
                         skiprows=skip)

In [10]:
reviews_df.shape

(204229, 3)

In [11]:
print(reviews_df.columns)
print(reviews_df.head())

Index(['id', 'text', 'is_positive'], dtype='object')
        id                                               text  is_positive
0  3880334  Just had a very good meal here last night. It'...            1
1  3880335  I had the pleasure of working with Ritchie car...            1
2  3880336  This place has some of the best burgers hands ...            1
3  3880337  My favorite coffee place for studying in the n...            1
4  3880338  Location is perfectly in plain site of Davisvi...            1


In [12]:
Y = reviews_df['is_positive']

In [13]:
X = reviews_df.drop('is_positive', axis=1)

In [14]:
from sklearn.model_selection import train_test_split
# TODO: add train/test split to improve classifier
# constant random_state to have repeatability
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8, random_state=27)



In [15]:
x_train.head()

Unnamed: 0,id,text
39800,3920134,Beware this medical group practices defensive ...
44889,3925223,I really enjoy the concept behind massage envy...
12588,3892922,Excellent experience! Just moved here and ne...
30669,3911003,I drove by this place everyday and finally dec...
81643,3961977,I have come to Kelly's for years. But today wa...


In [16]:
y_train.head()

39800    0
44889    1
12588    1
30669    1
81643    1
Name: is_positive, dtype: int8

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
sparse_matrix = count_vect.fit_transform(x_train['text'])
sparse_matrix.shape

(163383, 92350)

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer

In [19]:
tfidf_transformer = TfidfTransformer(use_idf=False)
sparse_matrix_tfidf = tfidf_transformer.fit_transform(sparse_matrix)
sparse_matrix_tfidf.shape

(163383, 92350)

In [20]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(sparse_matrix_tfidf, y_train)

In [23]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(MultinomialNB(), param_grid={'alpha': [1.0, 2.0, 3.0]}, cv=None)
grid.fit(sparse_matrix_tfidf, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [1.0, 2.0, 3.0]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [33]:
best_cv_err = 1 - grid.best_score_
best_n_neighbours = grid.best_estimator_.alpha
print(best_cv_err, best_n_neighbours)

clf_smart = MultinomialNB(alpha=grid.best_estimator_.alpha).fit(sparse_matrix_tfidf, y_train)
err_train_smart = np.mean(y_train != clf_smart.predict(sparse_matrix_tfidf))

0.21755629410648603 1.0


In [34]:
test_filename = 'yelp_review_test.csv'
n_lines_test = sum(1 for row in csv.reader(open(test_filename)))
n_lines_test

81517

In [35]:
test_df = pd.read_csv(test_filename,
                      sep=",",
                      engine="c",
                      dtype={'id': np.int32, 'text': str})

In [36]:
print(test_df.columns)
print(test_df.head())

Index(['id', 'text'], dtype='object')
        id                                               text
0  4084563  Elite A/V did an amazing job installing two 65...
1  4084564  I've been searching for a great haircutter sin...
2  4084565  I am going to start with how horrible my exper...
3  4084566  I got the Maki and Tempura dinner for $20.00 a...
4  4084567  This place is okay... just a basic breakfast. ...


In [37]:
test_tfidf = tfidf_transformer.transform(count_vect.transform(test_df['text']))
test_tfidf.shape

(81516, 92350)

In [38]:
test_predictions = clf_smart.predict(test_tfidf)
test_predictions

array([1, 1, 1, ..., 1, 1, 0], dtype=int8)

In [39]:
clf_smart = MultinomialNB().fit(sparse_matrix_tfidf, y_train)

In [40]:
test_predictions = clf_smart.predict(test_tfidf)
test_predictions

array([1, 1, 1, ..., 1, 1, 0], dtype=int8)

In [42]:
print(test_df['id'].shape, test_predictions.shape)

(81516,) (81516,)


In [44]:
test_submission = pd.concat([test_df['id'], pd.Series(data=test_predictions)], axis=1, keys=['id', 'is_positive'])
test_submission.head()

Unnamed: 0,id,is_positive
0,4084563,1
1,4084564,1
2,4084565,1
3,4084566,1
4,4084567,1


In [45]:
test_submission.to_csv('ag_submission.csv', sep=',')