# Yelp review binary predictions

The task is to predict whether a review is positive or negative using a bag-of-words model on [this dataset](https://www.kaggle.com/c/yelp-reviews).

In [1]:
import pandas as pd
import csv

In [2]:
# Path to the labelled training reviews CSV.
filename = "yelp_reviews_train.csv"

In [3]:
# Count CSV records rather than physical lines: csv.reader yields one item per
# record, so a quoted field that contains newlines is still counted once.
# The file is opened in a `with` block so the handle is closed as soon as the
# count is done, and with newline="" as the csv module documentation asks for
# correct handling of newlines embedded in quoted fields.
with open(filename, newline="") as csv_file:
    n_lines = sum(1 for _ in csv.reader(csv_file))

In [4]:
# Take one record off the total for the header row, leaving the number of
# data rows.
n_lines = n_lines - 1
n_lines

4084562

In [5]:
# Size of the working sample: 0.05% of the data rows, truncated to an integer.
sample_fraction = 0.0005
n_needed_lines = int(sample_fraction * n_lines)
n_needed_lines

2042

In [6]:
import random

In [7]:
# Build the sorted list of row indices that read_csv should skip.
# Instead of drawing (and then sorting) the millions of indices to drop, draw
# the few indices to KEEP and take the complement: the kept rows are the same
# kind of uniform random subset, and iterating the range in order yields the
# skip list already sorted.  Indices start at 1 so row 0 (the header) is never
# skipped.
# NOTE: the sample is not seeded, so each run selects different rows.
keep_rows = set(random.sample(range(1, n_lines + 1), n_needed_lines))
skip = [row for row in range(1, n_lines + 1) if row not in keep_rows]

In [8]:
# Load the training file, dropping every row whose index is listed in `skip`.
reviews_df = pd.read_csv(
    filename,
    sep=",",
    engine="python",
    skiprows=skip,
)

In [13]:
# Display the (rows, columns) shape of the loaded training frame.
reviews_df.shape

(2042, 3)

In [14]:
# Display the column names of the loaded training frame.
reviews_df.columns

Index(['id', 'text', 'is_positive'], dtype='object')

In [16]:
# Target vector: the `is_positive` column of the training frame.
Y = reviews_df["is_positive"]

In [18]:
# Feature frame: every column of the training frame except the target.
X = reviews_df.drop(columns="is_positive")

In [20]:
from sklearn.model_selection import train_test_split
# TODO: add a train/test split so the classifier can be evaluated on held-out data
# use a constant random_state so the split is repeatable
# x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, random_state=27)



In [39]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words step: fit the vectorizer on the review texts and convert them
# to a sparse matrix in a single pass, then display the matrix shape.
count_vect = CountVectorizer()
review_texts = X["text"]
sparse_matrix = count_vect.fit_transform(review_texts)
sparse_matrix.shape

(2042, 13720)

In [38]:
from sklearn.feature_extraction.text import TfidfTransformer

In [41]:
# TF-IDF step: fit the transformer on the count matrix, then apply it to that
# same matrix.  The fitted transformer is kept so it can be applied to other
# count matrices later.
tfidf_transformer = TfidfTransformer().fit(sparse_matrix)
sparse_matrix_tfidf = tfidf_transformer.transform(sparse_matrix)
sparse_matrix_tfidf.shape

(2042, 13720)

In [44]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial naive Bayes classifier on the TF-IDF features.
# fit() returns the estimator itself, so `clf` ends up as the fitted model.
clf = MultinomialNB()
clf = clf.fit(sparse_matrix_tfidf, Y)

In [45]:
test_filename = 'yelp_review_test.csv'
# Count the CSV records in the test file.  The file is opened in a `with`
# block so the handle is closed once counting finishes, and with newline=""
# as the csv module documentation asks for correct handling of newlines
# embedded in quoted fields.
# Nothing is subtracted here, so the header record is included in this count.
with open(test_filename, newline="") as test_csv_file:
    n_lines_test = sum(1 for _ in csv.reader(test_csv_file))
n_lines_test

81517

In [51]:
# Read only the first rows of the test file; the row count is 10% of the
# training sample size, truncated to an integer.
n_test_rows = int(n_needed_lines * 0.1)
test_df = pd.read_csv(
    test_filename,
    sep=",",
    engine="python",
    nrows=n_test_rows,
)

In [53]:
# Display the column names of the loaded test frame.
test_df.columns

Index(['id', 'text'], dtype='object')

In [55]:
# Featurize the test texts with the already-fitted vectorizer and TF-IDF
# transformer (transform only, nothing is refitted), then display the shape.
test_counts = count_vect.transform(test_df["text"])
test_tfidf = tfidf_transformer.transform(test_counts)
test_tfidf.shape

(204, 13720)

In [58]:
# Predict a label for every row of the test feature matrix and display them.
# NOTE(review): the recorded output of this cell is 1 for every row. The model
# may simply be predicting the majority class — check the label balance of the
# training sample and evaluate on held-out data before trusting these results.
test_predictions = clf.predict(test_tfidf)
test_predictions

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])