# Yelp review binary predictions

The task is to predict if review is positive or negative using bag-of-words model on [this dataset](https://www.kaggle.com/c/yelp-reviews)

In [3]:
import pandas as pd
import csv
import numpy as np

In [4]:
filename = 'yelp_reviews_train.csv'

In [5]:
n_lines = sum(1 for row in csv.reader(open(filename)))

In [6]:
n_lines -= 1
n_lines

4084562

In [35]:
n_needed_lines = int(n_lines * 0.05)
n_needed_lines

204228

In [36]:
import random

In [37]:
# skip = sorted(random.sample(range(1, n_lines+1), n_lines-n_needed_lines))

In [38]:
skip = n_lines - n_needed_lines
skip

3880334

In [39]:
reviews_df = pd.read_csv(filename, sep=",",
                         engine="c",
                         header=None,
                         names=['id', 'text', 'is_positive'],
                         dtype={
                             'id': np.int32,
                             'text': str,
                             'is_positive': np.int8
                         },
                         skiprows=skip)

In [40]:
reviews_df.shape

(204229, 3)

In [41]:
print(reviews_df.columns)
print(reviews_df.head())

Index(['id', 'text', 'is_positive'], dtype='object')
        id                                               text  is_positive
0  3880334  Just had a very good meal here last night. It'...            1
1  3880335  I had the pleasure of working with Ritchie car...            1
2  3880336  This place has some of the best burgers hands ...            1
3  3880337  My favorite coffee place for studying in the n...            1
4  3880338  Location is perfectly in plain site of Davisvi...            1


In [42]:
Y = reviews_df['is_positive']
Y.head()

0    1
1    1
2    1
3    1
4    1
Name: is_positive, dtype: int8

In [43]:
X = reviews_df.drop('is_positive', axis=1)
X.head()['text']

0    Just had a very good meal here last night. It'...
1    I had the pleasure of working with Ritchie car...
2    This place has some of the best burgers hands ...
3    My favorite coffee place for studying in the n...
4    Location is perfectly in plain site of Davisvi...
Name: text, dtype: object

In [44]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import Lasso, LassoCV
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
#     ('clf', KNeighborsClassifier())
#     ('clf', SGDClassifier())
    ('clf', RidgeClassifier(solver='sag', alpha=1.0))
])

In [None]:
min_df_options = [1, 2]#, 5]
# n_neighbours_options = [5, 7, 10, 17, 20, 27]
param_grid = {
    'vect__min_df': min_df_options,
    'vect__use_idf': (True, False),
#     'clf__n_neighbors': n_neighbours_options
#     'clf__alpha': (0.00001, 0.000001),
#     'clf__penalty': ('l2', 'elasticnet'),
#     'clf__alpha': (0.1, 0.5, 1.0, 5.0, 10.0),
#     'clf__solver': ('svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga')
#     'clf__class_weight': ({0: 0.3, 1: 0.7}, {0: 0.5, 1: 0.5})
}
# vectorizer = CountVectorizer()
# tfidf = TfidfVectorizer(use_idf=False)
# termdoc_matrix = vectorizer.fit_transform(X['text'])
# termdoc_tfidf = tfidf.fit_transform(X['text'])
# print(termdoc_matrix.shape, termdoc_tfidf.shape, Y.shape)

In [None]:
from sklearn.metrics import roc_auc_score
def score_auc(estimator, X, y):
    y_score = estimator.predict_proba(X)
    return roc_auc_score(y, y_score)

In [None]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', iid=False)

In [None]:
# grid.fit(x_train['text'], y_train)
# grid.fit(termdoc_tfidf, Y)
grid.fit(X['text'], Y)

In [23]:
print(grid.best_estimator_)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ntercept=True, max_iter=None, normalize=False,
        random_state=None, solver='sag', tol=0.001))])


In [24]:
grid.best_params_

{'clf__alpha': 1.0,
 'clf__class_weight': {0: 0.5, 1: 0.5},
 'vect__min_df': 5,
 'vect__use_idf': True}

In [48]:
pipeline_cv = Pipeline([
    ('vect', TfidfVectorizer(min_df=3, use_idf=False)),
#     ('clf', SGDClassifier(alpha=1e-05, penalty='l2'))
#     ('clf', RidgeClassifier(solver='sag', alpha=1.0, class_weight={0: 0.5, 1: 0.5}))
    ('clf', Lasso())
])

In [49]:
pipeline_cv.fit(X['text'], Y)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...e=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [50]:
test_filename = 'yelp_review_test.csv'
test_df = pd.read_csv(test_filename,
                      sep=",",
                      engine="c",
                      dtype={'id': np.int32, 'text': str})

In [51]:
test_df.head()

Unnamed: 0,id,text
0,4084563,Elite A/V did an amazing job installing two 65...
1,4084564,I've been searching for a great haircutter sin...
2,4084565,I am going to start with how horrible my exper...
3,4084566,I got the Maki and Tempura dinner for $20.00 a...
4,4084567,This place is okay... just a basic breakfast. ...


In [52]:
test_predictions = pipeline_cv.predict(test_df['text'])
test_submission = pd.concat([test_df['id'], pd.Series(data=test_predictions)], axis=1, keys=['id', 'is_positive'])
test_submission.head()

Unnamed: 0,id,is_positive
0,4084563,0.738955
1,4084564,0.738955
2,4084565,0.738955
3,4084566,0.738955
4,4084567,0.738955


In [56]:
tt = pd.concat([test_df['id'], pd.Series(data=[1 if v >= 0.5 else 0 for v in test_predictions])], axis=1, keys=['id', 'is_positive'])
print(tt.head())
tt.to_csv('ag_submission5.csv', sep=',')

        id  is_positive
0  4084563            1
1  4084564            1
2  4084565            1
3  4084566            1
4  4084567            1


{0.7389548007383868}

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
sparse_matrix = count_vect.fit_transform(x_train['text'])
sparse_matrix.shape

(163383, 92350)

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer

In [19]:
tfidf_transformer = TfidfTransformer(use_idf=False)
sparse_matrix_tfidf = tfidf_transformer.fit_transform(sparse_matrix)
sparse_matrix_tfidf.shape

(163383, 92350)

In [20]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(sparse_matrix_tfidf, y_train)

In [23]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(MultinomialNB(), param_grid={'alpha': [1.0, 2.0, 3.0]}, cv=None)
grid.fit(sparse_matrix_tfidf, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [1.0, 2.0, 3.0]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [33]:
best_cv_err = 1 - grid.best_score_
best_n_neighbours = grid.best_estimator_.alpha
print(best_cv_err, best_n_neighbours)

clf_smart = MultinomialNB(alpha=grid.best_estimator_.alpha).fit(sparse_matrix_tfidf, y_train)
err_train_smart = np.mean(y_train != clf_smart.predict(sparse_matrix_tfidf))

0.21755629410648603 1.0


In [34]:
test_filename = 'yelp_review_test.csv'

81517

In [35]:
test_df = pd.read_csv(test_filename,
                      sep=",",
                      engine="c",
                      dtype={'id': np.int32, 'text': str})

In [36]:
print(test_df.columns)
print(test_df.head())

Index(['id', 'text'], dtype='object')
        id                                               text
0  4084563  Elite A/V did an amazing job installing two 65...
1  4084564  I've been searching for a great haircutter sin...
2  4084565  I am going to start with how horrible my exper...
3  4084566  I got the Maki and Tempura dinner for $20.00 a...
4  4084567  This place is okay... just a basic breakfast. ...


In [37]:
test_tfidf = tfidf_transformer.transform(count_vect.transform(test_df['text']))
test_tfidf.shape

(81516, 92350)

In [38]:
test_predictions = clf_smart.predict(test_tfidf)
test_predictions

array([1, 1, 1, ..., 1, 1, 0], dtype=int8)

In [39]:
clf_smart = MultinomialNB().fit(sparse_matrix_tfidf, y_train)

In [40]:
test_predictions = clf_smart.predict(test_tfidf)
test_predictions

array([1, 1, 1, ..., 1, 1, 0], dtype=int8)

In [42]:
print(test_df['id'].shape, test_predictions.shape)

(81516,) (81516,)


In [44]:
test_submission = pd.concat([test_df['id'], pd.Series(data=test_predictions)], axis=1, keys=['id', 'is_positive'])
test_submission.head()

Unnamed: 0,id,is_positive
0,4084563,1
1,4084564,1
2,4084565,1
3,4084566,1
4,4084567,1


In [45]:
test_submission.to_csv('ag_submission.csv', sep=',')

In [49]:
test1_count_vect = CountVectorizer()
test1_sparse_matrix = test1_count_vect.fit_transform(X['text'])
print(test1_sparse_matrix.shape)

test1_tfidf = TfidfTransformer(use_idf=False)
test1_sparse_matrix_tfidf = test1_tfidf.fit_transform(test1_sparse_matrix)
test1_sparse_matrix_tfidf.shape

(204229, 102759)


(204229, 102759)

In [56]:
from sklearn.linear_model import Lasso
lasso_clf = Lasso(normalize=True).fit(test1_sparse_matrix_tfidf, Y)

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
test1_tfidf2 = TfidfVectorizer().fit(X['text'])
test1_tdata = test1_tfidf2.transform(test_df['text'])

test1_pred = lasso_clf.predict(test1_tdata)
test1_subm = pd.concat([test_df['id'], pd.Series(data=test1_pred)], axis=1, keys=['id', 'is_positive'])
test1_subm.to_csv('ag_submission2.csv', sep=',')

In [58]:
test1_pred[:10]

array([0.7389548, 0.7389548, 0.7389548, 0.7389548, 0.7389548, 0.7389548,
       0.7389548, 0.7389548, 0.7389548, 0.7389548])