In [1]:
import sys
import json
import pandas as pd
import numpy as np
import string

In [2]:
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
# Read the raw text of the per-user file; it is treated as JSON Lines,
# i.e. every line of the file is parsed as its own JSON document.
with open('df_by_usr.json', 'r', encoding='UTF-8') as f:
    data = f.readlines()

# Decode each line, then hand the decoded records to pandas in one go.
df_by_usr_data = [json.loads(line) for line in data]
df_by_usr = pd.DataFrame(df_by_usr_data)

In [4]:
# Hold out a test set (scikit-learn's default split proportions apply).
# random_state pins the shuffle so the split is identical on every
# Restart & Run All; without it each run scores the models on a different
# test set and the accuracies recorded in this notebook are not comparable.
train_data, test_data, train_target, test_target = train_test_split(
    df_by_usr['text_agg'],
    df_by_usr['reviewer_label'],
    random_state=0,
)

In [5]:
# Coerce every held-out document to str so the vectorizer only sees strings.
str_test_data = [str(text) for text in test_data]

In [6]:
# Coerce every training document to str so the vectorizer only sees strings.
str_train_data = [str(text) for text in train_data]

In [9]:
# TF-IDF over unigrams and bigrams. binary=True makes the term-frequency part
# 0/1, and max_df=0.95 drops terms that occur in more than 95% of documents.
# NOTE(review): the vectorizer is fitted on the whole training set here, outside
# the grid search, so each CV fold's validation documents have already
# contributed to the vocabulary and IDF weights.
vectorizer = TfidfVectorizer(max_df=0.95, ngram_range=(1, 2), binary=True)
train_features = vectorizer.fit_transform(str_train_data)
test_features = vectorizer.transform(str_test_data)

# Single-step pipeline so the classifier's parameters are addressed as clf__*.
pipe = Pipeline(steps=[('clf', LogisticRegression())])

# Five values of C, log-spaced from 1e5 to 1e9, with solver and
# multi-class mode held fixed.
grid_params = {
    'clf__C': np.logspace(5, 9, num=5),
    'clf__multi_class': ['multinomial'],
    'clf__solver': ['lbfgs'],
}

# 3-fold cross-validated search scored on accuracy; verbose=2 logs every fit.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=grid_params,
    scoring='accuracy',
    cv=3,
    verbose=2,
)



# pipe = Pipeline([('vect', TfidfVectorizer()),
#                  ('clf', LogisticRegression())])

# params = [{'vect__ngram_range' : [(1,1),(1,2)] ,
#            'vect__max_df' : np.arange(0.8,1.0,0.1),
#            'vect__norm' : ['l1','l2', None],
#            'vect__sublinear_tf' : [True, False],
#            'clf__C' : np.logspace(6, 9, 10),
#            #'clf__penalty' : ['l1','l2', None],
#            'clf__multi_class' : ['multinomial'],
#            'clf__solver' : ['lbfgs']}]

# gs = GridSearchCV(pipe, params, verbose=1)

In [10]:
# Run the grid search on the pre-computed TF-IDF training features.
gs.fit(train_features, train_target)

# Print the best cross-validated score, then display the full parameter
# set of the winning pipeline as the cell's output.
best_pipeline = gs.best_estimator_
print(gs.best_score_)
best_pipeline.get_params()

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] clf__C=100000.0, clf__multi_class=multinomial, clf__solver=lbfgs 
[CV]  clf__C=100000.0, clf__multi_class=multinomial, clf__solver=lbfgs, total= 3.6min
[CV] clf__C=100000.0, clf__multi_class=multinomial, clf__solver=lbfgs 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.6min remaining:    0.0s


[CV]  clf__C=100000.0, clf__multi_class=multinomial, clf__solver=lbfgs, total= 3.7min
[CV] clf__C=100000.0, clf__multi_class=multinomial, clf__solver=lbfgs 
[CV]  clf__C=100000.0, clf__multi_class=multinomial, clf__solver=lbfgs, total= 3.6min
[CV] clf__C=1000000.0, clf__multi_class=multinomial, clf__solver=lbfgs 
[CV]  clf__C=1000000.0, clf__multi_class=multinomial, clf__solver=lbfgs, total= 3.8min
[CV] clf__C=1000000.0, clf__multi_class=multinomial, clf__solver=lbfgs 
[CV]  clf__C=1000000.0, clf__multi_class=multinomial, clf__solver=lbfgs, total= 3.6min
[CV] clf__C=1000000.0, clf__multi_class=multinomial, clf__solver=lbfgs 
[CV]  clf__C=1000000.0, clf__multi_class=multinomial, clf__solver=lbfgs, total= 3.6min
[CV] clf__C=10000000.0, clf__multi_class=multinomial, clf__solver=lbfgs 
[CV]  clf__C=10000000.0, clf__multi_class=multinomial, clf__solver=lbfgs, total= 3.4min
[CV] clf__C=10000000.0, clf__multi_class=multinomial, clf__solver=lbfgs 
[CV]  clf__C=10000000.0, clf__multi_class=mult

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 46.2min finished


0.7369902617163725


{'memory': None,
 'steps': [('clf',
   LogisticRegression(C=100000000.0, class_weight=None, dual=False,
             fit_intercept=True, intercept_scaling=1, max_iter=100,
             multi_class='multinomial', n_jobs=1, penalty='l2',
             random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
             warm_start=False))],
 'clf': LogisticRegression(C=100000000.0, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='multinomial', n_jobs=1, penalty='l2',
           random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
           warm_start=False),
 'clf__C': 100000000.0,
 'clf__class_weight': None,
 'clf__dual': False,
 'clf__fit_intercept': True,
 'clf__intercept_scaling': 1,
 'clf__max_iter': 100,
 'clf__multi_class': 'multinomial',
 'clf__n_jobs': 1,
 'clf__penalty': 'l2',
 'clf__random_state': None,
 'clf__solver': 'lbfgs',
 'clf__tol': 0.0001,
 'clf__verbose': 0,
 'clf__warm_start': False}

In [13]:
# Rebuild the TF-IDF features with max_df=0.98.
# This rebinds vectorizer / train_features / test_features, so any cell
# executed after this one sees these features.
vectorizer = TfidfVectorizer(max_df=0.98, ngram_range=(1, 2), binary=True)
train_features = vectorizer.fit_transform(str_train_data)
test_features = vectorizer.transform(str_test_data)

In [15]:
# Multinomial logistic regression with C=1e8 and the lbfgs solver,
# fitted on the current TF-IDF training features.
# NOTE(review): the ValueError recorded under this cell mentions an l1 penalty,
# which this code never requests -- it looks like stale output from an earlier
# version of the cell.
model = LogisticRegression(C=100000000, solver='lbfgs', multi_class='multinomial', random_state=0)
model.fit(train_features, train_target)
predicted = model.predict(test_features)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, so the value is unchanged, but the documented order keeps this
# call safe to copy for asymmetric metrics such as precision or recall.
accuracy = accuracy_score(test_target, predicted)
accuracy

ValueError: Solver lbfgs supports only l2 penalties, got l1 penalty.

74.88 - lbfgs, C=100000000, max_df=.98
74.7 - lbfgs, C=100000000
74.3 - lbfgs
73.6 - SAGA
73.4 - SAG
73.2 - newton-cg

67.3 - l1_ratio = default

In [18]:
# Logistic-loss linear model trained by SGD with an elastic-net penalty
# (l1_ratio=0.2). SGD shuffles the training data, so random_state is fixed
# -- as for the other models in this notebook -- to make the score repeatable.
model2 = SGDClassifier(loss='log', penalty='elasticnet', l1_ratio=0.2, random_state=0)
model2.fit(train_features, train_target)
predicted2 = model2.predict(test_features)

# scikit-learn metrics take (y_true, y_pred) in that order.
accuracy2 = accuracy_score(test_target, predicted2)
accuracy2

0.672372475179733

In [32]:
# Per-class precision for model2, reported in the order easy, med, hard.
# pos_label is only meaningful with average='binary' (and 2 is not one of the
# string labels), so it is not passed.
# NOTE(review): the output recorded for this cell was [0, 0.672, 0], and 0.672 is
# also model2's recorded accuracy. That pattern means model2 predicted 'med' for
# every test sample: the metric call is behaving correctly, the classifier is
# degenerate.
precision_score(test_target, predicted2, average=None, labels=['easy', 'med', 'hard'])

array([0.        , 0.67237248, 0.        ])

In [17]:
# Linear support-vector classifier on the current TF-IDF features.
model3 = LinearSVC(fit_intercept=True, random_state=0)
model3.fit(train_features, train_target)
predicted3 = model3.predict(test_features)

# scikit-learn metrics take (y_true, y_pred) in that order.
accuracy3 = accuracy_score(test_target, predicted3)
accuracy3

0.7187036403058313

In [None]:
# this code saves to csv and loads it back in a weird format with messed up review_stars values
# rest_df.to_csv('show_me_the_data/yelp_business_and_reviews', sep='\t', encoding='utf-8')
# df = pd.read_csv('show_me_the_data/yelp_business_and_reviews',sep='\t',index_col=0, encoding='utf-8')
# df.review_stars.unique()