# Passive Agressive Classifier Model

In [2]:
import sys
import json
import pandas as pd
import numpy as np

In [3]:
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [4]:
with open('df_by_usr.json', 'r', encoding='UTF-8') as f:
    data = f.readlines()
    df_by_usr_data = list(map(json.loads, data))
    
df_by_usr = pd.DataFrame(df_by_usr_data)

In [5]:
df_by_usr.head(3)

Unnamed: 0,avg_star_delta,reviewer_label,text_agg,user_id
0,-0.318182,med,"Ummm, due to the star ratings on Yelp we made ...",--3WaS23LcIXtxyFULJHTA
1,-0.947368,hard,"In my opinion, this restaurant has the best fo...",--4rAAfZnEIAKJE80aIiYg
2,0.333333,med,All is right with the world. After going to th...,--CIuK7sUpaNzalLAlHJKA


In [8]:
train_data, test_data, train_target, test_target = train_test_split(df_by_usr['text_agg'],
                                                                    df_by_usr['reviewer_label'])

In [9]:
train_data[0:5]

22071    I can't recall a Thai restaurant that I didn't...
29387    Had basic Chinese food - large portions with r...
30912    I had dinner at Crop with a friend tonight. Th...
16992    Please note that my 2-star review is strictly ...
7556     Service took a while even tho it wasent very b...
Name: text_agg, dtype: object

In [10]:
train_target[0:5]

22071     med
29387    easy
30912    easy
16992    hard
7556     easy
Name: reviewer_label, dtype: object

In [11]:
str_test_data = []
for text in test_data:
    str_test_data.append(str(text))

In [12]:
str_train_data = []
for text in train_data:
    str_train_data.append(str(text))

In [13]:
vectorizer = TfidfVectorizer(binary=True, ngram_range=(1,2), max_df=0.95) 

In [27]:
train_features = vectorizer.fit_transform(str_train_data)

In [28]:
test_features = vectorizer.transform(str_test_data)

In [29]:
from sklearn.linear_model import PassiveAggressiveClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html

In [30]:
pac = PassiveAggressiveClassifier().fit(train_features, train_target)



In [31]:
pac.score(test_features, test_target)

0.7341093232911103

In [35]:
pac2 = PassiveAggressiveClassifier(fit_intercept=False).fit(train_features, train_target)
pac2.score(test_features, test_target)



0.7342234394613717

In [None]:
pac.score(test_features, test_target)

In [37]:
from sklearn.model_selection import GridSearchCV

grid_params = dict(C=np.arange(0.5, 1.5, 0.5))
gs_pac = PassiveAggressiveClassifier()
gs = GridSearchCV(estimator=gs_pac,
                  param_grid=grid_params,
                  cv=10)

gs.fit(train_features, train_target)
gs.score(test_features, test_target)



0.734679904142417

In [38]:
gs.best_params_

{'C': 1.0}

In [39]:
grid_params = dict(C=np.arange(0.9, 1.1, 0.5))
gs_pac = PassiveAggressiveClassifier(fit_intercept=False)
gs = GridSearchCV(estimator=gs_pac,
                  param_grid=grid_params,
                  cv=5)

gs.fit(train_features, train_target)
print(gs.score(test_features, test_target))
print(gs.best_params_)



{'C': 0.9}

In [41]:
gs.score(test_features, test_target)

0.7335387424398038

In [42]:
grid_params = dict(C=np.arange(0.9, 1.1, 0.01))
gs_pac = PassiveAggressiveClassifier(fit_intercept=False)
gs = GridSearchCV(estimator=gs_pac,
                  param_grid=grid_params,
                  cv=5)

gs.fit(train_features, train_target)
print(gs.score(test_features, test_target))
print(gs.best_params_)



0.7342234394613717
{'C': 1.04}


In [46]:
PassiveAggressiveClassifier(C=1.04).fit(train_features, train_target).score(test_features, test_target)

0.7342234394613717

In [48]:
PassiveAggressiveClassifier(fit_intercept=True, C=1.0).fit(train_features, train_target).score(test_features, test_target)

0.7341093232911103

In [49]:
PassiveAggressiveClassifier(C=1.04).fit(train_features, train_target).score(test_features, test_target)

0.7326258130777131

In [52]:
PassiveAggressiveClassifier(C=1.04, early_stopping=True).fit(train_features, train_target).score(test_features, test_target)

0.7338810909505877

In [53]:
grid_params = dict(C=np.arange(1.0, 1.05, 0.001))
gs_pac = PassiveAggressiveClassifier(fit_intercept=False)
gs = GridSearchCV(estimator=gs_pac,
                  param_grid=grid_params,
                  cv=5)

gs.fit(train_features, train_target)
print(gs.score(test_features, test_target))
print(gs.best_params_)



0.733995207120849
{'C': 1.0499999999999945}


In [59]:
# Best PAC Model thus far
pac_best = PassiveAggressiveClassifier(C=1.04).fit(train_features, train_target)
pac_best.score(test_features, test_target)

0.7351363688234623