# Classification Algorithms

In [4]:
#   packages
import gc
import os

import json
import pandas as pd
import numpy as np

import collections

from sklearn.model_selection import train_test_split

## Vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Classifiers
##> Logistic Regression
from sklearn.linear_model import LogisticRegression

## prediction score
from sklearn.metrics import accuracy_score

In [5]:
# Load the pre-processed reviews. The context manager closes the file handle
# as soon as the JSON has been parsed instead of leaving it open until the
# garbage collector happens to reclaim it.
with open('data/yelp_example_1.json') as json_file:
    preProcDF = pd.DataFrame.from_dict(json.load(json_file))

# Join the elements of each "docs" entry with single spaces so that every row
# becomes one string (presumably "docs" holds token lists — TODO confirm).
documents = preProcDF["docs"].str.join(" ")

# TF-IDF vectorizer
# NOTE(review): the vectorizer is fitted on all documents before the
# train/test split below, so its vocabulary and IDF weights also see the rows
# that end up in the test set — confirm this is intended.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(documents).toarray()  # dense copy of the sparse matrix

# Train/Test
labels = preProcDF.scores.tolist()
category_list = preProcDF.scores.unique()
# 85% of the rows go to training; stratifying on the labels keeps the class
# proportions the same in both splits, and random_state makes the split repeatable.
X_tfidf_train, X_tfidf_test, y_tfidf_train, y_tfidf_test = train_test_split(X_tfidf, labels, train_size=0.85, stratify=labels, random_state=1)

## Logistic Regression

In [6]:
# Fit one liblinear logistic regression per value of C and print the accuracy
# each one reaches on the held-out split.
for c_param in (1.0, 10.0, 50.0, 100.0, 1000.0):
    model = LogisticRegression(C=c_param, solver='liblinear', random_state=0)
    model.fit(X_tfidf_train, y_tfidf_train)
    y_tfidf_test_pred = model.predict(X_tfidf_test)
    test_accuracy = accuracy_score(y_tfidf_test, y_tfidf_test_pred)
    print(c_param, "Accuracy:", test_accuracy)

1.0 Accuracy: 0.556
10.0 Accuracy: 0.552
50.0 Accuracy: 0.5186666666666667
100.0 Accuracy: 0.516
1000.0 Accuracy: 0.5


## Hyperparameter Tuning

In [7]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# Run a garbage-collection pass before starting the search.
gc.collect()

# Candidate settings: three solvers crossed with three values of C; every
# other entry is pinned to a single value.
# NOTE(review): recent scikit-learn releases report `multi_class` as
# deprecated — confirm against the installed version.
param_grid = dict(
    solver=["liblinear", "lbfgs", "newton-cg"],
    penalty=["l2"],
    C=[1.0, 5.0, 10.0],
    fit_intercept=[True],  # whether a constant (a.k.a. bias or intercept) is added to the decision function
    multi_class=['auto'],
)

LR_model = LogisticRegression(max_iter=1000)

# 10 stratified folds, repeated twice with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=1)

# Accuracy-scored exhaustive search over param_grid; error_score=0 records a
# score of 0 for a candidate whose fit fails.
grid_search = GridSearchCV(
    estimator=LR_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_tfidf_train, y_tfidf_train)

In [10]:
# Report the best cross-validated score with its parameters, then list the
# mean and standard deviation of the test score for every candidate.
cv_results = grid_result.cv_results_
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.555059 using {'C': 5.0, 'fit_intercept': True, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'liblinear'}
0.547059 (0.015075) with: {'C': 1.0, 'fit_intercept': True, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'liblinear'}
0.550471 (0.016778) with: {'C': 1.0, 'fit_intercept': True, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'lbfgs'}
0.550471 (0.016778) with: {'C': 1.0, 'fit_intercept': True, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'newton-cg'}
0.555059 (0.019085) with: {'C': 5.0, 'fit_intercept': True, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'liblinear'}
0.551765 (0.018787) with: {'C': 5.0, 'fit_intercept': True, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'lbfgs'}
0.551765 (0.018787) with: {'C': 5.0, 'fit_intercept': True, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'newton-cg'}
0.548471 (0.019157) with: {'C': 10.0, 'fit_intercept': True, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'liblinear'}
0.545529 (0.021530) with: {'

In [11]:
# Predict the held-out split through the fitted grid search and print the
# resulting accuracy.
y_tfidf_test_pred = grid_result.predict(X_tfidf_test)
test_accuracy = accuracy_score(y_tfidf_test, y_tfidf_test_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.556
