### Reading Given Dataset

In [85]:
import pandas as pd
import numpy as np
import pickle
import re
import itertools
import scipy
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SelectPercentile, SelectKBest, SelectFpr, chi2, mutual_info_classif
from sklearn.model_selection import train_test_split

train_set = pd.read_csv('review_meta_train.csv')
test_set = pd.read_csv('review_meta_test.csv')

vocab = pickle.load(open("train_countvectorizer.pkl", "rb"))
vocab_dict = vocab.vocabulary_
vec_train_text_matrix = scipy.sparse.load_npz('review_text_train_vec.npz')
vec_test_text_matrix = scipy.sparse.load_npz('review_text_test_vec.npz')
doc2vec50_train_text_matrix  = pd.read_csv(r"review_text_train_doc2vec50.csv", index_col = False, delimiter = ',', header=None)
doc2vec50_test_text_matrix  = pd.read_csv(r"review_text_test_doc2vec50.csv", index_col = False, delimiter = ',', header=None)
doc2vec100_train_text_matrix  = pd.read_csv(r"review_text_train_doc2vec100.csv", index_col = False, delimiter = ',', header=None)
doc2vec100_test_text_matrix  = pd.read_csv(r"review_text_test_doc2vec100.csv", index_col = False, delimiter = ',', header=None)
doc2vec200_train_text_matrix  = pd.read_csv(r"review_text_train_doc2vec200.csv", index_col = False, delimiter = ',', header=None)
doc2vec200_test_text_matrix  = pd.read_csv(r"review_text_test_doc2vec200.csv", index_col = False, delimiter = ',', header=None)



In [41]:
train_set_top50 = pd.concat([train_set, doc2vec50_train_text_matrix], axis = 1).drop(columns = ["rating","date","review_id", "reviewer_id", "business_id"])
train_set_top100 = pd.concat([train_set, doc2vec100_train_text_matrix], axis = 1).drop(columns = ["rating","date","review_id", "reviewer_id", "business_id"])
train_set_top200 = pd.concat([train_set, doc2vec200_train_text_matrix], axis = 1).drop(columns = ["rating","date","review_id", "reviewer_id", "business_id"])
test_set_top50 = pd.concat([test_set, doc2vec50_test_text_matrix], axis = 1).drop(columns = ["date","review_id", "reviewer_id", "business_id"])
test_set_top100 = pd.concat([test_set, doc2vec100_test_text_matrix], axis = 1).drop(columns = ["date","review_id", "reviewer_id", "business_id"])
test_set_top200 = pd.concat([test_set, doc2vec200_test_text_matrix], axis = 1).drop(columns = ["date","review_id", "reviewer_id", "business_id"])

In [65]:
train_set_top50_nometa = doc2vec50_train_text_matrix
train_set_top100_nometa = doc2vec100_train_text_matrix
train_set_top200_nometa = doc2vec200_train_text_matrix
test_set_top50_nometa = doc2vec50_test_text_matrix
test_set_top100_nometa = doc2vec100_test_text_matrix
test_set_top200_nometa = doc2vec200_test_text_matrix

In [42]:
y_train = train_set["rating"]

### Best feature selection

In [132]:
from sklearn.feature_extraction.text import TfidfTransformer
#using vectorized text given only
train_countvec = vec_train_text_matrix
y_train = train_set["rating"]
#75-25 test train split
X_train_countvec, X_test_countvec, y_train_countvec, y_test_countvec = train_test_split(train_countvec, y_train, test_size=0.25, random_state=42)
features = list(vocab_dict.keys())
X_train_tfidf = TfidfTransformer().fit_transform(X_train_countvec)
X_test_tfidf = TfidfTransformer().fit_transform(X_test_countvec)

In [134]:
#best features with chi square
top50_best_chi2 = SelectKBest(chi2, k = 50).fit(X_train_countvec, y_train_countvec)
top50_best_chi2_tfidf = SelectKBest(chi2, k = 50).fit(X_train_tfidf, y_train_countvec)
top100_best_chi2 = SelectKBest(chi2, k = 100).fit(X_train_countvec, y_train_countvec)
top100_best_chi2_tfidf = SelectKBest(chi2, k = 100).fit(X_train_tfidf, y_train_countvec)
top200_best_chi2 = SelectKBest(chi2, k = 200).fit(X_train_countvec, y_train_countvec)
top200_best_chi2_tfidf = SelectKBest(chi2, k = 200).fit(X_train_tfidf, y_train_countvec)

In [135]:
#bet features with mutual info
top50_best_mi = SelectKBest(mutual_info_classif, k = 50).fit(X_train_countvec, y_train_countvec)
top50_best_mi_tfidf = SelectKBest(mutual_info_classif, k = 50).fit(X_train_tfidf, y_train_countvec)
top100_best_mi = SelectKBest(mutual_info_classif, k = 100).fit(X_train_countvec, y_train_countvec)
top100_best_mi_tfidf = SelectKBest(mutual_info_classif, k = 100).fit(X_train_tfidf, y_train_countvec)
top200_best_mi =SelectKBest(mutual_info_classif, k = 200).fit(X_train_countvec, y_train_countvec)
top200_best_mi_tfidf = SelectKBest(mutual_info_classif, k = 200).fit(X_train_tfidf, y_train_countvec)

In [None]:
k_bests = [(top50_best_chi2, top50_best_mi,top50_best_chi2_tfidf, top50_best_mi_tfidf),(top100_best_chi2, top100_best_mi, top100_best_chi2_tfidf, top100_best_mi_tfidf  ), ( top200_best_chi2, top200_best_mi, top200_best_chi2_tfidf, top200_best_mi_tfidf)]

### Classification

In [75]:
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB


In [45]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [76]:
models = [
          GaussianNB(),
          LinearSVC(C=1.0, max_iter=10000, tol = 1e-05),
          LogisticRegression(C = 1.0,solver = "lbfgs", multi_class='multinomial', max_iter=1000, tol = 1e-05)
]
titles = ['Gaussian Naive Bayes',
          'LinearSVC',
          'Logistic Regression']

X_names = ['x2', 'mi']

#### doc2vec without meta

top 50 features

In [54]:
X_train_docvec50, X_test_docvec50, y_train_docvec50, y_test_docvec50 = train_test_split(train_set_top50_nometa, y_train, test_size=0.25, random_state=42)
for title, model in zip(titles, models):
    print(title, "\n")
    X_train_t, X_test_t = X
    crossval = np.mean(cross_val_score(model, X_train_docvec50, y_train_docvec50, cv=5))
    clsfier = model.fit(X_train_docvec50,  y_train_docvec50)
    y_pred = clsfier.predict(X_test_docvec50)
    print(classification_report(  y_test_docvec50, y_pred))
    acc = model.score( X_test_docvec50, y_test_docvec50 )
    print('acc \t',  acc)
    print('crossval \t',  crossval)

Gaussian Naive Bayes 

              precision    recall  f1-score   support

           1       0.38      0.51      0.44       596
           3       0.51      0.47      0.49      1566
           5       0.83      0.82      0.83      4855

   micro avg       0.72      0.72      0.72      7017
   macro avg       0.58      0.60      0.58      7017
weighted avg       0.72      0.72      0.72      7017

acc 	 0.7164030212341457
crossval 	 0.7246217931867791
LinearSVC 





              precision    recall  f1-score   support

           1       0.70      0.42      0.53       596
           3       0.68      0.56      0.61      1566
           5       0.85      0.95      0.90      4855

   micro avg       0.82      0.82      0.82      7017
   macro avg       0.75      0.64      0.68      7017
weighted avg       0.80      0.82      0.80      7017

acc 	 0.8150206641014679
crossval 	 0.8116961770524977
Logistic Regression 

              precision    recall  f1-score   support

           1       0.68      0.49      0.57       596
           3       0.67      0.59      0.63      1566
           5       0.87      0.93      0.90      4855

   micro avg       0.82      0.82      0.82      7017
   macro avg       0.74      0.67      0.70      7017
weighted avg       0.81      0.82      0.81      7017

acc 	 0.8168733076813453
crossval 	 0.8167794594241433


In [149]:
top 100 features

SyntaxError: invalid syntax (<ipython-input-149-d8506e60d33f>, line 1)

In [56]:
X_train_docvec100, X_test_docvec100, y_train_docvec100, y_test_docvec100 = train_test_split(train_set_top100_nometa, y_train, test_size=0.25, random_state=42)
for title, model in zip(titles, models):
    print(title, "\n")
    X_train_t, X_test_t = X
    crossval = np.mean(cross_val_score(model, X_train_docvec100, y_train_docvec100, cv=5))
    clsfier = model.fit(X_train_docvec100,  y_train_docvec100)
    y_pred = clsfier.predict(X_test_docvec100)
    print(classification_report(  y_test_docvec100, y_pred))
    acc = model.score( X_test_docvec100, y_test_docvec100 )
    print('acc \t',  acc)
    print('crossval \t',  crossval)

Gaussian Naive Bayes 

              precision    recall  f1-score   support

           1       0.31      0.53      0.39       596
           3       0.46      0.45      0.46      1566
           5       0.82      0.76      0.79      4855

   micro avg       0.67      0.67      0.67      7017
   macro avg       0.53      0.58      0.54      7017
weighted avg       0.70      0.67      0.68      7017

acc 	 0.668804332335756
crossval 	 0.6698489666526914
LinearSVC 





              precision    recall  f1-score   support

           1       0.77      0.46      0.58       596
           3       0.70      0.59      0.64      1566
           5       0.86      0.95      0.90      4855

   micro avg       0.83      0.83      0.83      7017
   macro avg       0.77      0.67      0.71      7017
weighted avg       0.82      0.83      0.82      7017

acc 	 0.8251389482684908
crossval 	 0.8215290379499685
Logistic Regression 

              precision    recall  f1-score   support

           1       0.72      0.53      0.61       596
           3       0.68      0.62      0.65      1566
           5       0.88      0.93      0.90      4855

   micro avg       0.83      0.83      0.83      7017
   macro avg       0.76      0.69      0.72      7017
weighted avg       0.82      0.83      0.82      7017

acc 	 0.8275616360267921
crossval 	 0.8241890546876318


top 200 features

In [57]:
X_train_docvec200, X_test_docvec200, y_train_docvec200, y_test_docvec200 = train_test_split(train_set_top200_nometa, y_train, test_size=0.25, random_state=42)
for title, model in zip(titles, models):
    print(title, "\n")
    X_train_t, X_test_t = X
    crossval = np.mean(cross_val_score(model, X_train_docvec200, y_train_docvec200, cv=5))
    clsfier = model.fit(X_train_docvec200,  y_train_docvec200)
    y_pred = clsfier.predict(X_test_docvec200)
    print(classification_report(  y_test_docvec200, y_pred))
    acc = model.score( X_test_docvec200, y_test_docvec200 )
    print('acc \t',  acc)
    print('crossval \t',  crossval)

Gaussian Naive Bayes 

              precision    recall  f1-score   support

           1       0.23      0.53      0.32       596
           3       0.42      0.39      0.40      1566
           5       0.80      0.70      0.75      4855

   micro avg       0.61      0.61      0.61      7017
   macro avg       0.48      0.54      0.49      7017
weighted avg       0.67      0.61      0.63      7017

acc 	 0.6129400028502209
crossval 	 0.6138901052177602
LinearSVC 





              precision    recall  f1-score   support

           1       0.76      0.48      0.59       596
           3       0.71      0.62      0.66      1566
           5       0.87      0.95      0.91      4855

   micro avg       0.83      0.83      0.83      7017
   macro avg       0.78      0.68      0.72      7017
weighted avg       0.82      0.83      0.82      7017

acc 	 0.8325495225880005
crossval 	 0.8265157286697621
Logistic Regression 

              precision    recall  f1-score   support

           1       0.71      0.55      0.62       596
           3       0.70      0.64      0.67      1566
           5       0.89      0.94      0.91      4855

   micro avg       0.84      0.84      0.84      7017
   macro avg       0.76      0.71      0.73      7017
weighted avg       0.83      0.84      0.83      7017

acc 	 0.8361122987031495
crossval 	 0.8295085465275273


#### doc2vec with meta (voting result only)

Top 50 + meta

In [47]:
X_train_docvec50, X_test_docvec50, y_train_docvec50, y_test_docvec50 = train_test_split(train_set_top50, y_train, test_size=0.25, random_state=42)
for title, model in zip(titles, models):
    print(title, "\n")
    X_train_t, X_test_t = X
    crossval = np.mean(cross_val_score(model, X_train_docvec50, y_train_docvec50, cv=5))
    clsfier = model.fit(X_train_docvec50,  y_train_docvec50)
    y_pred = clsfier.predict(X_test_docvec50)
    print(classification_report(  y_test_docvec50, y_pred))
    acc = model.score( X_test_docvec50, y_test_docvec50 )
    print('acc \t',  acc)
    print('crossval \t',  crossval)

Gaussian Naive Bayes 

              precision    recall  f1-score   support

           1       0.36      0.41      0.39       596
           3       0.49      0.57      0.52      1566
           5       0.85      0.79      0.82      4855

   micro avg       0.71      0.71      0.71      7017
   macro avg       0.57      0.59      0.58      7017
weighted avg       0.73      0.71      0.72      7017

acc 	 0.7075673364685763
crossval 	 0.7179231734149081
LinearSVC 





              precision    recall  f1-score   support

           1       0.72      0.43      0.54       596
           3       0.68      0.56      0.61      1566
           5       0.85      0.95      0.90      4855

   micro avg       0.82      0.82      0.82      7017
   macro avg       0.75      0.65      0.68      7017
weighted avg       0.81      0.82      0.81      7017

acc 	 0.8171583297705572
crossval 	 0.8156861570227069
Logistic Regression 

              precision    recall  f1-score   support

           1       0.70      0.48      0.57       596
           3       0.67      0.60      0.64      1566
           5       0.87      0.93      0.90      4855

   micro avg       0.82      0.82      0.82      7017
   macro avg       0.75      0.67      0.70      7017
weighted avg       0.81      0.82      0.81      7017

acc 	 0.8218611942425538
crossval 	 0.8204371573415383


Top 100 + meta

In [48]:
X_train_docvec100, X_test_docvec100, y_train_docvec100, y_test_docvec100 = train_test_split(train_set_top100, y_train, test_size=0.25, random_state=42)
for title, model in zip(titles, models):
    print(title, "\n")
    X_train_t, X_test_t = X
    crossval = np.mean(cross_val_score(model, X_train_docvec100, y_train_docvec100, cv=5))
    clsfier = model.fit(X_train_docvec100,  y_train_docvec100)
    y_pred = clsfier.predict(X_test_docvec100)
    print(classification_report(  y_test_docvec100, y_pred))
    acc = model.score( X_test_docvec100, y_test_docvec100 )
    print('acc \t',  acc)
    print('crossval \t',  crossval)

Gaussian Naive Bayes 

              precision    recall  f1-score   support

           1       0.32      0.46      0.38       596
           3       0.44      0.53      0.48      1566
           5       0.83      0.73      0.78      4855

   micro avg       0.66      0.66      0.66      7017
   macro avg       0.53      0.57      0.55      7017
weighted avg       0.70      0.66      0.68      7017

acc 	 0.6642439789083654
crossval 	 0.6684717118765812
LinearSVC 





              precision    recall  f1-score   support

           1       0.77      0.48      0.59       596
           3       0.70      0.59      0.64      1566
           5       0.86      0.95      0.90      4855

   micro avg       0.83      0.83      0.83      7017
   macro avg       0.78      0.67      0.71      7017
weighted avg       0.82      0.83      0.82      7017

acc 	 0.8281316802052159
crossval 	 0.8259944384196907
Logistic Regression 

              precision    recall  f1-score   support

           1       0.75      0.54      0.63       596
           3       0.69      0.63      0.65      1566
           5       0.88      0.93      0.90      4855

   micro avg       0.83      0.83      0.83      7017
   macro avg       0.77      0.70      0.73      7017
weighted avg       0.82      0.83      0.83      7017

acc 	 0.830981901097335
crossval 	 0.8293196511709848


Top 200 + meta

In [49]:
X_train_docvec200, X_test_docvec200, y_train_docvec200, y_test_docvec200 = train_test_split(train_set_top200, y_train, test_size=0.25, random_state=42)
for title, model in zip(titles, models):
    print(title, "\n")
    crossval = np.mean(cross_val_score(model, X_train_docvec200, y_train_docvec200, cv=5))
    clsfier = model.fit(X_train_docvec200,  y_train_docvec200)
    y_pred = clsfier.predict(X_test_docvec200)
    print(classification_report(  y_test_docvec200, y_pred))
    acc = model.score( X_test_docvec200, y_test_docvec200 )
    print('acc \t',  acc)
    print('crossval \t',  crossval)

Gaussian Naive Bayes 

              precision    recall  f1-score   support

           1       0.25      0.48      0.33       596
           3       0.39      0.45      0.42      1566
           5       0.80      0.68      0.74      4855

   micro avg       0.61      0.61      0.61      7017
   macro avg       0.48      0.53      0.49      7017
weighted avg       0.67      0.61      0.63      7017

acc 	 0.6099472709134958
crossval 	 0.6165969621361289
LinearSVC 





              precision    recall  f1-score   support

           1       0.77      0.48      0.60       596
           3       0.71      0.61      0.66      1566
           5       0.87      0.95      0.90      4855

   micro avg       0.83      0.83      0.83      7017
   macro avg       0.79      0.68      0.72      7017
weighted avg       0.82      0.83      0.82      7017

acc 	 0.8326920336326066
crossval 	 0.8292712086521294
Logistic Regression 

              precision    recall  f1-score   support

           1       0.73      0.56      0.63       596
           3       0.70      0.65      0.67      1566
           5       0.89      0.93      0.91      4855

   micro avg       0.84      0.84      0.84      7017
   macro avg       0.77      0.71      0.74      7017
weighted avg       0.83      0.84      0.83      7017

acc 	 0.8383924754168448
crossval 	 0.8329767623166626


### Frequency Vectorizer and TF-IDF train

Multinomial Naive Bayes, SVM, Logistic

In [50]:
models = [
          MultinomialNB(),
          LinearSVC(C=1.0, max_iter=10000, tol = 1e-05),
          LogisticRegression(C = 1.0,solver = "lbfgs", multi_class='multinomial', max_iter=1000, tol = 1e-05)
]
titles = ['Multinomial Naive Bayes',
          'LinearSVC',
          'Logistic Regression']
X_names = ['x2', 'mi']

Gaussian Naive Bayes

In [63]:
index = 0
model = GaussianNB()
titles = ["Gaussian NB"]
for k_best_chi2, k_best_mi,k_best_chi2_tfidf, k_best_mi_tfidf in k_bests:
    if(index == 0):
        print("----Top 50 Features----\n")
    elif(index == 1):
        print("----Top 100 Features----\n")
    elif(index == 2):
        print("----Top 200 Features----\n")
    X_train_countvec_x2 = k_best_chi2.transform(X_train_countvec)
    X_test_countvec_x2 = k_best_chi2.transform(X_test_countvec)
    X_train_countvec_mi = k_best_mi.transform(X_train_countvec)
    X_test_countvec_mi = k_best_mi.transform(X_test_countvec)


    X_train_tfidf_x2 = k_best_chi2_tfidf.transform(X_train_tfidf)
    X_test_tfidf_x2 = k_best_chi2_tfidf.transform(X_test_tfidf)
    X_train_tfidf_mi = k_best_mi_tfidf.transform(X_train_tfidf)
    X_test_tfidf_mi = k_best_mi_tfidf.transform(X_test_tfidf)


    Xs_count_vec = [(X_train_countvec_x2, X_test_countvec_x2), (X_train_countvec_mi,X_test_countvec_mi)]
    Xs_tfidf = [(X_train_tfidf_x2,X_test_tfidf_x2), (X_train_tfidf_mi,X_test_tfidf_mi)]

    for title, model in zip(titles, models):
        print(title + "\n")
        print("Frequency Vectorizer")
        for X_name, X in zip(X_names, Xs_count_vec):
            print(X_name, '\n')
            X_train_t, X_test_t = X
            crossval = np.mean(cross_val_score(model, X_train_t.todense(), y_train_countvec, cv=5))
            clsfier = model.fit(X_train_t.todense(), y_train_countvec)
            y_pred = clsfier.predict(X_test_t.todense())
            print(classification_report(y_test_countvec, y_pred))
            acc = model.score(X_test_t.todense(),y_test_countvec)
            print('acc \t',  acc)
            print('crossval \t',  crossval, "\n")
        print("TF IDF Vectorizer")
        for X_name, X in zip(X_names, Xs_tfidf):
            print(X_name, '\n')
            X_train_t, X_test_t = X
            crossval = np.mean(cross_val_score(model, X_train_t.todense(), y_train_countvec, cv=5))
            clsfier = model.fit(X_train_t.todense(), y_train_countvec)
            y_pred = clsfier.predict(X_test_t.todense())
            print(classification_report(y_test_countvec, y_pred))
            acc = model.score(X_test_t.todense(),y_test_countvec)
            print('acc \t',  acc)
            print('crossval \t',  crossval, "\n")
    index = index + 1

----Top 50 Features----

Gaussian NB

Frequency Vectorizer
x2 

              precision    recall  f1-score   support

           1       0.53      0.55      0.54       596
           3       0.52      0.34      0.41      1566
           5       0.81      0.89      0.85      4855

   micro avg       0.74      0.74      0.74      7017
   macro avg       0.62      0.60      0.60      7017
weighted avg       0.72      0.74      0.72      7017

acc 	 0.7407724098617643
crossval 	 0.7413431622109536 

mi 

              precision    recall  f1-score   support

           1       0.56      0.59      0.58       596
           3       0.54      0.38      0.45      1566
           5       0.82      0.89      0.85      4855

   micro avg       0.75      0.75      0.75      7017
   macro avg       0.64      0.62      0.62      7017
weighted avg       0.73      0.75      0.74      7017

acc 	 0.7501781388057575
crossval 	 0.7514613521398182 

TF IDF Vectorizer
x2 

              precision    recal

In [24]:
index = 0
for k_best_chi2, k_best_mi in k_bests:
    if(index == 0):
        print("----Top 50 Features----\n")
    elif(index == 1):
        print("----Top 100 Features----\n")
    elif(index == 2):
        print("----Top 200 Features----\n")
    X_train_countvec_x2 = k_best_chi2.transform(X_train_countvec)
    X_test_countvec_x2 = k_best_chi2.transform(X_test_countvec)
    X_train_countvec_mi = k_best_mi.transform(X_train_countvec)
    X_test_countvec_mi = k_best_mi.transform(X_test_countvec)


    X_train_tfidf_x2 = k_best_chi2.transform(X_train_tfidf)
    X_test_tfidf_x2 = k_best_chi2.transform(X_test_tfidf)
    X_train_tfidf_mi = k_best_mi.transform(X_train_tfidf)
    X_test_tfidf_mi = k_best_mi.transform(X_test_tfidf)


    Xs_count_vec = [(X_train_countvec_x2, X_test_countvec_x2), (X_train_countvec_mi,X_test_countvec_mi)]
    Xs_tfidf = [(X_train_tfidf_x2,X_test_tfidf_x2), (X_train_tfidf_mi,X_test_tfidf_mi)]

    for title, model in zip(titles, models):
        print(title + "\n")
        print("Frequency Vectorizer")
        for X_name, X in zip(X_names, Xs_count_vec):
            print(X_name, '\n')
            X_train_t, X_test_t = X
            crossval = np.mean(cross_val_score(model, X_train_t.todense(), y_train_countvec, cv=5))
            clsfier = model.fit(X_train_t.todense(), y_train_countvec)
            y_pred = clsfier.predict(X_test_t.todense())
            print(classification_report(y_test_countvec, y_pred))
            acc = model.score(X_test_t.todense(),y_test_countvec)
            print('acc \t',  acc)
            print('crossval \t',  crossval, "\n")
        print("TF IDF Vectorizer")
        for X_name, X in zip(X_names, Xs_tfidf):
            print(X_name, '\n')
            X_train_t, X_test_t = X
            crossval = np.mean(cross_val_score(model, X_train_t.todense(), y_train_countvec, cv=5))
            clsfier = model.fit(X_train_t.todense(), y_train_countvec)
            y_pred = clsfier.predict(X_test_t.todense())
            print(classification_report(y_test_countvec, y_pred))
            acc = model.score(X_test_t.todense(),y_test_countvec)
            print('acc \t',  acc)
            print('crossval \t',  crossval, "\n")
    index = index + 1

----Top 50 Features----

Multinomial Naive Bayes

Frequency Vectorizer
x2 

              precision    recall  f1-score   support

           1       0.64      0.53      0.58       596
           3       0.60      0.48      0.54      1566
           5       0.83      0.91      0.87      4855

   micro avg       0.78      0.78      0.78      7017
   macro avg       0.69      0.64      0.66      7017
weighted avg       0.77      0.78      0.77      7017

acc 	 0.7796779250391905
crossval 	 0.7783959004065595 

mi 

              precision    recall  f1-score   support

           1       0.67      0.53      0.59       596
           3       0.61      0.52      0.56      1566
           5       0.85      0.91      0.87      4855

   micro avg       0.79      0.79      0.79      7017
   macro avg       0.71      0.65      0.68      7017
weighted avg       0.78      0.79      0.78      7017

acc 	 0.7885136098047598
crossval 	 0.7884664828378583 

TF IDF Vectorizer
x2 

              precis



              precision    recall  f1-score   support

           1       0.72      0.43      0.54       596
           3       0.65      0.37      0.48      1566
           5       0.80      0.95      0.87      4855

   micro avg       0.78      0.78      0.78      7017
   macro avg       0.73      0.59      0.63      7017
weighted avg       0.76      0.78      0.75      7017

acc 	 0.7785378366823429
crossval 	 0.7801055500761985 

mi 





              precision    recall  f1-score   support

           1       0.72      0.46      0.56       596
           3       0.66      0.41      0.51      1566
           5       0.81      0.95      0.88      4855

   micro avg       0.79      0.79      0.79      7017
   macro avg       0.73      0.61      0.65      7017
weighted avg       0.77      0.79      0.77      7017

acc 	 0.7886561208493659
crossval 	 0.788419033317293 

TF IDF Vectorizer
x2 

              precision    recall  f1-score   support

           1       0.75      0.42      0.54       596
           3       0.67      0.41      0.51      1566
           5       0.81      0.95      0.88      4855

   micro avg       0.79      0.79      0.79      7017
   macro avg       0.74      0.59      0.64      7017
weighted avg       0.77      0.79      0.76      7017

acc 	 0.7869459883140943
crossval 	 0.7857583846716271 

mi 

              precision    recall  f1-score   support

           1       0.76      0.42      0.5



              precision    recall  f1-score   support

           1       0.74      0.51      0.60       596
           3       0.68      0.43      0.53      1566
           5       0.82      0.95      0.88      4855

   micro avg       0.80      0.80      0.80      7017
   macro avg       0.75      0.63      0.67      7017
weighted avg       0.78      0.80      0.78      7017

acc 	 0.7963517172580875
crossval 	 0.7985849210162954 

mi 





              precision    recall  f1-score   support

           1       0.72      0.49      0.58       596
           3       0.69      0.47      0.56      1566
           5       0.83      0.95      0.89      4855

   micro avg       0.80      0.80      0.80      7017
   macro avg       0.75      0.64      0.68      7017
weighted avg       0.79      0.80      0.79      7017

acc 	 0.8041898247114151
crossval 	 0.8059002717147992 

TF IDF Vectorizer
x2 

              precision    recall  f1-score   support

           1       0.77      0.49      0.60       596
           3       0.68      0.48      0.56      1566
           5       0.83      0.95      0.89      4855

   micro avg       0.81      0.81      0.81      7017
   macro avg       0.76      0.64      0.68      7017
weighted avg       0.79      0.81      0.79      7017

acc 	 0.8054724241128688
crossval 	 0.8061857812913367 

mi 

              precision    recall  f1-score   support

           1       0.75      0.48      0.



              precision    recall  f1-score   support

           1       0.76      0.56      0.64       596
           3       0.70      0.51      0.59      1566
           5       0.85      0.95      0.90      4855

   micro avg       0.82      0.82      0.82      7017
   macro avg       0.77      0.67      0.71      7017
weighted avg       0.81      0.82      0.81      7017

acc 	 0.8174433518597691
crossval 	 0.8166837027935806 

mi 





              precision    recall  f1-score   support

           1       0.76      0.57      0.65       596
           3       0.71      0.52      0.60      1566
           5       0.85      0.95      0.90      4855

   micro avg       0.82      0.82      0.82      7017
   macro avg       0.77      0.68      0.71      7017
weighted avg       0.81      0.82      0.81      7017

acc 	 0.8225737494655836
crossval 	 0.8222417511885937 

TF IDF Vectorizer
x2 

              precision    recall  f1-score   support

           1       0.77      0.54      0.64       596
           3       0.70      0.53      0.61      1566
           5       0.85      0.95      0.90      4855

   micro avg       0.82      0.82      0.82      7017
   macro avg       0.78      0.68      0.71      7017
weighted avg       0.81      0.82      0.81      7017

acc 	 0.8225737494655836
crossval 	 0.8260420684853995 

mi 

              precision    recall  f1-score   support

           1       0.79      0.55      0.

### Chosen Learner

In [118]:
import seaborn as sns
model = LogisticRegression(C = 1.0,solver = "lbfgs", multi_class='multinomial', max_iter=1000, tol = 1e-05)
category = ["1", "3", "5"]
X_train_docvec200, X_test_docvec200, y_train_docvec200, y_test_docvec200 = train_test_split(train_set_top200, y_train, test_size=0.25, random_state=42)
crossval = np.mean(cross_val_score(model, X_train_docvec200, y_train_docvec200, cv=5))
clsfier = model.fit(X_train_docvec200,  y_train_docvec200)
y_pred = clsfier.predict(X_test_docvec200)
print(classification_report(y_test_docvec200, y_pred))
acc = model.score( X_test_docvec200, y_test_docvec200 )
print('acc \t',  acc)
print('crossval \t',  crossval)
df = pd.DataFrame(confusion_matrix(y_test_docvec200, y_pred), index=category, columns=category)
print(df)

              precision    recall  f1-score   support

           1       0.73      0.56      0.63       596
           3       0.70      0.65      0.67      1566
           5       0.89      0.93      0.91      4855

   micro avg       0.84      0.84      0.84      7017
   macro avg       0.77      0.71      0.74      7017
weighted avg       0.83      0.84      0.83      7017

acc 	 0.8383924754168448
crossval 	 0.8329767623166626
     1     3     5
1  333   142   121
3   89  1012   465
5   34   283  4538


### Kaggle Submission

In [67]:
model = LogisticRegression(C = 1.0,solver = "lbfgs", multi_class='multinomial', max_iter=1000, tol = 1e-05)

clsfier = model.fit(train_set_top200,  y_train)
y_pred = clsfier.predict(test_set_top200)

In [72]:
import csv

with open("kaggleresult.csv", "w") as f:
    writer = csv.writer(f)
    i = 1
    writer.writerow(["Instance_id", "rating"])
    for predictions in y_pred:
        writer.writerow([str(i), str(predictions)]);
        i += 1

In [139]:
model = LogisticRegression(C = 1.0,solver = "lbfgs", multi_class='multinomial', max_iter=1000, tol = 1e-05)

clsfier = model.fit(k_best_mi.transform(vec_train_text_matrix).todense(),  y_train)
y_pred = clsfier.predict(k_best_mi.transform(vec_test_text_matrix.todense()))

In [140]:
with open("kaggleresult2.csv", "w") as f:
    writer = csv.writer(f)
    i = 1
    writer.writerow(["Instance_id", "rating"])
    for predictions in y_pred:
        writer.writerow([str(i), str(predictions)]);
        i += 1

In [141]:
model = LogisticRegression(C = 1.0,solver = "lbfgs", multi_class='multinomial', max_iter=1000, tol = 1e-05)

clsfier = model.fit(train_set_top200_nometa,  y_train)
y_pred = clsfier.predict(test_set_top200_nometa)

In [143]:
with open("kaggleresult3.csv", "w") as f:
    writer = csv.writer(f)
    i = 1
    writer.writerow(["Instance_id", "rating"])
    for predictions in y_pred:
        writer.writerow([str(i), str(predictions)]);
        i += 1

In [144]:
model = LinearSVC(C=1.0, max_iter=10000, tol = 1e-05)

clsfier = model.fit(train_set_top200_nometa,  y_train)
y_pred = clsfier.predict(test_set_top200_nometa)



In [148]:
with open("kaggleresult4.csv", "w") as f:
    writer = csv.writer(f)
    i = 1
    writer.writerow(["Instance_id", "rating"])
    for predictions in y_pred:
        writer.writerow([str(i), str(predictions)]);
        i += 1

In [146]:
model = LinearSVC(C=1.0, max_iter=10000, tol = 1e-05)

clsfier = model.fit(train_set_top200,  y_train)
y_pred = clsfier.predict(test_set_top200)



In [147]:
with open("kaggleresult5.csv", "w") as f:
    writer = csv.writer(f)
    i = 1
    writer.writerow(["Instance_id", "rating"])
    for predictions in y_pred:
        writer.writerow([str(i), str(predictions)]);
        i += 1