# STEP5: Sentiment Classification with Machine Learning

## Import Dependencies

In [1]:
import os
import time
import pandas as pd
import numpy as np
import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [4]:
import joblib
from scipy import sparse, io

-------------

## Import Uploaded Dataset

In [5]:
vaders = pd.read_csv('source/vaders.csv')

import csv file 

### Train & Test Decomposition

On supervised learning we need to train algorithms with datasets train part and testing that trained dataset.

In [6]:
x_train, x_test, y_train, y_test = train_test_split(vaders["Text"], vaders["SScoring"], test_size=0.25, random_state=1)

-----------

## Import Vectorizers

### Count Vectors

In [None]:
x_train_count = io.mmread('vectorizers/x_train_count.mtx')
x_test_count = io.mmread('vectorizers/x_test_count.mtx')

### TF-IDF

#### custom n-gram level

In [7]:
x_train_tf_idf_ngram = io.mmread("vectorizers/x_train_tf_idf_ngram")
x_test_tf_idf_ngram = io.mmread("vectorizers/x_test_tf_idf_ngram")

#### word level n_gram

In [None]:
x_train_tf_idf_word = io.mmread("vectorizers/x_train_tf_idf_word")
x_test_tf_idf_word = io.mmread("vectorizers/x_test_tf_idf_word")

#### charachter level n_gram

In [None]:
x_train_tf_idf_chars = io.mmread("vectorizers/x_train_tf_idf_chars")
x_test_tf_idf_chars = io.mmread("vectorizers/x_test_tf_idf_chars")

### Scorer

In [9]:
def matrix_metrix(y_test,y_pred,beta):
   CM = confusion_matrix(y_test,y_pred)
   TN = CM[0][0]
   FN = CM[1][0] 
   TP = CM[1][1]
   FP = CM[0][1]

   Recall     = round( TP / (TP+FN),4 ) 
   Accuracy   = round( (TP+TN) / (TN+FN+TP+FP),4)
   Precision  = round( TP / (TP+FP),4 )
   F1         = round ( 2 * ((Precision*Recall)/(Precision+Recall)),4)

   mat_met = pd.DataFrame({
                'Metric':['TP','TN','FP','FN','Recall','Accuracy','Precision','F1'],
                'Value':[TP,TN,FP,FN,Recall,Accuracy,Precision,F1]})

   return (mat_met)

----------

## Run ML Part

- My pc has not enough sources
- cpu is quiet slow
- these steps running on kaggle
- results will give
- also kaggle's given ram is not enough so i use mtx files above

### Random Forest Classifier

#### Model & Prediction

##### Count Vectors

In [None]:
rf = RandomForestClassifier(verbose=2, n_jobs=1, random_state=1)

In [None]:
print("--> Checking for physical Tensorflow devices")
for device in tf.config.list_physical_devices():
    print(": {}".format(device.name))

In [None]:
with tf.device('/gpu:0'):
    rf_model_count = rf.fit(x_train_count, y_train)

This process takes 2 hour 27 minutes

In [None]:
# train accuracy
accuracy_score(y_train,rf_model_count.predict(x_train_count))

In [None]:
# test accuracy
y_pred_rf_count = rf_model_count.predict(x_test_count)
accuracy_score(y_test, y_pred_rf_count)

In [None]:
beta = 0.4
mat_met = matrix_metrix(y_test,y_pred_rf_count,beta)
print (mat_met)

##### n-gram level

In [None]:
rf_model_ngram = joblib.load("models/rf_model_ngram.joblib")

In [None]:
with tf.device('/gpu:0'):
    rf_model_ngram = rf.fit(x_train_tf_idf_ngram, y_train)

In [None]:
accuracy_score(y_train,rf_model_ngram.predict(x_train_tf_idf_ngram))

In [None]:
y_pred_rf_ngram = rf_model_ngram.predict(x_test_tf_idf_ngram)
accuracy_score(y_test, y_pred_rf_ngram)

This process takes 7 hour 10 minutes. Really slow because didn't remain n_jobs parameter. n_jobs run multi process.

In [None]:
beta = 0.4
mat_met = matrix_metrix(y_test,y_pred_rf_ngram,beta)
print (mat_met)

##### word level

In [None]:
with tf.device('/gpu:0'):
    rf_model_word = rf.fit(x_train_tf_idf_word, y_train)

In [None]:
accuracy_score(y_train,rf_model_word.predict(x_train_tf_idf_word))

In [None]:
y_pred_rf_word = rf_model_word.predict(x_test_tf_idf_word)
accuracy_score(y_test, y_pred_rf_word)

In [None]:
beta = 0.4
mat_met = matrix_metrix(y_test,y_pred_rf_word,beta)
print (mat_met)

##### character level

In [None]:
with tf.device('/gpu:0'):
    rf_model_chars = rf.fit(x_train_tf_idf_chars, y_train)

In [None]:
accuracy_score(y_train,rf_model_chars.predict(x_train_tf_idf_chars))

In [None]:
y_pred_rf_chars = rf_model_chars.predict(x_test_tf_idf_chars)
accuracy_score(y_test, y_pred_rf_chars)

In [None]:
beta = 0.4
mat_met = matrix_metrix(y_test,y_pred_rf_chars,beta)
print (mat_met)

-----------

#### Model Tuning

These steps implements rf model tuning. Hard to run so I won't

In [None]:
rf_params = {"n_estimators": [500,1000, 2000],
             "max_depth": [5,8,10],
             "max_features": [2,5,8],
             "min_samples_split": [2,5,8]}

In [None]:
with tf.device('/gpu:0'):
    rf_cv = GridSearchCV(rf_model_word, rf_params, cv=10, n_jobs=-1, verbose=2)
    rf_cv.fit(x_train_tf_idf, y_train)

In [None]:
print("Best Score: "+ str(rf_cv.best_score_))
print("Best Parameters: "+ str(rf_cv.best_params_))

In [None]:
with tf.device('/gpu:0'):
    rf_tuned = RandomForestClassifier(max_depth=8,
                                     max_features=8,
                                     min_samples_split=5,
                                     n_estimators=1000)

    rf_tuned.fit(x_train_tf_idf_word, y_train)

In [None]:
y_pred = rf_tuned.predict(x_test)
accuracy_score(y_test, y_pred)

----------------

### XGBoost Classifier

#### Model & Prediction

##### Count Vectors

In [None]:
xgb = XGBClassifier()

In [None]:
with tf.device('/gpu:0'):
    xgb_model_count = xgb.fit(x_train_count, y_train, verbose=2)

XGBoost training almost 22 times faster than Random Forests!!! It' s nearly takes 7 minutes

In [None]:
accuracy_score(y_train,  xgb_model_count.predict(x_train_count))

In [None]:
y_pred_xgb_count = xgb_model_count.predict(x_test_count)
accuracy_score(y_test, y_pred_xgb_count)

In [None]:
beta = 0.4
mat_met = matrix_metrix(y_test,y_pred_xgb_count,beta)
print (mat_met)

##### ngram level

In [None]:
with tf.device('/gpu:0'):
    xgb_model_ngram = xgb.fit(x_train_tf_idf_ngram, y_train, verbose=2)

In [None]:
accuracy_score(y_train,  xgb_model_ngram.predict(x_train_tf_idf_ngram))

In [None]:
y_pred_xgb_ngram = xgb_model_ngram.predict(x_test_tf_idf_ngram)
accuracy_score(y_test, y_pred_xgb_ngram)

In [None]:
beta = 0.4
mat_met = matrix_metrix(y_test,y_pred_xgb_ngram,beta)
print (mat_met)

##### word level

In [None]:
with tf.device('/gpu:0'):
    xgb_model_word = xgb.fit(x_train_tf_idf_word, y_train, verbose=2)

In [None]:
accuracy_score(y_train,  xgb_model_word.predict(x_train_tf_idf_word))

In [None]:
y_pred_xgb_word = xgb_model_word.predict(x_test_tf_idf_word)
accuracy_score(y_test, y_pred_xgb_word)

In [None]:
beta = 0.4
mat_met = matrix_metrix(y_test,y_pred_xgb_word,beta)
print (mat_met)

##### character level

In [None]:
with tf.device('/gpu:0'):
    xgb_model_chars = xgb.fit(x_train_tf_idf_chars, y_train, verbose=2)

In [None]:
accuracy_score(y_train,  xgb_model_chars.predict(x_train_tf_idf_chars))

In [None]:
y_pred_xgb_chars = xgb_model_chars.predict(x_test_tf_idf_chars)
accuracy_score(y_test, y_pred_xgb_chars)

In [None]:
beta = 0.4
mat_met = matrix_metrix(y_test,y_pred_xgb_chars,beta)
print (mat_met)

#### Model Tuning

These steps takes too much time and effort.

In [None]:
xgb_params = {"learning_rate": [0.01, 0.001],
              "n_estimators": [500,1000,2000],
              "max_depth": [5,6,7],
              "subsample": [0.6, 0.7, 0.8]}

In [None]:
with tf.device('/gpu:0'):
    xgb_cv = GridSearchCV(xgb_model_word, xgb_params, cv=10, verbose=2)
    xgb_cv.fit(x_train_tf_idf_word, y_train)

In [None]:
print("Best Score: "+ str(xgb_cv.best_score_))
print("Best Parameters: "+ str(xgb_cv.best_params_))

In [None]:
with tf.device('/gpu:0'):
    xgb = XGBClassifier(learning_rate=0.001,
                        max_depth=6,
                        min_samples_split=2,
                        n_estimators=1000,
                        subsample=0.6)

    xgb_tuned = xgb.fit(x_train_tf_idf_word, y_train)

In [None]:
y_pred = xgb_tuned.predict(x_test_tf_idf_word)
accuracy_score(y_test, y_pred)

------------

### Catboost

#### Model & Prediction

##### Count Vectors

In [8]:
cb = CatBoostClassifier(verbose=2, iterations= 500, thread_count=-1, depth=5)

In [None]:
with tf.device('/gpu:0'):
    cb_count = cb.fit(x_train_count, y_train)

In [None]:
accuracy_score(y_train, cb_count.predict(x_train_count))

In [None]:
y_pred_cb_count = cb_count.predict(x_test_count)
accuracy_score(y_test, y_pred_cb_count)

In [None]:
beta = 0.4
mat_met = matrix_metrix(y_test,y_pred_cb_count,beta)
print (mat_met)

##### ngram level

In [None]:
with tf.device('/gpu:0'):
    cb_ngram = cb.fit(x_train_tf_idf_ngram, y_train)

In [None]:
accuracy_score(y_train, cb_ngram.predict(x_train_tf_idf_ngram))

In [None]:
y_pred_cb_ngram = cb_ngram.predict(x_test_tf_idf_ngram)
accuracy_score(y_test, y_pred_cb_ngram)

In [None]:
beta = 0.4
mat_met = matrix_metrix(y_test,y_pred_cb_ngram,beta)
print (mat_met)

##### word level

In [None]:
with tf.device('/gpu:0'):
    cb_word = cb.fit(x_train_tf_idf_word, y_train)

In [None]:
accuracy_score(y_train, cb_word.predict(x_train_tf_idf_word))

In [None]:
y_pred_cb_word = cb_word.predict(x_test_tf_idf_word)
accuracy_score(y_test, y_pred_cb_word)

In [None]:
beta = 0.4
mat_met = matrix_metrix(y_test,y_pred_cb_word,beta)
print (mat_met)

##### character level

In [None]:
with tf.device('/gpu:0'):
    cb_chars = cb.fit(x_train_tf_idf_chars, y_train)

In [None]:
accuracy_score(y_train, cb_chars.predict(x_train_tf_idf_chars))

In [None]:
y_pred_cb_chars = cb_chars.predict(x_test_tf_idf_chars)
accuracy_score(y_test, y_pred_cb_chars)

In [None]:
beta = 0.4
mat_met = matrix_metrix(y_test,y_pred_cb_chars,beta)
print (mat_met)

#### Model Tuning

---------

### Export Fitted Models

#### RF

In [None]:
# save rf count vectors model
joblib.dump(rf_model_count, "models/rf_model_count.joblib")

In [None]:
joblib.dump(rf_model_ngram,"models/rf_model_ngram.joblib")

In [None]:
joblib.dump(rf_model_word,"models/rf_model_word.joblib")

In [None]:
joblib.dump(rf_model_chars,"models/rf_model_chars.joblib")

#### XGB

In [None]:
joblib.dump(xgb_model_count,"models/xgb_model_count.joblib")

In [None]:
joblib.dump(xgb_model_ngram,"models/xgb_model_ngram.joblib")

In [None]:
joblib.dump(xgb_model_word,"models/xgb_model_word.joblib")

In [None]:
joblib.dump(xgb_model_chars,"models/xgb_model_chars.joblib")

#### CB

In [None]:
joblib.dump(cb_count,"models/cb_count.joblib")

In [None]:
joblib.dump(cb_ngram,"models/cb_ngram.joblib")

In [None]:
joblib.dump(cb_word,"models/cb_word.joblib")

In [None]:
joblib.dump(cb_chars,"models/cb_chars.joblib")

In [None]:
## if you wanna compress
#joblib.dump(rf, "RF_compressed.joblib", compress=3)

### Import Models

In [None]:
## load rf count vectors model
#loaded_rf = joblib.load("./rf_model_count.joblib")

In [None]:
## another method
#import cPickle
#rf = RandomForestRegresor()
#rf.fit(X, y)

#with open('path/to/file', 'wb') as f:
#    cPickle.dump(rf, f)

## in your prediction file                                                                                                                                                                                                           
#with open('path/to/file', 'rb') as f:
#    rf = cPickle.load(f)

#preds = rf.predict(new_X)