In [1]:
# General Imports
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm,trange
import time


# Classifier Imports
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier

In [2]:
# Read the review dataset from CSV; dtype=str keeps every column
# (star_rating included) as text rather than letting pandas infer numerics.
data = pd.read_csv(
    filepath_or_buffer='output/Amazon_reviews_mid_f1.csv',
    dtype=str,
)
# Show the first rows as the cell output to sanity-check the columns.
data.head()

Unnamed: 0,text,star_rating,sentiment_class,category_class
0,choose right list item beautiful intricate det...,5.0,Positive,product
1,sorry bill work downtown washington november j...,1.0,Negative,product
2,great taste start say book amazing world good ...,5.0,Positive,product
3,beyond religion endorsement support another su...,5.0,Positive,product
4,unique work generation edit would point ridicu...,5.0,Positive,product


In [3]:
# Label columns that make up the multi-column target array Y.
category_names = ['star_rating','sentiment_class', 'category_class']

# X holds the review text; Y holds one column per label, in category_names order.
X = data.loc[:, "text"].values
Y = data.loc[:, category_names].values

In [4]:
# Fixed seed so the train/validation/test partition is identical on every
# "Restart & Run All"; without it each run shuffles the data differently.
SPLIT_SEED = 42

# First hold out 10% of all samples as the test split ...
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=SPLIT_SEED
)
# ... then hold out 10% of the remaining samples as the validation split.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.1, random_state=SPLIT_SEED
)

In [5]:
# Report the size of each split: features on the first line, targets on the second.
print(
    f"X_train: {X_train.shape} | X_val: {X_val.shape} | X_test: {X_test.shape} | \n"
    f"y_train: {Y_train.shape} | y_val: {Y_val.shape} | y_test: {Y_test.shape} | "
)

X_train: (44651,) | X_val: (4962,) | X_test: (5513,) | 
y_train: (44651, 3) | y_val: (4962, 3) | y_test: (5513, 3) | 


In [6]:
# Model pipeline:
#   'features' - a FeatureUnion with a single branch ('text_pipeline') that turns
#                text into token counts and then TF-IDF weights.
#   'clf'      - MultiOutputClassifier wrapping AdaBoostClassifier, so each target
#                column gets its own AdaBoost model.
pipeline = Pipeline(steps=[
    (
        'features',
        FeatureUnion(transformer_list=[
            (
                'text_pipeline',
                Pipeline(steps=[
                    ('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                ]),
            ),
        ]),
    ),
    (
        'clf',
        MultiOutputClassifier(estimator=AdaBoostClassifier()),
    ),
])


In [7]:
# Hyper-parameter grid for the search. The key addresses n_estimators of the
# estimator nested inside the pipeline's 'clf' step.
# Wider sweep kept for reference: [10, 20, 30, 40, 50]
parameters = {
    'clf__estimator__n_estimators': [50],
}

In [8]:
# Grid search over `parameters` on top of `pipeline`, run with 6 parallel jobs.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=6,
)

In [9]:
# Run the grid search on the training split. As the cell's last expression,
# the returned (fitted) GridSearchCV object is displayed as the output.
model.fit(X_train, Y_train)

GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('text_pipeline',
                                                                        Pipeline(steps=[('vect',
                                                                                         CountVectorizer()),
                                                                                        ('tfidf',
                                                                                         TfidfTransformer())]))])),
                                       ('clf',
                                        MultiOutputClassifier(estimator=AdaBoostClassifier()))]),
             n_jobs=6, param_grid={'clf__estimator__n_estimators': [50]})

In [10]:
# Predict every target column for the held-out test split.
Y_pred = model.predict(X_test)

In [11]:
# Show the parameter combination selected by the grid search.
best_params = model.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'clf__estimator__n_estimators': 50}


In [12]:
# Print a per-class precision/recall/F1 report separately for each target column.
n_targets = Y_test.shape[1]
for target_idx in range(n_targets):
    print("Column name: {}".format(category_names[target_idx]))
    y_true = Y_test[:, target_idx]
    y_hat = Y_pred[:, target_idx]
    report = classification_report(y_true, y_hat, digits=6)
    print("classification_report: \n", report)
    # Optional: confusion_matrix(y_true, y_hat) can be printed here as well.

Column name: star_rating
classification_report: 
               precision    recall  f1-score   support

         1.0   0.566667  0.266458  0.362473       638
         2.0   0.456579  0.327668  0.381528      1059
         3.0   0.287489  0.636542  0.396088       509
         4.0   0.445987  0.412158  0.428406      1793
         5.0   0.591971  0.652576  0.620798      1514

    accuracy                       0.465808      5513
   macro avg   0.469738  0.459080  0.437859      5513
weighted avg   0.487444  0.465808  0.461623      5513

Column name: sentiment_class
classification_report: 
               precision    recall  f1-score   support

    Negative   0.720270  0.457118  0.559286      1166
     Neutral   0.687564  0.845231  0.758288      1583
    Positive   0.878316  0.898336  0.888213      2764

    accuracy                       0.789770      5513
   macro avg   0.762050  0.733562  0.735263      5513
weighted avg   0.790117  0.789770  0.781339      5513

Column name: category_class

In [13]:
# Store the fitted grid-search object as a pickle file.
# The context manager guarantees the file handle is flushed and closed even if
# pickling fails part-way; a bare open() would leave the handle dangling.
with open("output/model/ABC_final.pkl", 'wb') as model_file:
    pickle.dump(model, model_file)

In [14]:
# Load the model back from disk, closing the file handle deterministically.
# NOTE: pickle.load can execute arbitrary code - only load files you created or trust.
with open("output/model/ABC_final.pkl", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [15]:
# Banner line introducing the spot-check of the reloaded model that follows.
print("Validation of models :")

Validation of models :


In [16]:
# Display the first two validation texts used for the spot-check.
val_texts = X_val[:2]
print("text is: \n", val_texts)

text is: 
 ['element present story somewhat weak nice segue event avenger disassemble'
 'fiction sale read like stop sale promotion logic difficult follow many point valid actual material present could cover page much repetition conclusion prove market wrong write another edition think discredit amazon associate poor quality also base hour discussion club recently']


In [17]:
# Display the true labels of the first two validation samples.
val_labels = Y_val[:2]
print("Actual result is: \n", val_labels)

Actual result is: 
 [['4.0' 'Neutral' 'product']
 ['1.0' 'Negative' 'product']]


In [18]:
# Predict the first two validation samples with the model reloaded from disk
# and print the predictions.
val_sample = X_val[:2]
res = loaded_model.predict(val_sample)
print("Predicted result is: \n", res[:2])

Predicted result is: 
 [['3.0' 'Neutral' 'product']
 ['2.0' 'Negative' 'product']]
