In [1]:
# General Imports
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm,trange
import time


# Classifier Imports
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC

In [2]:
# Load the preprocessed Amazon reviews. dtype=str reads every column as text,
# so star_rating values such as '5.0' are treated as class labels, not numbers.
# index_col=0: the first CSV column looks like a saved row index (0, 1, 2, ...)
# that would otherwise show up as a junk "Unnamed: 0" column.
data = pd.read_csv('output/Amazon_reviews_mid_f1.csv', dtype=str, index_col=0)
data.head()

Unnamed: 0,text,star_rating,sentiment_class,category_class
0,choose right list item beautiful intricate det...,5.0,Positive,product
1,sorry bill work downtown washington november j...,1.0,Negative,product
2,great taste start say book amazing world good ...,5.0,Positive,product
3,beyond religion endorsement support another su...,5.0,Positive,product
4,unique work generation edit would point ridicu...,5.0,Positive,product


In [3]:
# Targets: the three label columns predicted jointly by the multi-output model.
category_names = ['star_rating','sentiment_class', 'category_class']

# Features are the review text; labels are one row of three classes per review.
X = data.loc[:, "text"].values
Y = data.loc[:, category_names].values

In [4]:
# Hold out 10% of the data for testing, then 10% of the remainder for validation.
# A fixed random_state makes both splits reproducible, so the same rows stay
# held out when the kernel is restarted and a previously saved model is reloaded.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)

In [5]:
# Report the size of each split: features on the first line, labels on the second.
print(
    f"X_train: {X_train.shape} | X_val: {X_val.shape} | X_test: {X_test.shape} | ",
    f"y_train: {Y_train.shape} | y_val: {Y_val.shape} | y_test: {Y_test.shape} | ",
    sep="\n",
)

X_train: (44651,) | X_val: (4962,) | X_test: (5513,) | 
y_train: (44651, 3) | y_val: (4962, 3) | y_test: (5513, 3) | 


In [8]:
# Text-classification pipeline:
#   features -> bag-of-words counts re-weighted by TF-IDF
#   clf      -> one SVC per label column, wrapped in MultiOutputClassifier
model = Pipeline(steps=[
    (
        'features',
        FeatureUnion(transformer_list=[
            (
                'text_pipeline',
                Pipeline(steps=[
                    ('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                ]),
            ),
        ]),
    ),
    ('clf', MultiOutputClassifier(estimator=SVC(gamma="scale"))),
])

In [9]:
# Fit the whole pipeline (vectorizer, TF-IDF and the multi-output SVC) on the training split.
model.fit(X_train, Y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('text_pipeline',
                                                 Pipeline(steps=[('vect',
                                                                  CountVectorizer()),
                                                                 ('tfidf',
                                                                  TfidfTransformer())]))])),
                ('clf', MultiOutputClassifier(estimator=SVC()))])

In [10]:
# Predict every label column for the held-out test split.
Y_pred = model.predict(X_test)

In [18]:
# Evaluate each target separately: one classification report per label column.
for column in range(Y_test.shape[1]):
    y_true = Y_test[:, column]
    y_hat = Y_pred[:, column]
    print(f"Column name: {category_names[column]}")
    report = classification_report(y_true, y_hat, digits=6)
    print("classification_report: \n", report)
    # Optional: confusion_matrix(y_true, y_hat) can be printed here as well.

Column name: star_rating
classification_report: 
               precision    recall  f1-score   support

         1.0   0.655087  0.463158  0.542652       570
         2.0   0.541298  0.643295  0.587905      1141
         3.0   0.427350  0.103306  0.166389       484
         4.0   0.584836  0.685637  0.631238      1845
         5.0   0.706242  0.706721  0.706481      1473

    accuracy                       0.608380      5513
   macro avg   0.582963  0.520423  0.526933      5513
weighted avg   0.601700  0.608380  0.592404      5513

Column name: sentiment_class
classification_report: 
               precision    recall  f1-score   support

    Negative   0.807163  0.745547  0.775132      1179
     Neutral   0.805521  0.826255  0.815756      1554
    Positive   0.940283  0.957194  0.948663      2780

    accuracy                       0.875023      5513
   macro avg   0.850989  0.842999  0.846517      5513
weighted avg   0.873827  0.875023  0.874088      5513

Column name: category_clas

In [12]:
# store the model as pickle object
# (the context manager guarantees the file is flushed and closed, even on error)
with open("output/model/SVC_final.pkl", 'wb') as model_file:
    pickle.dump(model, model_file)

In [6]:
# load the model from disk
# NOTE: pickle.load can execute arbitrary code; only load files you created or trust.
# The context manager closes the file handle once the model is deserialized.
with open("output/model/SVC_final.pkl", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [7]:
# Section banner: the following cells spot-check the reloaded model on individual samples.
print("Validation of models :")

Validation of models :


In [13]:
# Show the second validation sample (index 1) that is used for the spot-check.
print("text is: \n",X_val[1] )

text is: 
 unfortunately content rather bland lack substantive research recommendation every chapter sound


In [14]:
# Ground-truth labels for the same validation sample (star_rating, sentiment_class, category_class).
print("Actual result is: \n", Y_val[1])

Actual result is: 
 ['2.0' 'Negative' 'product']


In [15]:
# Predict the first two validation samples with the reloaded model and
# print the prediction for the second one (index 1).
res=loaded_model.predict(X_val[:2])
print("Predicted result is: \n", res[1])

Predicted result is: 
 ['2.0' 'Negative' 'product']


In [25]:
# Ad-hoc prediction on a hand-written review; predict() expects a list of documents.
# NOTE(review): the training text shown by data.head() looks lower-cased with stop
# words/punctuation removed; this raw sentence is not passed through that
# preprocessing -- confirm whether it should be.
res=loaded_model.predict(["This product is great, delivery got delayed, packaging was average"])

In [26]:
# Print the predicted labels for the first (only) document in `res`.
print("Predicted result is: \n", res[0])

Predicted result is: 
 ['4.0' 'Positive' 'delivery']
