In [1]:
# General Imports
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm,trange
import time


# Classifier Imports
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier 

In [2]:
data = pd.read_csv('output/Amazon_reviews_mid_f1.csv', dtype=str,)
data.head()

Unnamed: 0,text,star_rating,sentiment_class,category_class
0,choose right list item beautiful intricate det...,5.0,Positive,product
1,sorry bill work downtown washington november j...,1.0,Negative,product
2,great taste start say book amazing world good ...,5.0,Positive,product
3,beyond religion endorsement support another su...,5.0,Positive,product
4,unique work generation edit would point ridicu...,5.0,Positive,product


In [3]:
X = data["text"].values
category_names = ['star_rating','sentiment_class', 'category_class']
Y = data[category_names].values

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1 )
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.1)

In [5]:
print(f"X_train: {X_train.shape} | X_val: {X_val.shape} | X_test: {X_test.shape} | \n" + 
          f"y_train: {Y_train.shape} | y_val: {Y_val.shape} | y_test: {Y_test.shape} | ")

X_train: (44651,) | X_val: (4962,) | X_test: (5513,) | 
y_train: (44651, 3) | y_val: (4962, 3) | y_test: (5513, 3) | 


In [12]:
pipeline = Pipeline([('features', FeatureUnion([
        ('text_pipeline', Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer())
        ]))
    ])),
    ('clf', MultiOutputClassifier(estimator = RandomForestClassifier()))
    ])


In [13]:
parameters = {'clf__estimator__n_estimators': [10, 20, 30, 40, 50]}

In [14]:
model = GridSearchCV(pipeline, param_grid=parameters, n_jobs=4)

In [15]:
model.fit(X_train, Y_train)

GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('text_pipeline',
                                                                        Pipeline(steps=[('vect',
                                                                                         CountVectorizer()),
                                                                                        ('tfidf',
                                                                                         TfidfTransformer())]))])),
                                       ('clf',
                                        MultiOutputClassifier(estimator=RandomForestClassifier()))]),
             n_jobs=4,
             param_grid={'clf__estimator__n_estimators': [10, 20, 30, 40, 50]})

In [16]:
Y_pred = model.predict(X_test)

In [17]:
print("Best Parameters:", model.best_params_)

Best Parameters: {'clf__estimator__n_estimators': 50}


In [25]:
for column in range(Y_test.shape[1]):
    print("Column name: {}".format(category_names[column]))
    print("classification_report: \n",classification_report(Y_test[:,column],Y_pred[:,column],digits=6))
#     print("confusion_matrix: \n",confusion_matrix(Y_test[:,column],Y_pred[:,column]))

Column name: star_rating
classification_report: 
               precision    recall  f1-score   support

         1.0   0.658683  0.176565  0.278481       623
         2.0   0.434160  0.441319  0.437710      1031
         3.0   0.430769  0.101449  0.164223       552
         4.0   0.443775  0.614230  0.515272      1799
         5.0   0.577473  0.642573  0.608286      1508

    accuracy                       0.488845      5513
   macro avg   0.508972  0.395227  0.400794      5513
weighted avg   0.501532  0.488845  0.464301      5513

Column name: sentiment_class
classification_report: 
               precision    recall  f1-score   support

    Negative   0.865882  0.313725  0.460576      1173
     Neutral   0.789346  0.838046  0.812968      1556
    Positive   0.786088  0.970187  0.868489      2784

    accuracy                       0.793216      5513
   macro avg   0.813772  0.707320  0.714011      5513
weighted avg   0.803986  0.793216  0.766027      5513

Column name: category_clas

In [19]:
# store the model as pickle object
pickle.dump(model, open("output/model/RFC_final.pkl", 'wb'))

In [20]:
# load the model from disk
loaded_model = pickle.load(open("output/model/RFC_final.pkl", 'rb'))

In [21]:
print("Validation of models :")

Validation of models :


In [22]:
print("text is: \n",X_val[:2] )

text is: 
 ['awesome complex crime thriller book dense great complex character awesome plot twist keep late night turn page absolute pleasure read kenyon tough small town sheriff detective single mother girl call upon investigate entire family nearby farmhouse discovers adopt miss becomes prime suspect handle go farmhouse subsequently destroy tornado daughter jenna miss suspect investigation desperate search daughter intensifies uncovers twist lie past murder center ruthless serial killer novel well write deftly handle plot twist plenty master flesh various character psychology motivation write woman incredibly well embitter haunt compassionate likeable kenyon serial killer groupie bonnie tina show later book flashback incredible everything come together puts brilliant clever twist suicide story thing seem long time crime thriller reader find novel incredibly satisfy cannot wait read rest fiction'
 'close expect finger chart year regular write need know read disappointed']


In [23]:
print("Actual result is: \n", Y_val[:2])

Actual result is: 
 [['5.0' 'Positive' 'product']
 ['2.0' 'Negative' 'product']]


In [24]:
res=loaded_model.predict(X_val[:2])
print("Predicted result is: \n", res[:2])

Predicted result is: 
 [['5.0' 'Positive' 'product']
 ['4.0' 'Neutral' 'product']]
