In [1]:
# General Imports
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm,trange
import time


# Classifier Imports
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
data = pd.read_csv('output/Amazon_reviews_mid_f1.csv', dtype=str,)
data.head()

Unnamed: 0,text,star_rating,sentiment_class,category_class
0,choose right list item beautiful intricate det...,5.0,Positive,product
1,sorry bill work downtown washington november j...,1.0,Negative,product
2,great taste start say book amazing world good ...,5.0,Positive,product
3,beyond religion endorsement support another su...,5.0,Positive,product
4,unique work generation edit would point ridicu...,5.0,Positive,product


In [3]:
X = data["text"].values
category_names = ['star_rating','sentiment_class', 'category_class']
Y = data[category_names].values

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1 )
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.1)

In [5]:
print(f"X_train: {X_train.shape} | X_val: {X_val.shape} | X_test: {X_test.shape} | \n" + 
          f"y_train: {Y_train.shape} | y_val: {Y_val.shape} | y_test: {Y_test.shape} | ")

X_train: (44651,) | X_val: (4962,) | X_test: (5513,) | 
y_train: (44651, 3) | y_val: (4962, 3) | y_test: (5513, 3) | 


In [6]:
pipeline = Pipeline([('features', FeatureUnion([
        ('text_pipeline', Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer())
        ]))
    ])),
    ('clf',  MultiOutputClassifier(DecisionTreeClassifier()))])


In [7]:
    parameters ={
                  'clf__estimator__min_samples_leaf': [10, 20, 30, 40, 50],#40
                   'clf__estimator__max_depth': [50, 100, 150] #100
                } 

In [8]:
model = GridSearchCV(pipeline, param_grid=parameters)

In [9]:
model.fit(X_train, Y_train)

GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('text_pipeline',
                                                                        Pipeline(steps=[('vect',
                                                                                         CountVectorizer()),
                                                                                        ('tfidf',
                                                                                         TfidfTransformer())]))])),
                                       ('clf',
                                        MultiOutputClassifier(estimator=DecisionTreeClassifier()))]),
             param_grid={'clf__estimator__max_depth': [50, 100, 150],
                         'clf__estimator__min_samples_leaf': [10, 20, 30, 40,
                                                              50]})

In [10]:
Y_pred = model.predict(X_test)

In [11]:
print("Best Parameters:", model.best_params_)

Best Parameters: {'clf__estimator__max_depth': 100, 'clf__estimator__min_samples_leaf': 40}


In [19]:
for column in range(Y_test.shape[1]):
    print("Column name: {}".format(category_names[column]))
    print("classification_report: \n",classification_report(Y_test[:,column],Y_pred[:,column],digits=6))
#     print("confusion_matrix: \n",confusion_matrix(Y_test[:,column],Y_pred[:,column]))

Column name: star_rating
classification_report: 
               precision    recall  f1-score   support

         1.0   0.431655  0.295567  0.350877       609
         2.0   0.395105  0.422430  0.408311      1070
         3.0   0.328612  0.228346  0.269454       508
         4.0   0.451208  0.519755  0.483062      1797
         5.0   0.567037  0.567037  0.567037      1529

    accuracy                       0.462362      5513
   macro avg   0.434723  0.406627  0.415748      5513
weighted avg   0.458987  0.462362  0.457559      5513

Column name: sentiment_class
classification_report: 
               precision    recall  f1-score   support

    Negative   0.607495  0.517212  0.558730      1191
     Neutral   0.796089  0.755467  0.775247      1509
    Positive   0.820020  0.894063  0.855442      2813

    accuracy                       0.774714      5513
   macro avg   0.741201  0.722248  0.729806      5513
weighted avg   0.767557  0.774714  0.769391      5513

Column name: category_clas

In [13]:
# store the model as pickle object
pickle.dump(model, open("output/model/DTC_final.pkl", 'wb'))

In [14]:
# load the model from disk
loaded_model = pickle.load(open("output/model/DTC_final.pkl", 'rb'))

In [15]:
print("Validation of models :")

Validation of models :


In [16]:
print("text is: \n",X_val[:2] )

text is: 
 ['laminate give star road name maybe make much difference street sign'
 'road learn hop information beginner level way build different thing stage small bite eat entire elephant']


In [17]:
print("Actual result is: \n", Y_val[:2])

Actual result is: 
 [['4.0' 'Neutral' 'product']
 ['3.0' 'Neutral' 'product']]


In [18]:
res=loaded_model.predict(X_val[:2])
print("Predicted result is: \n", res[:2])

Predicted result is: 
 [['3.0' 'Neutral' 'product']
 ['3.0' 'Neutral' 'product']]
