In [1]:
#import required packages
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

from sklearn import linear_model
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split , StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, accuracy_score

import nltk
from nltk.corpus import stopwords
#from textblob import TextBlob
#from textblob import Word
#from bs4 import BeautifulSoup
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.externals import joblib
#from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier



In [2]:
stops={}

In [3]:
#to preprocess the text

def text_preprocessing(text, language, minWordSize):
    
    # remove html
    #text_html = BeautifulSoup(text,"html.parser" ).get_text()
    
    # remove non-letters
    text_non_letters = re.sub("[^a-zA-Z']", " ", text) 
        
    # convert to lower-case
    text_lower = text_non_letters.lower()
    
    # remove stop words
    text_no_stop_words = ' '
    for w in text_lower.split():
        if w not in stops:  
            text_no_stop_words = text_no_stop_words + w + ' '  
    
    # do stemming
    text_stemmer = ' '
    stemmer = SnowballStemmer(language)
    for w in text_no_stop_words.split():
        text_stemmer = text_stemmer + stemmer.stem(w) + ' '
         
    # remove short words
    text_no_short_words = ' '
    for w in text_stemmer.split(): 
        if len(w) >=minWordSize:
            text_no_short_words = text_no_short_words + w + ' '
    return text_no_short_words

In [4]:
#load and prepare old data
df_reviews = pd.read_csv('Input_data/Old_data_reviews.csv',sep=',',quotechar='"' ) #contains the reviews from Facebook and Tripadvisor
df_cat_labels = pd.read_csv('Input_data/Old_data_categorisation_labels.csv',sep=',',quotechar='"') # contains the classlabels of the reviews
df_merged_data = pd.merge(df_cat_labels,df_reviews,how='left',left_on='review_id',right_on='id')

# Drop unnecessary columns
df_merged_data = df_merged_data.drop(['labeler_id', 'post_type', 'datetime_posted','likes', 'traveler_type',
                    'rating_food','rating_service', 'rating_environment', 'rating_value',
                     'reviewer_id','source_subject_id','id_x','review_id','id_y','source'],axis=1)

#select english reviews
olddata =  df_merged_data.loc[df_merged_data['language'] == 'en']
reviews_old = olddata['text'].values
sentiment_old=olddata['rating'].values

In [5]:
#load and prepare new data
newdata=pd.read_csv('Input_data/New_data.csv')
reviews_new= newdata['Text'].values
sentiment_new = newdata['Sentiment'].values

#Split category column 
newdata['category']=newdata['Categories'].apply(lambda x:x.split(';'))

def category_includer(data,string):    
    for i in range(len(data)):
        data.loc[i,string]=0
        column=list(data.loc[i,'category'])
        if string in column:
            data.loc[i,string]=1
    return data

newdata=category_includer(newdata,'experience')
newdata=category_includer(newdata,'service')
newdata=category_includer(newdata,'consistency')
newdata=category_includer(newdata,'value')
newdata=category_includer(newdata,'food')
newdata=category_includer(newdata,'convenience')

#dropping the two columns and filter only English
newdata=newdata.drop(['Categories','category'],axis=1)
newdata=newdata[newdata['Language']=="eng"]

In [6]:
#define the models, parameters for grid search and their names.
"""
#List of all models in reasonably large range
models = [ LogisticRegression(class_weight='balanced'),
     SVC(class_weight='balanced'),
     RandomForestClassifier(class_weight='balanced',random_state=0),
     GradientBoostingClassifier(),
     MultinomialNB()
         ]
params = [ 
     { 'C': [0.01,0.1, 1, 10, 100, 1000, 10000, 100000, 1000000]},
     [
        {'kernel': ['linear'], 'C': np.linspace(0.1,10,5)},
        {'kernel': ['rbf'], 'C': np.linspace(0.1,10,5), 'gamma': np.linspace(0.001,1,5)},
    ],
    {'max_depth':list(range(5,15,5)),'n_estimators':[100,200]},
    {'n_estimators':[100,200],'learning_rate':[0.01,0.02],'max_depth':list(range(1,15,3))},
    {}
    
]

#1 example model each for testing
"""
models = [ LogisticRegression(class_weight='balanced'),
     SVC(class_weight='balanced'),
     RandomForestClassifier(class_weight='balanced',random_state=0),
     GradientBoostingClassifier(),
     MultinomialNB()
         ]
params = [ 
     { 'C': [0.01]},
     [
        #{'kernel': ['linear'], 'C': np.linspace(0.1,10,5)},
        {'kernel': ['rbf'], 'C': [2.3], 'gamma': [1]},
    ],
    {'max_depth':[5],'n_estimators':[100]},
    {'n_estimators':[100],'learning_rate':[0.01],'max_depth':[7]},
    {}
    
]

names=['LogisticRegression','SVM','RandomForest','GradientBoostingClassifier','MultinomialNB']
#the function iterates for all the models and selects the model with the highest score for the test set. The model is fitted on the training set.



def select_best_model(X_train,y_train,X_test,y_test):
    param_matrix={}
    score=0
    for model,param,name in zip(models,params,names):
        gs_model=GridSearchCV(model,param,refit=True,n_jobs=-1,verbose=25)
        print(name)
        gs_model.fit(X_train,y_train)
        predictions=gs_model.predict(X_test)
        param_matrix[name]=accuracy_score(y_test,predictions)    
        if accuracy_score(y_test,predictions)>score:
            score=accuracy_score(y_test,predictions)
            best_model=gs_model
    return best_model
        

In [7]:
#training on old and testing on the old data

#test train split
text_train,text_test,sentiment_train,sentiment_test= train_test_split(reviews_old,sentiment_old,random_state=0,test_size=0.2,stratify=sentiment_old)

#bag of words
language = 'english'
minWordLength = 2 # shorter words will be removed

for i in range(len(text_train)):
    text_train[i] = text_preprocessing(text_train[i], language, minWordLength)
for i in range(len(text_test)):
    text_test[i] = text_preprocessing(text_test[i], language, minWordLength)
        
count_vect = CountVectorizer(ngram_range=(1,3),max_features=20000)
tfidf_transformer = TfidfTransformer(use_idf=True)
    
count_vect.fit(text_train)
text_train_bow = count_vect.transform(text_train)
text_test_bow = count_vect.transform(text_test)
    
tfidf_transformer.fit_transform(text_train_bow)
reviews_bow_train = tfidf_transformer.transform(text_train_bow)
reviews_bow_test = tfidf_transformer.transform(text_test_bow)
joblib.dump(count_vect,'Exported_models/count_vectorizer_oldtrain_oldtest_sentiment.pkl')
joblib.dump(tfidf_transformer,'Exported_models/tfidftransformer_oldtrain_oldtest_sentiment.pkl')
    
#training the models
print('training to find sentiment'+'\n')
best_model_reviews = select_best_model(reviews_bow_train,sentiment_train,reviews_bow_test,sentiment_test)

#testing the models
print('Testing on sentiment prediction'+ '\n\n'+ 'best parameters:')
print(best_model_reviews.best_estimator_)
print('\n')
print('Classification report:')
y_pred_reviews = best_model_reviews.predict(reviews_bow_test)

print(classification_report(sentiment_test, y_pred_reviews))

cf_sentiment = confusion_matrix(sentiment_test, y_pred_reviews)
print(cf_sentiment)
print(accuracy_score(sentiment_test, y_pred_reviews) * 100)

#joblib.dump(best_model_reviews,'Exported_models/train_old_test_old_sentiment.pkl')
#joblib.dump(best_model_reviews.best_estimator_, 'Exported_models/train_new_test_new_sentiment_bestmodel.pkl')


training to find sentiment

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.7s finished


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   51.4s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   52.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   52.4s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.8s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  4.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  4.1min finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Testing on sentiment prediction

best parameters:
SVC(C=2.3, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


Classification report:


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0895s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


              precision    recall  f1-score   support

         1.0       0.93      0.93      0.93        30
         2.0       1.00      0.85      0.92        47
         3.0       1.00      0.73      0.84       129
         4.0       0.89      0.85      0.87       429
         5.0       0.94      0.98      0.96      1392

    accuracy                           0.93      2027
   macro avg       0.95      0.87      0.91      2027
weighted avg       0.94      0.93      0.93      2027

[[  28    0    0    2    0]
 [   0   40    0    3    4]
 [   2    0   94   16   17]
 [   0    0    0  364   65]
 [   0    0    0   23 1369]]
93.48791317217562


In [8]:
# training and testing on new data

#reviews and sentiment as values
reviews_new= newdata['Text'].values
sentiment_new = newdata['Sentiment'].values

#test train split
reviews_train, reviews_test, sentiment_train, sentiment_test = train_test_split(reviews_new, sentiment_new, test_size=0.2,random_state=0,stratify=sentiment_new)

#bag of words
language = 'english'
minWordLength = 2 # shorter words will be removed

for i in range(len(reviews_train)):
    reviews_train[i] = text_preprocessing(reviews_train[i], language, minWordLength)
for i in range(len(reviews_test)):
    reviews_test[i] = text_preprocessing(reviews_test[i], language, minWordLength)
        
count_vect = CountVectorizer(ngram_range=(1,3),max_features=20000)
tfidf_transformer = TfidfTransformer(use_idf=True)
    
count_vect.fit(reviews_train)
text_train_bow = count_vect.transform(reviews_train)
text_test_bow = count_vect.transform(reviews_test)
    
tfidf_transformer.fit_transform(text_train_bow)
reviews_bow_train = tfidf_transformer.transform(text_train_bow)
reviews_bow_test = tfidf_transformer.transform(text_test_bow)
joblib.dump(count_vect,'Exported_models/count_vectorizer_newtrain_newtest_sentiment.pkl')
joblib.dump(tfidf_transformer,'Exported_models/tfidftransformer_newtrain_newtest_sentiment.pkl')

#training the model
print('training to find sentiment'+'\n')
best_model_reviews = select_best_model(reviews_bow_train,sentiment_train,reviews_bow_test,sentiment_test)

#testing the models
print('Testing on sentiment prediction'+ '\n\n'+ 'best parameters:')
print(best_model_reviews.best_estimator_)
print('\n')
print('Classification report:')
y_pred_reviews = best_model_reviews.predict(reviews_bow_test)

print(classification_report(sentiment_test, y_pred_reviews))

cf_sentiment = confusion_matrix(sentiment_test, y_pred_reviews)
print(cf_sentiment)
print(accuracy_score(sentiment_test, y_pred_reviews) * 100)

#Exporting the models
joblib.dump(best_model_reviews.best_estimator_, 'Exported_models/train_new_test_new_sentiment_bestmodel.pkl')
joblib.dump(best_model_reviews, 'Exported_models/train_new_test_new_sentiment.pkl')

training to find sentiment

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0909s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    6.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    6.7s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.6s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   59.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   59.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   59.2s finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Testing on sentiment prediction

best parameters:
SVC(C=2.3, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


Classification report:


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0407s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished


              precision    recall  f1-score   support

          -1       0.82      0.79      0.81       177
           0       0.88      0.23      0.37        60
           1       0.92      0.98      0.95       749

    accuracy                           0.90       986
   macro avg       0.87      0.67      0.71       986
weighted avg       0.90      0.90      0.89       986

[[140   1  36]
 [ 16  14  30]
 [ 14   1 734]]
90.06085192697769


['Exported_models/train_new_test_new_sentiment.pkl']

In [9]:

# 1,2,3,4,5 as -1,0,1 old data rating from 1,2,3,4,5 to -1,0,1

for i in range(len(olddata)):
    if olddata.loc[i,'rating'] in [1,2]:
        olddata.loc[i,'rating']=-1
    elif olddata.loc[i,'rating']==3:
        olddata.loc[i,'rating']=0
    elif olddata.loc[i,'rating'] in [4,5]:
        olddata.loc[i,'rating']=1

In [10]:
#training on old and testing on new data

#reviews as arrays
text_train = olddata['text'].values
text_test = newdata['Text'].values

#sentiment as arrays
sentiment_train = olddata['rating'].values
sentiment_test = newdata['Sentiment'].values

#bag of words
language = 'english'
minWordLength = 2 # shorter words will be removed

for i in range(len(text_train)):
    text_train[i] = text_preprocessing(text_train[i], language, minWordLength)
for i in range(len(text_test)):
    text_test[i] = text_preprocessing(text_test[i], language, minWordLength)
        
count_vect = CountVectorizer(ngram_range=(1,3),max_features=20000)
tfidf_transformer = TfidfTransformer(use_idf=True)
    
count_vect.fit(text_train)
text_train_bow = count_vect.transform(text_train)
text_test_bow = count_vect.transform(text_test)
    
tfidf_transformer.fit_transform(text_train_bow)
reviews_bow_train = tfidf_transformer.transform(text_train_bow)
reviews_bow_test = tfidf_transformer.transform(text_test_bow)
#joblib.dump(count_vect,'Exported_models/count_vectorizer_oldtrain_newtest_sentiment.pkl')
#joblib.dump(tfidf_transformer,'Exported_models/tfidftransformer_oldtrain_newtest_sentiment.pkl')

#training the model
print('training to find sentiment'+'\n')
best_model_reviews = select_best_model(reviews_bow_train,sentiment_train,reviews_bow_test,sentiment_test)

#testing the model
print('Testing on sentiment prediction'+ '\n\n'+ 'best parameters:')
print(best_model_reviews.best_estimator_)
print('\n')
print('Classification report:')
y_pred_reviews = best_model_reviews.predict(reviews_bow_test)

print(classification_report(sentiment_test, y_pred_reviews))

cf_sentiment = confusion_matrix(sentiment_test, y_pred_reviews)
print(cf_sentiment)
print(accuracy_score(sentiment_test, y_pred_reviews) * 100)

#Exporting the model
#joblib.dump(best_model_reviews,'Exported_models/train_old_test_new_sentiment.pkl')
#joblib.dump(best_model_reviews.best_estimator_, 'Exported_models/train_old_test_new_sentiment_bestmodel.pkl')


training to find sentiment

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.3s finished


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   39.7s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   40.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   40.3s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.7s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.9min finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Testing on sentiment prediction

best parameters:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=0, verbose=0, warm_start=False)


Classification report:


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0872s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


              precision    recall  f1-score   support

          -1       0.85      0.34      0.48       883
           0       0.09      0.11      0.10       302
           1       0.84      0.94      0.89      3745

    accuracy                           0.78      4930
   macro avg       0.59      0.46      0.49      4930
weighted avg       0.80      0.78      0.77      4930

[[ 299  154  430]
 [  18   34  250]
 [  34  180 3531]]
78.37728194726166


In [11]:
#training on old+67% new and testing on 33% new

#selecting required columns from new and old data
newdata1= newdata[['Text','Sentiment']].copy()
olddata1=olddata[['text','rating']].copy()

#naming old data columns
olddata1['Sentiment']=olddata1['rating']
olddata1['Text']=olddata1['text']
#dropping the old columns
olddata1 = olddata1.drop(['rating','text'],axis=1)

#splitting the test data into 67% for training and rest for testing
train1,train2,test1,test2= train_test_split(newdata,newdata['Sentiment'],random_state=0,test_size=0.33,stratify=newdata['Sentiment'])

#joining the old and 67%new data. Note train1 and train2 are just names
joineddata=pd.concat([olddata1,train1],axis=0)

#creating arrays of train and test
text_train= joineddata['Text'].values
text_test=train2['Text'].values
sentiment_train = joineddata['Sentiment'].values
sentiment_test= train2['Sentiment'].values

#defining bag of words
language = 'english'
minWordLength = 2 # shorter words will be removed

for i in range(len(text_train)):
    text_train[i] = text_preprocessing(text_train[i], language, minWordLength)
for i in range(len(text_test)):
    text_test[i] = text_preprocessing(text_test[i], language, minWordLength)
        
count_vect = CountVectorizer(ngram_range=(1,3),max_features=20000)
tfidf_transformer = TfidfTransformer(use_idf=True)
    
count_vect.fit(text_train)
text_train_bow = count_vect.transform(text_train)
text_test_bow = count_vect.transform(text_test)
    
tfidf_transformer.fit_transform(text_train_bow)
reviews_bow_train = tfidf_transformer.transform(text_train_bow)
reviews_bow_test = tfidf_transformer.transform(text_test_bow)
joblib.dump(count_vect,'Exported_models/count_vectorizer_oldnewtrain_newtest_sentiment.pkl')
joblib.dump(tfidf_transformer,'Exported_models/tfidftransformer_oldnewtrain_newtest_sentiment.pkl')

#training the model
print('training to find sentiment'+'\n')
best_model_reviews = select_best_model(reviews_bow_train,sentiment_train,reviews_bow_test,sentiment_test)

#testing the model
print('Testing on sentiment prediction'+ '\n\n'+ 'best parameters:')
print(best_model_reviews.best_estimator_)
print('\n')
print('Classification report:')
y_pred_reviews = best_model_reviews.predict(reviews_bow_test)

print(classification_report(sentiment_test, y_pred_reviews))

cf_sentiment = confusion_matrix(sentiment_test, y_pred_reviews)
print(cf_sentiment)
print(accuracy_score(sentiment_test, y_pred_reviews) * 100)

#Exporting the model
#joblib.dump(best_model_reviews,'Exported_models/train_oldnew_test_new_sentiment.pkl')
#joblib.dump(best_model_reviews.best_estimator_, 'Exported_models/train_oldnew_test_new_sentiment_bestmodel.pkl')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




training to find sentiment

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.5s finished


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.1min finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.9s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  3.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  3.7min finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Testing on sentiment prediction

best parameters:
SVC(C=2.3, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


Classification report:


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0887s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


              precision    recall  f1-score   support

          -1       0.87      0.61      0.72       291
           0       0.79      0.30      0.43       100
           1       0.88      0.98      0.93      1236

    accuracy                           0.88      1627
   macro avg       0.85      0.63      0.69      1627
weighted avg       0.87      0.88      0.86      1627

[[ 177    2  112]
 [  13   30   57]
 [  13    6 1217]]
87.52304855562384


In [12]:
#training and testing on the old data




# Split in training set and test set

random_state = 0
test_size = 0.2 # test set is 20% of the total data set
text_train,text_test,y_train,y_test=train_test_split(olddata['text'].values,olddata[['sentiment_food','sentiment_environment','sentiment_value','sentiment_service','sentiment_consistency']],random_state=0,test_size=0.2)


# Make bag of words feature vectors
language = 'english'
minWordLength = 2 # shorter words will be removed

for i in range(len(text_train)):
    text_train[i] = text_preprocessing(text_train[i], language, minWordLength)
for i in range(len(text_test)):
    text_test[i] = text_preprocessing(text_test[i], language, minWordLength)
        
count_vect = CountVectorizer(ngram_range=(1,3),max_features=20000)
tfidf_transformer = TfidfTransformer(use_idf=True)
    
count_vect.fit(text_train)
text_train_bow = count_vect.transform(text_train)
text_test_bow = count_vect.transform(text_test)
    
tfidf_transformer.fit_transform(text_train_bow)
reviews_bow_train = tfidf_transformer.transform(text_train_bow)
reviews_bow_test = tfidf_transformer.transform(text_test_bow)
joblib.dump(count_vect,'Exported_models/count_vectorizer_oldtrain_oldtest_classification.pkl')
joblib.dump(tfidf_transformer,'Exported_models/tfidftransformer_oldtrain_oldtest_classification.pkl')


#training for the various choice drivers
print('Training model on choice driver food'+'\n')
best_model_food=select_best_model(reviews_bow_train, y_train['sentiment_food'],reviews_bow_test,y_test['sentiment_food'])
print('Training model on choice driver environment' +'\n')
best_model_environment=select_best_model(reviews_bow_train, y_train['sentiment_environment'],reviews_bow_test,y_test['sentiment_environment'])
print('Training model on choice driver value' +'\n')
best_model_value = select_best_model(reviews_bow_train, y_train['sentiment_value'],reviews_bow_test,y_test['sentiment_value'])
print('Training model on choice driver service' +'\n')
best_model_service = select_best_model(reviews_bow_train, y_train['sentiment_service'],reviews_bow_test,y_test['sentiment_service'])
print('Training model on choice driver consistency' +'\n')
best_model_consistency =select_best_model(reviews_bow_train, y_train['sentiment_consistency'],reviews_bow_test,y_test['sentiment_consistency'])

# Testing on test set
print('Testing on choice driver food'+ '\n\n' + 'best parameters:')
print( best_model_food.best_estimator_ )
print('\n')
y_test_food = y_test['sentiment_food'].values
print('Classification report:')
y_pred_food = best_model_food.predict(reviews_bow_test)
print(classification_report(y_test_food, y_pred_food))
cf_food = confusion_matrix(y_test_food, y_pred_food)
print(cf_food)
print(accuracy_score(y_test_food, y_pred_food) * 100)

print('---------------------------------------------')
y_test_environment = y_test['sentiment_environment'].values
print('Testing on choice driver environment'+ '\n\n'+ 'best parameters:')
print(best_model_environment.best_estimator_)
print('\n')
print('Classification report:')
y_pred_environment = best_model_environment.predict(reviews_bow_test)
print(classification_report(y_test_environment, y_pred_environment))
cf_environment = confusion_matrix(y_test_environment, y_pred_environment)
print(cf_environment)
print(accuracy_score(y_test_environment, y_pred_environment) * 100)

print('---------------------------------------------')

print('Testing on choice driver value'+ '\n\n'+ 'best parameters:')
y_test_value = y_test['sentiment_value'].values
print(best_model_value.best_estimator_)
print('\n')
print('Classification report:')
y_pred_value = best_model_value.predict(reviews_bow_test)
print(classification_report(y_test_value, y_pred_value))
cf_value = confusion_matrix(y_test_value, y_pred_value)
print(cf_value)
print(accuracy_score(y_test_value, y_pred_value) * 100)

print('---------------------------------------------')

print('Testing on choice driver service'+ '\n\n'+ 'best parameters:')
y_test_service = y_test['sentiment_service'].values
print(best_model_service.best_estimator_)
print('\n')
print('Classification report:')
y_pred_service = best_model_service.predict(reviews_bow_test)
print(classification_report(y_test_service, y_pred_service))
cf_service = confusion_matrix(y_test_service, y_pred_service)
print(cf_service)
print(accuracy_score(y_test_service, y_pred_service) * 100)

print('---------------------------------------------')

print('Testing on choice driver consistency'+ '\n\n'+ 'best parameters:')
y_test_consistency = y_test['sentiment_consistency'].values
print(best_model_consistency.best_estimator_)
print('\n')
print('Classification report:')
y_pred_consistency = best_model_consistency.predict(reviews_bow_test)
print(classification_report(y_test_consistency, y_pred_consistency))
cf_consistency = confusion_matrix(y_test_consistency, y_pred_consistency)
print(cf_consistency)
print(accuracy_score(y_test_consistency, y_pred_consistency) * 100)

print('---------------------------------------------')
"""
joblib.dump(best_model_food.best_estimator_, 'Exported_models/train_old_test_old_food_bestmodel.pkl')
joblib.dump(best_model_food, 'Exported_models/train_old_test_old_food.pkl')
joblib.dump(best_model_consistency.best_estimator_, 'Exported_models/train_old_test_old_consistency_bestmodel.pkl')
joblib.dump(best_model_consistency, 'Exported_models/train_new_test_old_consistency.pkl')
joblib.dump(best_model_convenience.best_estimator_, 'Exported_models/train_old_test_old_convenience_bestmodel.pkl')
joblib.dump(best_model_convenience, 'Exported_models/train_old_test_old_convenience.pkl')
joblib.dump(best_model_environment.best_estimator_, 'Exported_models/train_old_test_old_environment_bestmodel.pkl')
joblib.dump(best_model_environment, 'Exported_models/train_old_test_old_environment.pkl')
joblib.dump(best_model_service.best_estimator_, 'Exported_models/train_old_test_old_service_bestmodel.pkl')
joblib.dump(best_model_service, 'Exported_models/train_old_test_old_service.pkl')
joblib.dump(best_model_value.best_estimator_, 'Exported_models/train_old_test_old_value_bestmodel.pkl')
joblib.dump(best_model_value, 'Exported_models/train_old_test_old_value.pkl')
"""

Training model on choice driver food

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   19.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   19.1s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.8s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   48.8s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   49.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   49.9s finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver environment

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0433s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0970s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   34.9s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   35.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   35.1s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.0s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.1min finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver value

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0429s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0880s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   40.9s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   41.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   41.3s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.1s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.1min finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver service

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0373s.) Setting batch_size=10.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0906s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   28.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   28.7s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.7s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   47.4s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   49.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   49.1s finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver consistency

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0484s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0761s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   32.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   32.7s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.0s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.1min finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Testing on choice driver food

best parameters:
SVC(C=2.3, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


Classification report:


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0346s.) Setting batch_size=10.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished


              precision    recall  f1-score   support

           0       0.63      0.46      0.53        59
           1       0.98      0.99      0.99      1968

    accuracy                           0.98      2027
   macro avg       0.81      0.72      0.76      2027
weighted avg       0.97      0.98      0.97      2027

[[  27   32]
 [  16 1952]]
97.63196842624569
---------------------------------------------
Testing on choice driver environment

best parameters:
SVC(C=2.3, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


Classification report:
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       867
           1       0.92      0.90      0.91      1160

    accuracy                           0.89      2027
   macro avg       0.89      0.89      0.89      202

"\njoblib.dump(best_model_food.best_estimator_, 'Exported_models/train_old_test_old_food_bestmodel.pkl')\njoblib.dump(best_model_food, 'Exported_models/train_old_test_old_food.pkl')\njoblib.dump(best_model_consistency.best_estimator_, 'Exported_models/train_old_test_old_consistency_bestmodel.pkl')\njoblib.dump(best_model_consistency, 'Exported_models/train_new_test_old_consistency.pkl')\njoblib.dump(best_model_convenience.best_estimator_, 'Exported_models/train_old_test_old_convenience_bestmodel.pkl')\njoblib.dump(best_model_convenience, 'Exported_models/train_old_test_old_convenience.pkl')\njoblib.dump(best_model_environment.best_estimator_, 'Exported_models/train_old_test_old_environment_bestmodel.pkl')\njoblib.dump(best_model_environment, 'Exported_models/train_old_test_old_environment.pkl')\njoblib.dump(best_model_service.best_estimator_, 'Exported_models/train_old_test_old_service_bestmodel.pkl')\njoblib.dump(best_model_service, 'Exported_models/train_old_test_old_service.pkl')\nj

In [13]:
#training and testing on the new data

#CLassification 

random_state = 0
test_size = 0.2 # test set is 20% of the total data set
text_train,text_test,y_train,y_test=train_test_split(newdata['Text'].values,newdata[['food','experience','value','service','consistency','convenience']],random_state=0,test_size=0.2)


# Make bag of words feature vectors
language = 'english'
minWordLength = 2 # shorter words will be removed

for i in range(len(text_train)):
    text_train[i] = text_preprocessing(text_train[i], language, minWordLength)
for i in range(len(text_test)):
    text_test[i] = text_preprocessing(text_test[i], language, minWordLength)
        
count_vect = CountVectorizer(ngram_range=(1,3),max_features=20000)
tfidf_transformer = TfidfTransformer(use_idf=True)
    
count_vect.fit(text_train)
text_train_bow = count_vect.transform(text_train)
text_test_bow = count_vect.transform(text_test)
    
tfidf_transformer.fit_transform(text_train_bow)
reviews_bow_train = tfidf_transformer.transform(text_train_bow)
reviews_bow_test = tfidf_transformer.transform(text_test_bow)
joblib.dump(count_vect,'Exported_models/count_vectorizer_newtrain_newtest_classification.pkl')
joblib.dump(tfidf_transformer,'Exported_models/tfidftransformer_newtrain_newtest_classification.pkl')

#training for the various choice drivers
print('Training model on choice driver food'+'\n')
best_model_food = select_best_model(reviews_bow_train,y_train['food'].values,reviews_bow_test,y_test['food'].values)
print('Training model on choice driver experience' +'\n')
best_model_experience = select_best_model(reviews_bow_train, y_train['experience'].values,reviews_bow_test,y_test['experience'].values)
print('Training model on choice driver value' +'\n')
best_model_value = select_best_model(reviews_bow_train, y_train['value'].values,reviews_bow_test,y_test['value'].values)
print('Training model on choice driver service' +'\n')
best_model_service = select_best_model(reviews_bow_train, y_train['service'].values,reviews_bow_test,y_test['service'].values)
print('Training model on choice driver convenience' +'\n')
best_model_convenience = select_best_model(reviews_bow_train, y_train['convenience'].values,reviews_bow_test,y_test['convenience'].values)
print('Training model on choice driver consistency' +'\n')
best_model_consistency = select_best_model(reviews_bow_train, y_train['consistency'].values,reviews_bow_test,y_test['consistency'].values)


# Testing on test set
print('Testing on choice driver food'+ '\n\n' + 'best parameters:')
print( best_model_food.best_estimator_ )
print('\n')
y_test_food = y_test['food'].values
print('Classification report:')
y_pred_food = best_model_food.predict(reviews_bow_test)
print(classification_report(y_test_food, y_pred_food))
cf_food = confusion_matrix(y_test_food, y_pred_food)
print(cf_food)
print(accuracy_score(y_test_food, y_pred_food) * 100)

print('---------------------------------------------')
y_test_experience = y_test['experience'].values
print('Testing on choice driver experience'+ '\n\n'+ 'best parameters:')
print(best_model_experience.best_estimator_)
print('\n')
print('Classification report:')
y_pred_experience = best_model_experience.predict(reviews_bow_test)
print(classification_report(y_test_experience, y_pred_experience))
cf_experience = confusion_matrix(y_test_experience, y_pred_experience)
print(cf_experience)
print(accuracy_score(y_test_experience, y_pred_experience) * 100)

print('---------------------------------------------')

print('Testing on choice driver value'+ '\n\n'+ 'best parameters:')
y_test_value = y_test['value'].values
print(best_model_value.best_estimator_)
print('\n')
print('Classification report:')
y_pred_value = best_model_value.predict(reviews_bow_test)
print(classification_report(y_test_value, y_pred_value))
cf_value = confusion_matrix(y_test_value, y_pred_value)
print(cf_value)
print(accuracy_score(y_test_value, y_pred_value) * 100)

print('---------------------------------------------')

print('Testing on choice driver service'+ '\n\n'+ 'best parameters:')
y_test_service = y_test['service'].values
print(best_model_service.best_estimator_)
print('\n')
print('Classification report:')
y_pred_service = best_model_service.predict(reviews_bow_test)
print(classification_report(y_test_service, y_pred_service))
cf_service = confusion_matrix(y_test_service, y_pred_service)
print(cf_service)
print(accuracy_score(y_test_service, y_pred_service) * 100)

print('---------------------------------------------')

print('Testing on choice driver consistency'+ '\n\n'+ 'best parameters:')
y_test_consistency = y_test['consistency'].values
print(best_model_consistency.best_estimator_)
print('\n')
print('Classification report:')
y_pred_consistency = best_model_consistency.predict(reviews_bow_test)
print(classification_report(y_test_consistency, y_pred_consistency))
cf_consistency = confusion_matrix(y_test_consistency, y_pred_consistency)
print(cf_consistency)
print(accuracy_score(y_test_consistency, y_pred_consistency) * 100)

print('---------------------------------------------')



print('Testing on choice driver convenience'+ '\n\n'+ 'best parameters:')
y_test_convenience = y_test['convenience'].values
print(best_model_convenience.best_estimator_)
print('\n')
print('Classification report:')
y_pred_convenience = best_model_convenience.predict(reviews_bow_test)
print(classification_report(y_test_convenience, y_pred_convenience))
cf_convenience = confusion_matrix(y_test_convenience, y_pred_convenience)
print(cf_convenience)
print(accuracy_score(y_test_convenience, y_pred_convenience) * 100)



joblib.dump(best_model_food.best_estimator_, 'Exported_models/train_new_test_new_food_bestmodel.pkl')
joblib.dump(best_model_food, 'Exported_models/train_new_test_new_food.pkl')
joblib.dump(best_model_consistency.best_estimator_, 'Exported_models/train_new_test_new_consistency_bestmodel.pkl')
joblib.dump(best_model_consistency, 'Exported_models/train_new_test_new_consistency.pkl')
joblib.dump(best_model_convenience.best_estimator_, 'Exported_models/train_new_test_new_convenience_bestmodel.pkl')
joblib.dump(best_model_convenience, 'Exported_models/train_new_test_new_convenience.pkl')
joblib.dump(best_model_experience.best_estimator_, 'Exported_models/train_new_test_new_experience_bestmodel.pkl')
joblib.dump(best_model_experience, 'Exported_models/train_new_test_new_experience.pkl')
joblib.dump(best_model_service.best_estimator_, 'Exported_models/train_new_test_new_service_bestmodel.pkl')
joblib.dump(best_model_service, 'Exported_models/train_new_test_new_service.pkl')
joblib.dump(best_model_value.best_estimator_, 'Exported_models/train_new_test_new_value_bestmodel.pkl')
joblib.dump(best_model_value, 'Exported_models/train_new_test_new_value.pkl')


Training model on choice driver food

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits
SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0378s.) Setting batch_size=10.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.9s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.4s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   22.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   22.1s finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver experience

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits
SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0297s.) Setting batch_size=12.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0418s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 ou

RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.5s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   19.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   19.0s finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver value

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits
SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0243s.) Setting batch_size=16.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0329s.) Setting batch_size=12.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 o

RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.6s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   18.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   18.8s finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver service

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits
SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0246s.) Setting batch_size=16.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0309s.) Setting batch_size=12.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    5.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 o

RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.7s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   20.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   20.5s finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver convenience

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits
SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0240s.) Setting batch_size=16.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0392s.) Setting batch_size=10.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 o

RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.6s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   15.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   15.1s finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver consistency

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits
SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0246s.) Setting batch_size=16.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0401s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 ou

RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.4s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   14.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   14.3s finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Testing on choice driver food

best parameters:
SVC(C=2.3, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


Classification report:


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0236s.) Setting batch_size=16.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished


              precision    recall  f1-score   support

         0.0       0.77      0.46      0.57       301
         1.0       0.80      0.94      0.86       685

    accuracy                           0.79       986
   macro avg       0.78      0.70      0.72       986
weighted avg       0.79      0.79      0.78       986

[[138 163]
 [ 41 644]]
79.3103448275862
---------------------------------------------
Testing on choice driver experience

best parameters:
SVC(C=2.3, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


Classification report:
              precision    recall  f1-score   support

         0.0       0.82      0.69      0.75       463
         1.0       0.76      0.87      0.81       523

    accuracy                           0.78       986
   macro avg       0.79      0.78      0.78       986
weig

  'precision', 'predicted', average, warn_for)


['Exported_models/train_new_test_new_value.pkl']

In [14]:
#training on old and testing on new



#selecting text and choice drivers as an array
text_train = olddata['text'].values
text_test = newdata['Text'].values

food_train = olddata['sentiment_food'].values
food_test = newdata['food'].values
consistency_train = olddata['sentiment_consistency'].values
consistency_test = newdata['consistency'].values
convenience_train = olddata['sentiment_convenience'].values
convenience_test = newdata['convenience'].values
value_train = olddata['sentiment_value'].values
value_test = newdata['value'].values
service_train = olddata['sentiment_service'].values
service_test = newdata['service'].values

language = 'english'
minWordLength = 2 # shorter words will be removed

for i in range(len(text_train)):
    text_train[i] = text_preprocessing(text_train[i], language, minWordLength)
for i in range(len(text_test)):
    text_test[i] = text_preprocessing(text_test[i], language, minWordLength)
        
count_vect = CountVectorizer(ngram_range=(1,3),max_features=20000)
tfidf_transformer = TfidfTransformer(use_idf=True)
    
count_vect.fit(text_train)
text_train_bow = count_vect.transform(text_train)
text_test_bow = count_vect.transform(text_test)
    
tfidf_transformer.fit_transform(text_train_bow)
reviews_bow_train = tfidf_transformer.transform(text_train_bow)
reviews_bow_test = tfidf_transformer.transform(text_test_bow)
joblib.dump(count_vect,'Exported_models/count_vectorizer_oldtrain_newtest_classification.pkl')
joblib.dump(tfidf_transformer,'Exported_models/tfidftransformer_oldtrain_newtest_classification.pkl')

#training the models
print('Training model on choice driver food'+'\n')
best_model_food = select_best_model(reviews_bow_train,food_train,reviews_bow_test,food_test)

print('Training model on choice driver value' +'\n')
best_model_value = select_best_model(reviews_bow_train,value_train,reviews_bow_test,value_test)
print('Training model on choice driver service' +'\n')
best_model_service = select_best_model(reviews_bow_train,service_train,reviews_bow_test,service_test)
print('Training model on choice driver convenience' +'\n')
best_model_convenience = select_best_model(reviews_bow_train,convenience_train,reviews_bow_test,convenience_test)
print('Training model on choice driver consistency' +'\n')
best_model_consistency = select_best_model(reviews_bow_train,consistency_train,reviews_bow_test,consistency_test)
#testing the models
print(best_model_food.best_estimator_)
print('\n')
print('Classification report:')
y_pred_food = best_model_food.predict(reviews_bow_test)
print(classification_report(food_test, y_pred_food))
cf_food = confusion_matrix(food_test, y_pred_food)
print(cf_food)
print(accuracy_score(food_test, y_pred_food) * 100)

print('---------------------------------------------')

print(best_model_consistency.best_estimator_)
print('\n')
print('Classification report:')
y_pred_consistency = best_model_consistency.predict(reviews_bow_test)
print(classification_report(consistency_test, y_pred_consistency))
cf_consistency = confusion_matrix(consistency_test, y_pred_consistency)
print(cf_consistency)
print(accuracy_score(consistency_test, y_pred_consistency) * 100)

print('---------------------------------------------')

print(best_model_convenience.best_estimator_)
print('\n')
print('Classification report:')
y_pred_convenience = best_model_convenience.predict(reviews_bow_test)
print(classification_report(convenience_test, y_pred_convenience))
cf_convenience = confusion_matrix(convenience_test, y_pred_convenience)
print(cf_convenience)
print(accuracy_score(convenience_test, y_pred_convenience) * 100)

print('---------------------------------------------')

print(best_model_value.best_estimator_)
print('\n')
print('Classification report:')
y_pred_value = best_model_value.predict(reviews_bow_test)
print(classification_report(value_test, y_pred_value))
cf_value = confusion_matrix(value_test, y_pred_value)
print(cf_value)
print(accuracy_score(value_test, y_pred_value) * 100)

print('---------------------------------------------')

print(best_model_service.best_estimator_)
print('\n')
print('Classification report:')
y_pred_service = best_model_service.predict(reviews_bow_test)
print(classification_report(service_test, y_pred_service))
cf_service = confusion_matrix(service_test, y_pred_service)
print(cf_service)
print(accuracy_score(service_test, y_pred_service) * 100)

print('---------------------------------------------')
"""
joblib.dump(best_model_food.best_estimator_, 'Exported_models/train_old_test_new_food_bestmodel.pkl')
joblib.dump(best_model_food, 'Exported_models/train_old_test_new_food.pkl')
joblib.dump(best_model_consistency.best_estimator_, 'Exported_models/train_old_test_new_consistency_bestmodel.pkl')
joblib.dump(best_model_consistency, 'Exported_models/train_new_test_new_consistency.pkl')
joblib.dump(best_model_convenience.best_estimator_, 'Exported_models/train_old_test_new_convenience_bestmodel.pkl')
joblib.dump(best_model_convenience, 'Exported_models/train_old_test_new_convenience.pkl')
joblib.dump(best_model_service.best_estimator_, 'Exported_models/train_old_test_new_service_bestmodel.pkl')
joblib.dump(best_model_service, 'Exported_models/train_old_test_new_service.pkl')
joblib.dump(best_model_value.best_estimator_, 'Exported_models/train_old_test_new_value_bestmodel.pkl')
joblib.dump(best_model_value, 'Exported_models/train_old_test_new_value.pkl')
"""

Training model on choice driver food

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1637s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   24.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   24.1s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.1s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   50.6s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   52.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   52.9s finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver value

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0448s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1432s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   41.3s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   44.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   44.3s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.7s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.1min finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver service

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0657s.) Setting batch_size=6.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1059s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   35.2s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   36.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   36.8s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.1s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.2min finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver convenience

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0875s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1397s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   30.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   30.1s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.8s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   44.7s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   46.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   46.3s finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver consistency

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0562s.) Setting batch_size=6.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1050s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   40.1s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   42.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   42.5s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.9s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.0min finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
LogisticRegression(C=0.01, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


Classification report:
              precision    recall  f1-score   support

         0.0       0.62      0.44      0.52      1516
         1.0       0.78      0.88      0.83      3414

    accuracy                           0.74      4930
   macro avg       0.70      0.66      0.67      4930
weighted avg       0.73      0.74      0.73      4930

[[ 673  843]
 [ 418 2996]]
74.42190669371197
---------------------------------------------
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


Classification report:
              precision    recall  f1-score   support

         0.0 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0492s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         0.0       0.85      0.83      0.84      2518
         1.0       0.83      0.85      0.84      2412

    accuracy                           0.84      4930
   macro avg       0.84      0.84      0.84      4930
weighted avg       0.84      0.84      0.84      4930

[[2099  419]
 [ 370 2042]]
83.99594320486815
---------------------------------------------


"\njoblib.dump(best_model_food.best_estimator_, 'Exported_models/train_old_test_new_food_bestmodel.pkl')\njoblib.dump(best_model_food, 'Exported_models/train_old_test_new_food.pkl')\njoblib.dump(best_model_consistency.best_estimator_, 'Exported_models/train_old_test_new_consistency_bestmodel.pkl')\njoblib.dump(best_model_consistency, 'Exported_models/train_new_test_new_consistency.pkl')\njoblib.dump(best_model_convenience.best_estimator_, 'Exported_models/train_old_test_new_convenience_bestmodel.pkl')\njoblib.dump(best_model_convenience, 'Exported_models/train_old_test_new_convenience.pkl')\njoblib.dump(best_model_service.best_estimator_, 'Exported_models/train_old_test_new_service_bestmodel.pkl')\njoblib.dump(best_model_service, 'Exported_models/train_old_test_new_service.pkl')\njoblib.dump(best_model_value.best_estimator_, 'Exported_models/train_old_test_new_value_bestmodel.pkl')\njoblib.dump(best_model_value, 'Exported_models/train_old_test_new_value.pkl')\n"

In [15]:
#training on old+67% old and testing on 33% new

#selecting the neccessary columns from the old and new data frame
newdata1=newdata[['Text','experience','service','consistency','value','food','convenience','Sentiment']].copy()
olddata1=olddata[['sentiment_food','sentiment_service','sentiment_value','sentiment_consistency','sentiment_convenience','text','rating']].copy()

#splitting the test data into 67% for training and rest for testing
train1,train2,test1,test2= train_test_split(newdata1,newdata1['Sentiment'],random_state=0,test_size=0.33)

#renaming columns of the old data to prevent problems during merging of the data frames
olddata1['food']=olddata1['sentiment_food']
olddata1['value']=olddata1['sentiment_value']
olddata1['consistency']=olddata1['sentiment_consistency']
olddata1['service']=olddata1['sentiment_service']
olddata1['convenience']=olddata1['sentiment_convenience']
olddata1['Text']=olddata1['text']
olddata1['Sentiment']=olddata1['rating']

#selecting the required columns again
olddata1=olddata1[['food','consistency','value','convenience','service','Text','Sentiment']]

#drop the experience column
train1=train1.drop(['experience'],axis=1)

#join the old data and the 67% training set from the new data
joineddata=pd.concat([olddata1,train1],axis=0)
#creating arrays from the choice drivers and reviews
text_train= joineddata['Text'].values
text_test=train2['Text'].values


food_train= joineddata['food'].values
food_test=train2['food'].values

consistency_train= joineddata['consistency'].values
consistency_test=train2['consistency'].values

convenience_train= joineddata['convenience'].values
convenience_test=train2['convenience'].values

value_train= joineddata['value'].values
value_test=train2['value'].values

service_train= joineddata['service'].values
service_test=train2['service'].values

#bag of words
language = 'english'
minWordLength = 2 # shorter words will be removed

for i in range(len(text_train)):
    text_train[i] = text_preprocessing(text_train[i], language, minWordLength)
for i in range(len(text_test)):
    text_test[i] = text_preprocessing(text_test[i], language, minWordLength)
        
count_vect = CountVectorizer(ngram_range=(1,3),max_features=20000)
tfidf_transformer = TfidfTransformer(use_idf=True)
    
count_vect.fit(text_train)
text_train_bow = count_vect.transform(text_train)
text_test_bow = count_vect.transform(text_test)
    
tfidf_transformer.fit_transform(text_train_bow)
reviews_bow_train = tfidf_transformer.transform(text_train_bow)
reviews_bow_test = tfidf_transformer.transform(text_test_bow)

joblib.dump(count_vect,'Exported_models/count_vectorizer_oldnewtrain_newtest_classification.pkl')
joblib.dump(tfidf_transformer,'Exported_models/tfidftransformer_oldnewtrain_newtest_classification.pkl')

#training the models
print('Training model on choice driver food'+'\n')
best_model_food = select_best_model(reviews_bow_train,food_train,reviews_bow_test,food_test)

print('Training model on choice driver value' +'\n')
best_model_value = select_best_model(reviews_bow_train,value_train,reviews_bow_test,value_test)
print('Training model on choice driver service' +'\n')
best_model_service = select_best_model(reviews_bow_train,service_train,reviews_bow_test,service_test)
print('Training model on choice driver convenience' +'\n')
best_model_convenience = select_best_model(reviews_bow_train,convenience_train,reviews_bow_test,convenience_test)
print('Training model on choice driver consistency' +'\n')
best_model_consistency = select_best_model(reviews_bow_train,consistency_train,reviews_bow_test,consistency_test)

#testing on the test data

print(best_model_food.best_estimator_)
print('\n')
print('Classification report:')
y_pred_food = best_model_food.predict(reviews_bow_test)
print(classification_report(food_test, y_pred_food))
cf_food = confusion_matrix(food_test, y_pred_food)
print(cf_food)
print(accuracy_score(food_test, y_pred_food) * 100)

print('---------------------------------------------')

print(best_model_consistency.best_estimator_)
print('\n')
print('Classification report:')
y_pred_consistency = best_model_consistency.predict(reviews_bow_test)
print(classification_report(consistency_test, y_pred_consistency))
cf_consistency = confusion_matrix(consistency_test, y_pred_consistency)
print(cf_consistency)
print(accuracy_score(consistency_test, y_pred_consistency) * 100)

print('---------------------------------------------')

print(best_model_convenience.best_estimator_)
print('\n')
print('Classification report:')
y_pred_convenience = best_model_convenience.predict(reviews_bow_test)
print(classification_report(convenience_test, y_pred_convenience))
cf_convenience = confusion_matrix(convenience_test, y_pred_convenience)
print(cf_convenience)
print(accuracy_score(convenience_test, y_pred_convenience) * 100)

print('---------------------------------------------')

print(best_model_service.best_estimator_)
print('\n')
print('Classification report:')
y_pred_service = best_model_service.predict(reviews_bow_test)
print(classification_report(service_test, y_pred_service))
cf_service = confusion_matrix(service_test, y_pred_service)
print(cf_service)
print(accuracy_score(service_test, y_pred_service) * 100)

print('---------------------------------------------')

print(best_model_value.best_estimator_)
print('\n')
print('Classification report:')
y_pred_value = best_model_value.predict(reviews_bow_test)
print(classification_report(value_test, y_pred_value))
cf_value = confusion_matrix(value_test, y_pred_value)
print(cf_value)
print(accuracy_score(value_test, y_pred_value) * 100)

print('---------------------------------------------')
"""
joblib.dump(best_model_food,'Exported_models/train_oldnew_test_new_food.pkl')
joblib.dump(best_model_service,'Exported_models/train_oldnew_test_new_service.pkl')
joblib.dump(best_model_value,'Exported_models/train_oldnew_test_new_value.pkl')
joblib.dump(best_model_consistency,'Exported_models/train_oldnew_test_new_consistency.pkl')
joblib.dump(best_model_convenience,'Exported_models/train_oldnew_test_new_convenience.pkl')
joblib.dump(best_model_food.best_estimator_, 'Exported_models/train_oldnew_test_new_food_bestmodel.pkl')
joblib.dump(best_model_consistency.best_estimator_, 'Exported_models/train_oldnew_test_new_consistency_bestmodel.pkl')
joblib.dump(best_model_convenience.best_estimator_, 'Exported_models/train_oldnew_test_new_convenience_bestmodel.pkl')
joblib.dump(best_model_service.best_estimator_, 'Exported_models/train_oldnew_test_new_service_bestmodel.pkl')
joblib.dump(best_model_value.best_estimator_, 'Exported_models/train_oldnew_test_new_value_bestmodel.pkl')
"""

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Training model on choice driver food

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1488s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   47.8s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   52.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   52.6s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.5s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  6.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  6.3min finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver value

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0638s.) Setting batch_size=6.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1332s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.3min finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.9s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.3min finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Training model on choice driver service

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0766s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1570s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   53.9s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   59.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   59.7s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    9.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    9.2s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.6min finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver convenience

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0544s.) Setting batch_size=6.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1354s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   38.2s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   40.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   40.9s finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.0s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   59.2s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.1min finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Training model on choice driver consistency

LogisticRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0617s.) Setting batch_size=6.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1381s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


SVM
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.2min finished


RandomForest
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.2s finished


GradientBoostingClassifier
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.8min finished


MultinomialNB
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0561s.) Setting batch_size=6.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.2s finished


SVC(C=2.3, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


Classification report:
              precision    recall  f1-score   support

         0.0       0.80      0.38      0.51       511
         1.0       0.77      0.96      0.85      1116

    accuracy                           0.77      1627
   macro avg       0.79      0.67      0.68      1627
weighted avg       0.78      0.77      0.75      1627

[[ 192  319]
 [  48 1068]]
77.44314689612784
---------------------------------------------
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


Classification report:
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99      1605
         1.0       0.00      0.00      0.00        22

    accuracy                           0.99      1627
   macro avg       0.49

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         0.0       0.92      0.92      0.92      1366
         1.0       0.59      0.58      0.59       261

    accuracy                           0.87      1627
   macro avg       0.75      0.75      0.75      1627
weighted avg       0.87      0.87      0.87      1627

[[1260  106]
 [ 109  152]]
86.78549477566072
---------------------------------------------


"\njoblib.dump(best_model_food,'Exported_models/train_oldnew_test_new_food.pkl')\njoblib.dump(best_model_service,'Exported_models/train_oldnew_test_new_service.pkl')\njoblib.dump(best_model_value,'Exported_models/train_oldnew_test_new_value.pkl')\njoblib.dump(best_model_consistency,'Exported_models/train_oldnew_test_new_consistency.pkl')\njoblib.dump(best_model_convenience,'Exported_models/train_oldnew_test_new_convenience.pkl')\njoblib.dump(best_model_food.best_estimator_, 'Exported_models/train_oldnew_test_new_food_bestmodel.pkl')\njoblib.dump(best_model_consistency.best_estimator_, 'Exported_models/train_oldnew_test_new_consistency_bestmodel.pkl')\njoblib.dump(best_model_convenience.best_estimator_, 'Exported_models/train_oldnew_test_new_convenience_bestmodel.pkl')\njoblib.dump(best_model_service.best_estimator_, 'Exported_models/train_oldnew_test_new_service_bestmodel.pkl')\njoblib.dump(best_model_value.best_estimator_, 'Exported_models/train_oldnew_test_new_value_bestmodel.pkl')\n