In [2]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score,classification_report
from sklearn.pipeline import Pipeline
import pickle

In [3]:
df = pd.read_csv('df_text_new.csv')
df.head()

Unnamed: 0,query,verbs,nouns,verb_noun,Subject,Intent,Sentiment
0,could you help me cancelling the last order I ...,"help,cancelling,made",order,"help,cancelling,made,order",order,cancellation,Positive
1,could you help me cancelling an order I made?,"help,cancelling,made",order,"help,cancelling,made,order",order,cancellation,Positive
2,help me cancelling an order I made,"help,cancelling,made",order,"help,cancelling,made,order",order,cancellation,Positive
3,help me cancelling the last order I have made,"help,cancelling,made",order,"help,cancelling,made,order",order,cancellation,Positive
4,help with cancelling the order I have made,"help,cancelling,made",order,"help,cancelling,made,order",order,cancellation,Positive


In [4]:
df['Subject'].value_counts()

account     2727
order       1239
purchase     393
Name: Subject, dtype: int64

In [140]:
# First_model to get the subject
df_subject = df[['verb_noun','Subject']].copy()
df_subject['verb_noun'] = df_subject['verb_noun'].str.strip(',')
df_subject['verb_noun'] = df_subject['verb_noun'].apply(lambda x: ' '.join(x.split(',')))
df_subject.head(2)

Unnamed: 0,verb_noun,Subject
0,help cancelling made order,order
1,help cancelling made order,order


In [141]:
X = df_subject[['verb_noun']].copy()
y = df_subject['Subject'].copy()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=1234)

In [142]:
vectorizer_sub = TfidfVectorizer()
X_train_tfidf = pd.DataFrame(vectorizer_sub.fit_transform(X_train['verb_noun']).toarray())

In [143]:
model_RF = RandomForestClassifier()
params_RF = {'criterion':['gini','entropy'],
         'n_estimators':list(range(101,902,2)),
         'min_samples_leaf':list(range(1,10)),
         'max_features':list(range(1,15)),
         'max_samples':list(np.arange(0.1,1))}
grid_search_RF = RandomizedSearchCV(estimator = model_RF, param_distributions = params_RF, cv = 10,n_jobs=-1,scoring='accuracy',verbose=2)
grid_search_RF.fit(X_train_tfidf,y_train.values.ravel())
RF_model_subject = grid_search_RF.best_estimator_

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [144]:
y_pred_sub = RF_model_subject.predict(pd.DataFrame(vectorizer_sub.transform(X_test['verb_noun'].values).toarray()))

In [145]:
print(classification_report(y_test,y_pred_sub))

              precision    recall  f1-score   support

     account       1.00      1.00      1.00       827
       order       1.00      0.99      0.99       355
    purchase       0.97      1.00      0.98       126

    accuracy                           1.00      1308
   macro avg       0.99      1.00      0.99      1308
weighted avg       1.00      1.00      1.00      1308



In [146]:
file = open('vectorizer_sub.pkl','wb')
pickle.dump(vectorizer_sub,file)
file.close()

In [147]:
file = open('model_subject.pkl','wb')
pickle.dump(RF_model_subject,file)
file.close()

In [148]:
vec_sub = pickle.load(open('vectorizer_sub.pkl','rb'))
model_sub = pickle.load(open('model_subject.pkl','rb'))

Y_pred_sub = model_sub.predict(pd.DataFrame(vec_sub.transform(X['verb_noun'].values).toarray()))

In [150]:
print(classification_report(y,Y_pred_sub))

              precision    recall  f1-score   support

     account       1.00      1.00      1.00      2727
       order       1.00      0.99      1.00      1239
    purchase       0.98      1.00      0.99       393

    accuracy                           1.00      4359
   macro avg       0.99      1.00      0.99      4359
weighted avg       1.00      1.00      1.00      4359



In [155]:
# second_model to get the intent
df_intent = df[['query','Intent']].copy()
df_intent.head(2)

Unnamed: 0,query,Intent
0,could you help me cancelling the last order I ...,cancellation
1,could you help me cancelling an order I made?,cancellation


In [156]:
X = df_intent[['query']].copy()
y = df_intent['Intent'].copy()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=1234)

In [157]:
vectorizer_int = TfidfVectorizer()
X_train_tfidf = pd.DataFrame(vectorizer_int.fit_transform(X_train['query']).toarray())

In [158]:
model_RF = RandomForestClassifier()
params_RF = {'criterion':['gini','entropy'],
         'n_estimators':list(range(101,902,2)),
         'min_samples_leaf':list(range(1,10)),
         'max_features':list(range(1,15)),
         'max_samples':list(np.arange(0.1,1))}
grid_search_RF = RandomizedSearchCV(estimator = model_RF, param_distributions = params_RF, cv = 10,n_jobs=-1,scoring='accuracy',verbose=2)
grid_search_RF.fit(X_train_tfidf,y_train.values.ravel())
RF_model_intent = grid_search_RF.best_estimator_

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [159]:
y_pred_int = RF_model_intent.predict(pd.DataFrame(vectorizer_int.transform(X_test['query'].values).toarray()))

In [160]:
print(classification_report(y_test,y_pred_int))

              precision    recall  f1-score   support

cancellation       1.00      0.90      0.95        40
     enquiry       0.97      0.87      0.92       374
modification       0.92      0.97      0.94       147
 restoration       1.00      1.00      1.00       332
      set_up       0.92      1.00      0.95       415

    accuracy                           0.95      1308
   macro avg       0.96      0.95      0.95      1308
weighted avg       0.96      0.95      0.95      1308



In [161]:
file = open('vectorizer_int.pkl','wb')
pickle.dump(vectorizer_int,file)
file.close()

In [162]:
file = open('model_intent.pkl','wb')
pickle.dump(RF_model_intent,file)
file.close()

In [163]:
vec_int = pickle.load(open('vectorizer_int.pkl','rb'))
model_int = pickle.load(open('model_intent.pkl','rb'))

Y_pred_int = model_int.predict(pd.DataFrame(vec_int.transform(X['query'].values).toarray()))

In [164]:
print(classification_report(y,Y_pred_int))

              precision    recall  f1-score   support

cancellation       1.00      0.94      0.97       131
     enquiry       0.99      0.90      0.94      1246
modification       0.93      0.99      0.96       502
 restoration       1.00      1.00      1.00      1146
      set_up       0.94      1.00      0.97      1334

    accuracy                           0.97      4359
   macro avg       0.97      0.97      0.97      4359
weighted avg       0.97      0.97      0.97      4359



In [165]:
# third_model to get the sentiment
df_sentiment = df[['query','Sentiment']].copy()
df_sentiment.head(2)

Unnamed: 0,query,Sentiment
0,could you help me cancelling the last order I ...,Positive
1,could you help me cancelling an order I made?,Positive


In [166]:
X = df_sentiment[['query']].copy()
y = df_sentiment['Sentiment'].copy()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=1234)

In [167]:
vectorizer_senti = TfidfVectorizer()
X_train_tfidf = pd.DataFrame(vectorizer_senti.fit_transform(X_train['query']).toarray())

In [168]:
model_RF = RandomForestClassifier()
params_RF = {'criterion':['gini','entropy'],
         'n_estimators':list(range(101,902,2)),
         'min_samples_leaf':list(range(1,10)),
         'max_features':list(range(1,15)),
         'max_samples':list(np.arange(0.1,1))}
grid_search_RF = RandomizedSearchCV(estimator = model_RF, param_distributions = params_RF, cv = 10,n_jobs=-1,scoring='accuracy',verbose=2)
grid_search_RF.fit(X_train_tfidf,y_train.values.ravel())
RF_model_sentiment = grid_search_RF.best_estimator_

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [169]:
y_pred_senti = RF_model_sentiment.predict(pd.DataFrame(vectorizer_senti.transform(X_test['query'].values).toarray()))

In [170]:
print(classification_report(y_test,y_pred_senti))

              precision    recall  f1-score   support

    Negative       0.96      0.87      0.91       208
     Neutral       0.95      0.98      0.96       575
    Positive       0.97      0.97      0.97       525

    accuracy                           0.96      1308
   macro avg       0.96      0.94      0.95      1308
weighted avg       0.96      0.96      0.96      1308



In [171]:
file = open('vectorizer_senti.pkl','wb')
pickle.dump(vectorizer_senti,file)
file.close()

In [172]:
file = open('model_sentiment.pkl','wb')
pickle.dump(RF_model_sentiment,file)
file.close()

In [173]:
vec_senti = pickle.load(open('vectorizer_senti.pkl','rb'))
model_senti = pickle.load(open('model_sentiment.pkl','rb'))

Y_pred_int = model_senti.predict(pd.DataFrame(vec_senti.transform(X['query'].values).toarray()))

In [174]:
print(classification_report(y,Y_pred_int))

              precision    recall  f1-score   support

    Negative       0.98      0.91      0.94       710
     Neutral       0.97      0.99      0.98      1937
    Positive       0.98      0.98      0.98      1712

    accuracy                           0.97      4359
   macro avg       0.98      0.96      0.97      4359
weighted avg       0.97      0.97      0.97      4359

