In [1]:
# handle dataset
import numpy as np
import pandas as pd
# Text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
import string
#Model Building
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
# Model Persist
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report



In [4]:
df = pd.read_csv('chat_room.csv')
df.head()

Unnamed: 0,Query,Intent
0,who are you?,s_talk.acquaintance
1,all about you,s_talk.acquaintance
2,what is your personality,s_talk.acquaintance
3,define yourself,s_talk.acquaintance
4,what are you,s_talk.acquaintance


In [5]:
def clean_text(text):
    text = text.translate(string.punctuation)
    text = text.lower().split()
    stops = set(stopwords.words('english'))
    text = [w for w in text if len(w)  >= 2]
    text = " ".join(text)
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    text = text.split()
    text = " ".join(text)
    
    return text

In [6]:
df['Query'] = df['Query'].map(lambda x: clean_text(x))

In [7]:
df = shuffle(df,random_state = 0)

In [8]:
X_train,X_test,y_train,y_test = train_test_split(df['Query'],df['Intent'],random_state = 0)

In [9]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

In [10]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [11]:
# model = MultinomialNB().fit(X_train_counts,y_train)
model = SVC(kernel='linear').fit(X_train_counts,y_train)

In [12]:
print("\nTrain Score:",model.score(count_vect.transform(X_train), y_train))
print("\nTest Score:",model.score(count_vect.transform(X_test), y_test))


Train Score: 0.99644128113879

Test Score: 0.9680851063829787


In [13]:
word = clean_text("what is my recommendation for the month")
res_lst = model.predict((count_vect.transform([word])))
# res_lst[0]
# classes = model.classes_
# classes[np.argmax(res_lst)]
# word
res_lst

array(['buss.recommd'], dtype=object)

In [14]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SVC(kernel = 'linear',probability = True))])

In [15]:
tuned_parameters = {

}

In [16]:
clf = GridSearchCV(text_clf, tuned_parameters, cv=10)
clf.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                           

In [17]:
print(classification_report(y_test, clf.predict(X_test), digits=4))

                     precision    recall  f1-score   support

       buss.recommd     1.0000    1.0000    1.0000         4
         buss.spend     1.0000    1.0000    1.0000         7
s_talk.acquaintance     1.0000    0.6667    0.8000         3
   s_talk.beautiful     0.8667    1.0000    0.9286        13
       s_talk.bored     1.0000    0.8000    0.8889         5
         s_talk.bye     1.0000    1.0000    1.0000         8
      s_talk.clever     0.9487    0.9737    0.9610        38
 s_talk.goodevening     1.0000    1.0000    1.0000         3
 s_talk.goodmorning     1.0000    1.0000    1.0000         5
   s_talk.goodnight     1.0000    1.0000    1.0000         6
       s_talk.hello     1.0000    0.5000    0.6667         2

           accuracy                         0.9574        94
          macro avg     0.9832    0.9037    0.9314        94
       weighted avg     0.9608    0.9574    0.9550        94



In [18]:
clf.best_params_

{}

In [19]:
X_train[371]

'what all the purchased for the catergory food'

In [21]:
arg = np.argmax(clf.predict_proba([X_train[371]]))
arg

1

In [22]:
joblib.dump(clf,"NB_Cbot.pkl")

['NB_Cbot.pkl']

In [23]:
model_NB = joblib.load("NB_Cbot.pkl")

In [24]:
model_NB.predict_proba([X_train[371]])

array([[0.01745005, 0.85743366, 0.0233591 , 0.01023244, 0.00683468,
        0.00909718, 0.00596822, 0.02039683, 0.01340812, 0.00461193,
        0.00710543, 0.00947317, 0.01462918]])

'buss.spend'

In [43]:
confidence = clf.predict_proba([X_train[371]])[0][arg]
intent = clf.classes_[arg]

0.8586606806913137