In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%config InlineBackend.figure_format = 'retina'

import pandas as pd
pd.options.display.max_colwidth = 20
import numpy as np
import os

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.combine import SMOTEENN, SMOTETomek

from collections import Counter

In [3]:
pwd

'/Users/zacklarsen/Dropbox (Inference Analytics)/Inference Analytics Team Folder/Zack Work'

In [4]:
os.chdir('/Users/zacklarsen/Desktop/')
Chats2 = pd.read_csv('Chats2.csv',dtype={'ChatID': np.int64, 'Text': str, 'Expert label (1=design,2=integration,3=transaction)':np.int32},nrows=80)
Chats = Chats2.head(80)

In [5]:
Chats.tail()

Unnamed: 0,ChatID,Text,"Expert label (1=design,2=integration,3=transaction)"
75,447518000000000000,"Hello, my name i...",2
76,447518000000000000,"Hello, my name i...",4
77,447518000000000000,"Hello, my name i...",4
78,447518000000000000,"Hello, my name i...",4
79,447518000000000000,"\r\rHello, my na...",2


In [6]:
Chats['Text'][70]

'Under Specifications,Hello, my name is Carol, how may I help you?,What is FSO? Full Scale,FSO&nbsp; Full Scale Output.,Ok Thanks,Thanks for contacting us. Please do not hesitate to contact us again if we can be of further assistance.'

In [7]:
Chats['ChatID'][70]

447518000000000000

In [8]:
# ## Create a dictionary of customerID,description pairs
token_dict = pd.Series(Chats['Text'].values,index=Chats['ChatID']).to_dict()
token_dict

# ## Print first n (key,value) pairs in dictionary
#n = 3
#keys = sorted(token_dict.keys())
#{k: token_dict[k] for k in keys[:n]}

{447517000000000000: 'Yes, how may I help you sir?,i want a digital flow meter for gases of 100 lpm-200lpm with having datalogging,kindly hold on while I do a check,by the way,, what is the fluid type?,sorry which type of gas?,air,ambient air,you will need a separate data logger to the flow meter,http://sea.omega.com/sg/pptst/FMA-LP2600A.html&nbsp;contain a meter with 4-20mA output,Kindly review to see if it is suitable for your application.,http://in.omega.com/pptst/OM-CP-PROCESS101A.html&nbsp;is the datalogger for your review as well,it is a full setup i want a handheld digital flow meter, it would be better if their exists datalogging option,Kindly hold on sir, I currently have 3 lines running.,Hi Mr Sanjay, we will sent the information to your email once we get the product suitable to your requirement.,ok. thanks.please do needful. ASAP.,Will do that . thanks',
 447518000000000000: "\r\rHello, my name is Chet, how may I help you?,Hi Chet,,I am using your wtt-6-24-tt and need to cha

In [9]:
# Use line below for just unigrams
#v = TfidfVectorizer()

# Use line below for unigrams, bigrams, and trigrams
v = TfidfVectorizer(ngram_range=(2,3))  



X = v.fit_transform(Chats['Text']).toarray()

convoDF = pd.DataFrame() # Initialize new DF

for i, col in enumerate(v.get_feature_names()):
    convoDF[col] = X[:, i]

In [10]:
convoDF.head()

Unnamed: 0,00 each,00 each with,00 indent,00 indent about,00 iserver,00 iserver bar,00 lbs,00 lbs found,00 ok,00 ok thanks,...,âºâ hold,âºâ hold on,â½ hour,â½ hour thank,œdoor openâ,œdoor openâ press,œreset data,œreset data press,œyes now,œyes now your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
convoDF.columns.values[:1000]

array(['00 each', '00 each with', '00 indent', '00 indent about',
       '00 iserver', '00 iserver bar', '00 lbs', '00 lbs found', '00 ok',
       '00 ok thanks', '00 please', '00 please hold', '00 reference',
       '00 reference number', '016vdc all', '016vdc all right', '03 14',
       '03 14 probably', '03 17', '03 17 2016', '03 24', '03 24 omega',
       '03 25', '03 25 03', '034volts when', '034volts when just', '04 16',
       '04 16 but', '04 16 shipment', '04 16 the', '06 03', '06 03 14',
       '07 5gpm', '07 5gpm 26lpm', '078 ok', '078 ok what', '08 am',
       '08 am created', '08 am this', '0800 488', '0800 488 488',
       '087 2574', '087 2574 078', '0lpm 2lpm', '0lpm 2lpm hi', '10 51',
       '10 51 08', '10 bar', '10 bar please', '10 cm', '10 cm length',
       '10 for', '10 for that', '10 in', '10 in stock', '10 is',
       '10 is bit', '10 oily', '10 oily product', '100 hi', '100 hi ben',
       '100 lpm', '100 lpm 200lpm', '100 psi', '100 psi pressure',
       '100 

In [12]:
X.shape

(80, 18712)

In [13]:
convoDF.head()

Unnamed: 0,00 each,00 each with,00 indent,00 indent about,00 iserver,00 iserver bar,00 lbs,00 lbs found,00 ok,00 ok thanks,...,âºâ hold,âºâ hold on,â½ hour,â½ hour thank,œdoor openâ,œdoor openâ press,œreset data,œreset data press,œyes now,œyes now your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
Y = Chats['Expert label (1=design,2=integration,3=transaction)']

In [15]:
scaler = StandardScaler()
scaler.fit(convoDF)
x_s = scaler.transform(convoDF)
x_s
Y.head()

StandardScaler(copy=True, with_mean=True, with_std=True)

array([[-0.11250879, -0.11250879, -0.11250879, ..., -0.11250879,
        -0.11250879, -0.11250879],
       [-0.11250879, -0.11250879, -0.11250879, ..., -0.11250879,
        -0.11250879, -0.11250879],
       [-0.11250879, -0.11250879, -0.11250879, ..., -0.11250879,
        -0.11250879, -0.11250879],
       ..., 
       [-0.11250879, -0.11250879, -0.11250879, ..., -0.11250879,
        -0.11250879, -0.11250879],
       [-0.11250879, -0.11250879, -0.11250879, ..., -0.11250879,
        -0.11250879, -0.11250879],
       [-0.11250879, -0.11250879, -0.11250879, ..., -0.11250879,
        -0.11250879, -0.11250879]])

0    3
1    2
2    3
3    2
4    3
Name: Expert label (1=design,2=integration,3=transaction), dtype: int32

In [16]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(x_s, Y, test_size=0.2, random_state=42)
X_train
y_train[:5]

array([[-0.11250879, -0.11250879, -0.11250879, ..., -0.11250879,
        -0.11250879, -0.11250879],
       [-0.11250879, -0.11250879, -0.11250879, ..., -0.11250879,
        -0.11250879, -0.11250879],
       [-0.11250879, -0.11250879, -0.11250879, ..., -0.11250879,
        -0.11250879, -0.11250879],
       ..., 
       [-0.11250879, -0.11250879, -0.11250879, ..., -0.11250879,
        -0.11250879, -0.11250879],
       [-0.11250879, -0.11250879,  8.88819442, ..., -0.11250879,
        -0.11250879, -0.11250879],
       [-0.11250879, -0.11250879, -0.11250879, ..., -0.11250879,
        -0.11250879, -0.11250879]])

73    3
61    1
55    4
40    4
9     2
Name: Expert label (1=design,2=integration,3=transaction), dtype: int32

In [17]:
## Perform oversampling technique
# https://beckernick.github.io/oversampling-modeling/
        
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_sample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))


sm = SMOTE(random_state=42,kind = "regular")
X_res, y_res = sm.fit_sample(X_train, y_train)
print(sorted(Counter(y_res).items()))

[(1, 19), (2, 19), (3, 19), (4, 19)]
[(1, 19), (2, 19), (3, 19), (4, 19)]


In [18]:
Counter(y_train)
Counter(y_res)

Counter({1: 9, 2: 18, 3: 19, 4: 18})

Counter({1: 19, 2: 19, 3: 19, 4: 19})

In [19]:
# Use Principal Component Analysis to reduce dimensionality
# and improve generalization
pca = PCA()

# Use support vector machine classifier
svc = SVC()

# Use Gaussian Naive Bayes classifier
#mnb = MultinomialNB()

# Combine PCA and SVC and MNB to a pipeline
pipe = Pipeline(steps=[('pca', pca), ('svc', svc)])

# Vary the PCA components
n_components = [10, 50, 150]

# Define the parameters to search over
params_grid = { 
    'pca__n_components': n_components,
    'svc__C': [1, 10, 100, 1000],
    'svc__kernel': ['linear', 'rbf', 'sigmoid'],
    'svc__gamma' : [0.001, 0.0001],
}

estimator = GridSearchCV(pipe, params_grid)
#estimator.fit(X_train, y_train)
estimator.fit(X_res, y_res)

print (estimator.best_params_, estimator.best_score_)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'svc__kernel': ['linear', 'rbf', 'sigmoid'], 'svc__C': [1, 10, 100, 1000], 'svc__gamma': [0.001, 0.0001], 'pca__n_components': [10, 50, 150]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

{'svc__kernel': 'sigmoid', 'svc__C': 1, 'svc__gamma': 0.0001, 'pca__n_components': 50} 0.578947368421


In [20]:
estimator.best_score_

0.57894736842105265

In [21]:
estimator.best_estimator_

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='sigmoid',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [22]:
y_pred = estimator.best_estimator_.predict(X_test)

In [23]:
confusion_matrix(y_test, y_pred)

array([[0, 0, 1, 0],
       [0, 1, 2, 0],
       [0, 0, 6, 0],
       [0, 1, 5, 0]])

In [24]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          1       0.00      0.00      0.00         1
          2       0.50      0.33      0.40         3
          3       0.43      1.00      0.60         6
          4       0.00      0.00      0.00         6

avg / total       0.25      0.44      0.30        16



  'precision', 'predicted', average, warn_for)


## Save model to pickle file

In [32]:
from sklearn.externals import joblib
joblib.dump(estimator, 'model.pkl')

['model.pkl']

In [None]:
# Load it back in

clf = joblib.load('model.pkl')

## Further refine the model

In [25]:
# Use Principal Component Analysis to reduce dimensionality
# and improve generalization
pca = PCA()

# Use support vector machine classifier
svc = SVC()

# Combine PCA and SVC and MNB to a pipeline
pipe = Pipeline(steps=[('pca', pca), ('svc', svc)])


# Define the parameters to search over
params_grid = { 
    'pca__n_components': [30,50],
    'svc__C': [0.01,0.001,0.5],
    'svc__kernel': ['rbf', 'sigmoid'],
    'svc__gamma' : [0.0001, 0.001,0.005],
}

estimator = GridSearchCV(pipe, params_grid)
#estimator.fit(X_train, y_train)
estimator.fit(X_res, y_res)

print (estimator.best_params_, estimator.best_score_)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'svc__kernel': ['rbf', 'sigmoid'], 'svc__C': [0.01, 0.001, 0.5], 'svc__gamma': [0.0001, 0.001, 0.005], 'pca__n_components': [30, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

{'svc__kernel': 'sigmoid', 'svc__C': 0.01, 'svc__gamma': 0.0001, 'pca__n_components': 50} 0.460526315789


In [26]:
estimator.best_score_

0.46052631578947367

In [27]:
estimator.best_estimator_

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='sigmoid',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [28]:
y_pred = estimator.best_estimator_.predict(X_test)

In [29]:
confusion_matrix(y_test, y_pred)

array([[0, 0, 1, 0],
       [0, 0, 3, 0],
       [0, 0, 6, 0],
       [0, 0, 6, 0]])

In [30]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          1       0.00      0.00      0.00         1
          2       0.00      0.00      0.00         3
          3       0.38      1.00      0.55         6
          4       0.00      0.00      0.00         6

avg / total       0.14      0.38      0.20        16



  'precision', 'predicted', average, warn_for)


## Gaussian Naive Bayes
## http://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/

In [183]:
GNB = GaussianNB()
GNB.fit(X_train, y_train)

GaussianNB(priors=None)

In [186]:
y_pred = GNB.predict(X_test)

In [187]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred)

0.5625

In [189]:
confusion_matrix(y_test, y_pred)

array([[0, 1, 0, 0],
       [0, 3, 0, 0],
       [0, 3, 2, 1],
       [0, 1, 1, 4]])

In [188]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          1       0.00      0.00      0.00         1
          2       0.38      1.00      0.55         3
          3       0.67      0.33      0.44         6
          4       0.80      0.67      0.73         6

avg / total       0.62      0.56      0.54        16



  'precision', 'predicted', average, warn_for)


In [199]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

ValueError: multiclass format is not supported

## https://bbengfort.github.io/tutorials/2016/05/19/text-classification-nltk-sckit-learn.html