## Loading the modules

In [4]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
%matplotlib inline
import codecs
import re
import enchant

In [5]:
import nltk
import os
import sys

In [7]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import time
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC



In [8]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from word2vecUtils import utils
from nltk.stem import SnowballStemmer

In [10]:
filename = "./Data/data_segments2.json"
data = open(filename).read()

In [11]:
#pattern = re.compile(r'{[\n\s]*"_id"[a-zA-Z0-9~`!@#$%^&*()\-_+={}\[\]:;\"\'<>,.?/|\\]*')
#arr = pattern.findall(data)

In [12]:
def load_json_multiple(segments):
    chunk = ""
    for segment in segments:
        chunk += segment
        try:
            yield json.loads(chunk)
            chunk = ""
        except ValueError:
            pass

## Parsing the Json File

In [13]:
data = []
with open(filename) as f:
    for parsed_json in load_json_multiple(f):
        data.append(parsed_json)
print len(data)

113


In [14]:
data1 = []
for i in xrange(len(data)):
    for j in data[i]["transcription_merged"]:
        temp = j
        temp["_id"] = data[i]["_id"]
        data1.append(temp)

In [15]:
df = pd.DataFrame(data1)
df.head()

Unnamed: 0,_id,conv_no,end_time,line,question,segment,speaker,start_time
0,d7da99f7-013b-4d63-84d9-421226ee5249,0.0,30,at indian east india company now by the way wh...,,,Rep,0
1,d7da99f7-013b-4d63-84d9-421226ee5249,1.0,31,HMM.,,,Prospect,30
2,d7da99f7-013b-4d63-84d9-421226ee5249,2.0,34,so you're creating tons of synonyms on the bac...,,,Rep,31
3,d7da99f7-013b-4d63-84d9-421226ee5249,3.0,35,it is manual.,,,Prospect,34
4,d7da99f7-013b-4d63-84d9-421226ee5249,4.0,53,right exactly you got it then next comes is th...,,,Rep,35


In [16]:
df[df.segment=="None"].shape

(2390, 8)

In [17]:
df[~df.segment.isnull()].shape

(4612, 8)

In [18]:
df.segment.value_counts()

None                           2390
Rep questions                   663
Customer questions              523
Next steps and action items     228
Client reference                198
Pricing                         114
Pitch                           110
Agenda                           89
Customer pain points             88
Objection                        65
Competitor differentiation       46
Relevant case study              32
Brilio pitch                     32
Competitor                       16
Closing discussion               13
Next steps                        2
Unbxd pitch                       2
WebEngage pitch                   1
Name: segment, dtype: int64

# Selecting the records for which segment column is not null

In [19]:
df1 = df[~df.segment.isnull()]

In [20]:
df1.head()

Unnamed: 0,_id,conv_no,end_time,line,question,segment,speaker,start_time
11,d7da99f7-013b-4d63-84d9-421226ee5249,11.0,245,the next piece what I really wanted to also hi...,,Pitch,Rep,87
17,d7da99f7-013b-4d63-84d9-421226ee5249,17.0,303,however what Celeb rose is not very good at is...,,Competitor differentiation,Rep,260
19,d7da99f7-013b-4d63-84d9-421226ee5249,19.0,364,yes yes that is interesting to me now one one ...,,Customer pain points,Prospect,318
21,d7da99f7-013b-4d63-84d9-421226ee5249,21.0,407,so we have and I did this for a couple of reas...,,Customer pain points,Prospect,365
24,d7da99f7-013b-4d63-84d9-421226ee5249,24.0,429,you guys offer a link search in navigation.,,Customer questions,Prospect,426


In [None]:
df1["target"] = np.where(df1["segment"]=="Next steps and action items",1,0)

### Changing the datatype

In [None]:
df1.end_time = df1.end_time.astype(np.float64)
df1.start_time = df1.start_time.astype(np.float64)
df1.conv_no = df1.conv_no.astype(np.float64)

In [23]:
df1 = df1.set_index(np.arange(len(df1)))

In [24]:
df1["time_difference"] = df1["end_time"] - df1["start_time"]
df1["time_difference"] = df1["time_difference"]/max(df1["time_difference"])
df1["conv_no"] = df1["conv_no"] / max(df1["conv_no"])

In [25]:
df1.speaker.value_counts()

Rep         2818
Prospect    1794
Name: speaker, dtype: int64

In [26]:
le = LabelEncoder()
df1["speaker"] = le.fit_transform(df1["speaker"])

In [27]:
df1 = df1.drop(["_id","question"],axis=1)

### Cleaning the data for unuwanter characters

In [28]:
df1["line"] = df1["line"].apply(lambda x: re.sub(r"[^\x00-\x7f]"," ",x))
df1["line"] = df1["line"].apply(lambda x: re.sub(r"^[a-zA-Z0-9\s]", "", x))
df1["line"] = df1["line"].apply(str)

In [29]:
df1["line"] = df1["line"].apply(lambda x: re.sub(r"[\s]+"," ", x).strip())
df1["line"] = df1["line"].apply(lambda x: x.lower())

### Calculating the sentiment score

In [30]:
analyzer = SentimentIntensityAnalyzer()

In [31]:
df1['compoundScore'] = df1['line'].apply(lambda x: analyzer.polarity_scores(x)["compound"])
df1['negSent'] = df1['line'].apply(lambda x: analyzer.polarity_scores(x)["neg"])
df1['posSent'] = df1['line'].apply(lambda x: analyzer.polarity_scores(x)["pos"])
df1['neuSent'] = df1['line'].apply(lambda x: analyzer.polarity_scores(x)["neu"])

In [32]:
stop_words = set(stopwords.words('english'))

In [33]:
df1["line"] = df1["line"].apply(lambda x: " ".join([i for i in word_tokenize(x) if i not in stop_words]))

In [34]:
stemmer = SnowballStemmer('english')

In [35]:
df1["line"] = df1["line"].apply(lambda x: " ".join([stemmer.stem(i) for i in word_tokenize(x)]))

In [36]:
df1["line"] = df1["line"].apply(lambda x: " ".join([i for i in word_tokenize(x) if len(x)>2]))

In [37]:
df1.head()

Unnamed: 0,conv_no,end_time,line,segment,speaker,start_time,target,time_difference,compoundScore,negSent,posSent,neuSent
0,0.02965,245.0,next piec realli want also highlight coupl thi...,Pitch,1,87.0,0,0.335456,0.9971,0.018,0.204,0.778
1,0.045822,303.0,owev celeb rose good < > spellcheck n't know g...,Competitor differentiation,1,260.0,0,0.091295,0.7039,0.018,0.077,0.905
2,0.051213,364.0,es yes interest one one one question thing wan...,Customer pain points,0,318.0,0,0.097665,0.9509,0.0,0.163,0.837
3,0.056604,407.0,coupl reason one well first thing get influenc...,Customer pain points,0,365.0,0,0.089172,0.25,0.032,0.053,0.915
4,0.06469,429.0,ou guy offer link search navig .,Customer questions,0,426.0,0,0.006369,0.0,0.0,0.0,1.0


In [66]:
df1["predicted_value"] = np.where(df1["segment"]=="Next steps and action items",1,0)

In [39]:
def review_to_wordlist( review, remove_stopwords=False ):
        review_text = re.sub("[^a-zA-Z]"," ", review)
        words = review_text.lower().split()
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
        return(words)

In [40]:
clean_desc = []
for index, row in df1.iterrows():
    clean_desc.append(" ".join(
        review_to_wordlist(row["line"], False)))

#vectorizer = CountVectorizer(max_features=5000)
#data_features = vectorizer.fit_transform(clean_desc)

### Calculating the TFID for the column "line"

In [41]:
tfid_vectorizer = TfidfVectorizer()
data_features = tfid_vectorizer.fit_transform(clean_desc)

In [43]:
np.asarray(data_features)
data_features = data_features.astype(np.float32)
features_df = pd.DataFrame(data_features.todense(), columns=tfid_vectorizer.get_feature_names())

In [44]:
features_df.shape

(4612, 4630)

In [45]:
df_combined = pd.concat([df1, features_df], axis=1)

In [46]:
df_combined = df_combined.drop(["start_time","end_time","line","segment","target","conv_no"],axis=1)

In [47]:
train = df_combined.sample(frac=0.7,random_state=777)
test = df_combined.drop(train.index)

In [48]:
#train = pd.read_csv("./train_sample.csv")
#test = pd.read_csv("./test_sample.csv")

In [49]:
y_train = train["predicted_value"]
y_test = test["predicted_value"]
X_train = train.drop(["predicted_value"],axis=1)
X_test = test.drop(["predicted_value"],axis=1)

In [50]:
X_train.shape, y_train.shape
# train.to_csv("./train_sample.csv",index=False)
# test.to_csv("./test_sample.csv",index=False)

((3228, 4633), (3228,))

In [53]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [52]:
y_train= y_train.as_matrix()
y_test = y_test.as_matrix()
X_train = X_train.as_matrix()
X_test = X_test.as_matrix()

In [57]:
model1.get_params().keys()

['loss',
 'C',
 'verbose',
 'intercept_scaling',
 'fit_intercept',
 'max_iter',
 'penalty',
 'multi_class',
 'random_state',
 'dual',
 'tol',
 'class_weight']

In [68]:
def evaluateModel(model,X_train,y_train,X_test,y_test,testing=True):
    model.fit(X_train,y_train)
    if testing:
        y_pred = model.predict(X_test)
        print "accuracy is ", accuracy_score(y_test, y_pred)
        print classification_report(y_test, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        print tn, fp, fn, tp

### Using LinearSVC Model

In [67]:
parameter_candidates = [
  {'C': [0.1, 0.01, 1, 10, 100, 1000]}
]
model1 = LinearSVC()
model_grid1 = GridSearchCV(model1, parameter_candidates)
evaluateModel(model_grid1,X_train,y_train,X_test,y_test)
print('Best score for data1:', model_grid1.best_score_)
print model_grid1.best_estimator_

accuracy is  0.967485549133
             precision    recall  f1-score   support

          0       0.97      0.99      0.98      1309
          1       0.84      0.49      0.62        75

avg / total       0.96      0.97      0.96      1384

1302 7 38 37
('Best score for data1:', 0.96251548946716237)
LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


### Using Random Forest Model

In [62]:
model2 = RandomForestClassifier()
evaluateModel(model2,X_train,y_train,X_test,y_test)

accuracy is  0.950867052023
             precision    recall  f1-score   support

          0       0.95      1.00      0.97      1309
          1       0.82      0.12      0.21        75

avg / total       0.94      0.95      0.93      1384

1307 2 66 9


In [63]:
parameter_candidates = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
model_grid2 = GridSearchCV(SVC(),parameter_candidates)
evaluateModel(model_grid2,X_train,y_train,X_test,y_test)
print('Best score for data1:', model_grid2.best_score_)

('Best score for data1:', 0.96189591078066916)


In [64]:
evaluateModel(model_grid2,X_train,y_train,X_test,y_test)

accuracy is  0.96098265896
             precision    recall  f1-score   support

          0       0.97      0.99      0.98      1309
          1       0.80      0.37      0.51        75

avg / total       0.96      0.96      0.95      1384

1302 7 47 28


In [16]:
model_grid2.best_params_

{'C': 1, 'kernel': 'linear'}

In [240]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [237]:
print classification_report(y_test, y_pred)

             precision    recall  f1-score   support

          0       0.97      0.99      0.98      1309
          1       0.84      0.49      0.62        75

avg / total       0.96      0.97      0.96      1384



In [238]:
print accuracy_score(y_test, y_pred)

0.967485549133


In [244]:
print confusion_matrix(y_test, y_pred)

[[1302    7]
 [  38   37]]


In [243]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
(tn, fp, fn, tp)

(1302, 7, 38, 37)

In [164]:
gbm = lgb.LGBMClassifier(n_estimators=3000, max_depth=3, subsample=0.7, colsample_bytree= 0.7)
gbm = gbm.fit(X_train, y_train)

In [188]:
y_pred = gbm.predict_proba(X_test)

In [189]:
y_pred

array([[  9.99082292e-01,   8.85479835e-04,   3.22285569e-05],
       [  9.99840524e-01,   1.46293847e-04,   1.31820306e-05],
       [  9.99720408e-01,   2.74870572e-04,   4.72132789e-06],
       ..., 
       [  9.97216883e-01,   2.45843813e-03,   3.24678593e-04],
       [  9.99437219e-01,   4.98783467e-04,   6.39973732e-05],
       [  9.99512944e-01,   4.63790575e-04,   2.32655399e-05]])

In [177]:
y_result = []
magic = 0.64
for i in range(0, len(X_test)):
    if y_pred[i][0] > magic:
        y_result.append(0)
    else:
        y_result.append(1)

In [180]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [179]:
accuracy_score(y_test,y_result)

0.99240780911062909

In [185]:
print metrics.classification_report(y_test,y_result)

             precision    recall  f1-score   support

        0.0       0.99      1.00      1.00       915
        1.0       0.00      0.00      0.00         6
        2.0       0.00      0.00      0.00         1

avg / total       0.98      0.99      0.99       922



In [186]:
np.savetxt('lgb',y_pred,delimiter = ',', fmt = '%0.6f')