In [263]:
import pandas as pd 
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.decomposition import TruncatedSVD

In [264]:
df=pd.read_excel("cadence.xlsx")

In [265]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 10 columns):
Incident: Number      400 non-null int64
Opened Date           400 non-null datetime64[ns]
Queue                 400 non-null object
Category              400 non-null object
Client Location       378 non-null object
Template              276 non-null object
Request Definition    120 non-null object
Short Description     400 non-null object
Ticket_Type           400 non-null object
Description1          400 non-null object
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 31.3+ KB


In [17]:
# pipeline = Pipeline([
#         ('bow', CountVectorizer(ngram_range=(1,2))),  # strings to token integer counts
#         ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
#         ('c1',LinearSVC()),  
#         ('c2',LogisticRegression()),
#         ('c3',RandomForestClassifier(n_estimators=50)),
#         ('eclf',VotingClassifier(estimators=[('lr', c1), ('rf', c2), ('gnb', c3)], voting='hard')),  
#     ])

In [266]:
X_train, X_test, y_train, y_test =  train_test_split(df['Description1'], df['Ticket_Type'], test_size=0.20, random_state=42)

In [267]:
my_vector = CountVectorizer(ngram_range=(1,2))
X_train_counts = my_vector.fit_transform(X_train)


In [268]:
tf_transformer = TfidfTransformer(use_idf=True,).fit(X_train_counts)
X_train= tf_transformer.transform(X_train_counts)

In [269]:
X_test_counts = my_vector.transform(X_test)


In [270]:
X_test = tf_transformer.transform(X_test_counts)

In [271]:
clf1 = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = svm.SVC(kernel='linear',probability=True)



In [278]:
le=preprocessing.LabelEncoder()

In [279]:
le.fit(y_test)

LabelEncoder()

In [280]:
y_test=le.transform(y_test)


In [287]:
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

In [288]:
eclf1.fit(X_train,y_train)



VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)), ('rf', RandomFo...ar', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [289]:
pred=clf.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))

[[56  2]
 [ 5 17]]
              precision    recall  f1-score   support

           0       0.92      0.97      0.94        58
           1       0.89      0.77      0.83        22

   micro avg       0.91      0.91      0.91        80
   macro avg       0.91      0.87      0.89        80
weighted avg       0.91      0.91      0.91        80

Accuracy :  0.9125


In [295]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import numpy as np


clf1 = LogisticRegression()
clf2 = RandomForestClassifier()
clf3 = svm.SVC(kernel='linear',probability=True)
eclf = VotingClassifier(estimators=[('l1',clf1),('l2',clf2), ('l3',clf3)], voting='hard')

print('5-fold cross validation:\n')

for clf, label in zip([clf1, clf2, clf3, eclf1], ['Logistic Regression', 'Random Forest', 'naive Bayes',"VotingHard"]):
    
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

5-fold cross validation:

Accuracy: 0.78 (+/- 0.03) [Logistic Regression]




Accuracy: 0.83 (+/- 0.03) [Random Forest]
Accuracy: 0.84 (+/- 0.02) [naive Bayes]




Accuracy: 0.83 (+/- 0.01) [VotingHard]


In [296]:
clf.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0])

In [297]:
pred=clf.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))

[[58  0]
 [ 7 15]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94        58
           1       1.00      0.68      0.81        22

   micro avg       0.91      0.91      0.91        80
   macro avg       0.95      0.84      0.88        80
weighted avg       0.92      0.91      0.91        80

Accuracy :  0.9125
