In [28]:
import pandas as pd 
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from mlxtend.classifier import StackingClassifier
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from sklearn.model_selection import cross_val_score
from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions

In [29]:
df=pd.read_excel("Stopwords_Removed_Ticket_TypeWithoutClient.xlsx")

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79053 entries, 0 to 79052
Data columns (total 10 columns):
Incident: Number      79053 non-null int64
Opened Date           79053 non-null datetime64[ns]
Queue                 79053 non-null object
Category              79053 non-null object
Client Location       69515 non-null object
Template              47271 non-null object
Request Definition    28023 non-null object
Short Description     79053 non-null object
Ticket_Type           79053 non-null object
Description1          78956 non-null object
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 6.0+ MB


In [31]:
df.dropna(subset=["Description1"],inplace=True)

In [32]:
X_train, X_test, y_train, y_test =  train_test_split(df['Description1'], df['Ticket_Type'], test_size=0.20, random_state=42)

In [33]:
my_vector = CountVectorizer(ngram_range=(1,2))
X_train_counts = my_vector.fit_transform(X_train)

In [34]:
tf_transformer = TfidfTransformer(use_idf=True,).fit(X_train_counts)
X_train= tf_transformer.transform(X_train_counts)

In [35]:
X_test_counts = my_vector.transform(X_test)

In [36]:
X_test = tf_transformer.transform(X_test_counts)

In [37]:
le=preprocessing.LabelEncoder()

In [38]:
le.fit(y_train)

LabelEncoder()

In [40]:
le.fit(y_test)

LabelEncoder()

In [39]:
y_train=le.transform(y_train)

In [41]:
y_test=le.transform(y_test)

In [61]:
clf1 = DecisionTreeClassifier(criterion='entropy',random_state=1)
clf2 = KNeighborsClassifier(n_neighbors=1)    

bagging1 = BaggingClassifier(base_estimator=clf1, n_estimators=10)
bagging2 = BaggingClassifier(base_estimator=clf2, n_estimators=10, max_samples=0.8, max_features=0.8)

In [62]:
bagging1.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [63]:
pred=bagging1.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))

[[9698  572]
 [1162 4360]]
             precision    recall  f1-score   support

          0       0.89      0.94      0.92     10270
          1       0.88      0.79      0.83      5522

avg / total       0.89      0.89      0.89     15792

Accuracy :  0.890197568389


In [None]:
bagging2.fit(X_train,y_train)

In [None]:
pred=bagging2.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))

In [45]:
rand=RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=1)

In [46]:
rand.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [47]:
pred=rand.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))

[[9846  424]
 [1235 4287]]
             precision    recall  f1-score   support

          0       0.89      0.96      0.92     10270
          1       0.91      0.78      0.84      5522

avg / total       0.90      0.89      0.89     15792

Accuracy :  0.894946808511


In [None]:
num_est = map(int, np.linspace(1,100,20))
bg_clf_cv_mean = []
bg_clf_cv_std = []
for n_est in num_est:    
    bg_clf = BaggingClassifier(base_estimator=clf1, n_estimators=n_est, max_samples=0.8, max_features=0.8)
    scores = cross_val_score(bg_clf, X_train, y_train, cv=3, scoring='accuracy')
    bg_clf_cv_mean.append(scores.mean())
    bg_clf_cv_std.append(scores.std())
    print("Accuracy: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), n_est))

In [None]:
label = ['Decision Tree', 'K-NN', 'Bagging Tree', 'Bagging K-NN']
clf_list = [clf1, clf2, bagging1, bagging2]

grid = itertools.product([0,1],repeat=2)

for clf, label, grd in zip(clf_list, label, grid):        
    scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')
    print("Accuracy: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), label))
    clf.fit(X_train, y_train)

In [None]:
pred=clf.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))

In [59]:
pred[0:50]

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 0], dtype=int64)

In [60]:
y_test[0:50]

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0], dtype=int64)