In [1]:
import pandas as pd 
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from mlxtend.classifier import StackingClassifier
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from sklearn.model_selection import cross_val_score
from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions
from sklearn.tree import DecisionTreeClassifier

In [2]:
df=pd.read_excel("Stopwords_Removed_Ticket_TypeWithoutClient.xlsx")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79053 entries, 0 to 79052
Data columns (total 10 columns):
Incident: Number      79053 non-null int64
Opened Date           79053 non-null datetime64[ns]
Queue                 79053 non-null object
Category              79053 non-null object
Client Location       69515 non-null object
Template              47271 non-null object
Request Definition    28023 non-null object
Short Description     79053 non-null object
Ticket_Type           79053 non-null object
Description1          78956 non-null object
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 6.0+ MB


In [4]:
df.dropna(subset=["Description1"],inplace=True)

In [5]:
X_train, X_test, y_train, y_test =  train_test_split(df['Description1'], df['Ticket_Type'], test_size=0.20, random_state=42)

In [6]:
my_vector = CountVectorizer(ngram_range=(1,2))
X_train_counts = my_vector.fit_transform(X_train)

In [7]:
tf_transformer = TfidfTransformer(use_idf=True,).fit(X_train_counts)
X_train= tf_transformer.transform(X_train_counts)

In [8]:
X_test_counts = my_vector.transform(X_test)


In [9]:
X_test = tf_transformer.transform(X_test_counts)

In [13]:
le=preprocessing.LabelEncoder()

In [14]:
le.fit(y_train)

LabelEncoder()

In [16]:
le.fit(y_test)

LabelEncoder()

In [15]:
y_train=le.transform(y_train)


In [17]:
y_test=le.transform(y_test)

In [18]:
clf1 = svm.SVC(kernel='linear',probability=True)
clf2 = RandomForestClassifier(n_estimators=10,random_state=1)
clf3 = DecisionTreeClassifier(criterion='entropy')
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

In [19]:
sclf.fit(X_train,y_train)

StackingClassifier(average_probas=False,
          classifiers=[SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False), RandomForestClassifier(bootstrap=True, class_weight=None, c...       min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')],
          meta_classifier=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          store_train_meta_features=False, use_clones=True,
          use_features_in_secondary=False, use_probas=False, verbose=0)

In [20]:
pred=sclf.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))

[[9183 1087]
 [1047 4475]]
             precision    recall  f1-score   support

          0       0.90      0.89      0.90     10270
          1       0.80      0.81      0.81      5522

avg / total       0.87      0.86      0.86     15792

Accuracy :  0.864868287741


In [None]:
sclf.predict(X_test)

In [None]:
label = ['KNN', 'Random Forest', 'Naive Bayes', 'Stacking Classifier']
clf_list = [clf1, clf2, clf3, sclf]

clf_cv_mean = []
clf_cv_std = []
for clf, label in zip(clf_list, label):
        
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Accuracy: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), label))
    clf_cv_mean.append(scores.mean())
    clf_cv_std.append(scores.std())
        
    clf.fit(X_train, y_train)


In [None]:
pred=clf.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))