In [1]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
import itertools
from sklearn.model_selection import cross_val_score

In [2]:
df=pd.read_excel("Stopwords_Removed_Ticket_TypeWithoutClient.xlsx")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79053 entries, 0 to 79052
Data columns (total 10 columns):
Incident: Number      79053 non-null int64
Opened Date           79053 non-null datetime64[ns]
Queue                 79053 non-null object
Category              79053 non-null object
Client Location       69515 non-null object
Template              47271 non-null object
Request Definition    28023 non-null object
Short Description     79053 non-null object
Ticket_Type           79053 non-null object
Description1          78956 non-null object
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 6.0+ MB


In [4]:
df.dropna(subset=["Description1"],inplace=True)

In [5]:
X_train, X_test, y_train, y_test =  train_test_split(df['Description1'], df['Ticket_Type'], test_size=0.20, random_state=42)

In [6]:
my_vector = CountVectorizer(ngram_range=(1,2))
X_train_counts = my_vector.fit_transform(X_train)

In [7]:
tf_transformer = TfidfTransformer(use_idf=True,).fit(X_train_counts)
X_train= tf_transformer.transform(X_train_counts)

In [8]:
X_test_counts = my_vector.transform(X_test)

In [9]:
X_test = tf_transformer.transform(X_test_counts)

In [10]:
le=preprocessing.LabelEncoder()

In [11]:
le.fit(y_train)

LabelEncoder()

In [13]:
le.fit(y_test)

LabelEncoder()

In [12]:
y_train=le.transform(y_train)

In [14]:
y_test=le.transform(y_test)

In [15]:
ad=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy', max_depth=1),n_estimators=10,learning_rate=1)

In [16]:
ad.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1, n_estimators=10, random_state=None)

In [17]:
pred=ad.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))

[[10048   222]
 [ 3523  1999]]
             precision    recall  f1-score   support

          0       0.74      0.98      0.84     10270
          1       0.90      0.36      0.52      5522

avg / total       0.80      0.76      0.73     15792

Accuracy :  0.762854609929


In [18]:
clf=DecisionTreeClassifier(criterion='entropy', max_depth=1)
num_est = [1, 2, 3, 10]
label = ['AdaBoost (n_est=1)', 'AdaBoost (n_est=2)', 'AdaBoost (n_est=3)', 'AdaBoost (n_est=10)']

In [19]:
grid = itertools.product()
bg_clf_cv_mean = []
bg_clf_cv_std = []

for n_est, label, grd in zip(num_est, label, grid):     
    boosting = AdaBoostClassifier(base_estimator=clf, n_estimators=n_est)  
    
    scores = cross_val_score(boosting , X_train, y_train, cv=3, scoring='accuracy')
    
    print("Accuracy: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), label))
    
    boosting.fit(X_train, y_train)
    bg_clf_cv_mean.append(scores.mean())
    bg_clf_cv_std.append(scores.std())


Accuracy: 0.69 (+/- 0.00) [AdaBoost (n_est=1)]


In [20]:
pred=boosting.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))

[[9838  432]
 [4381 1141]]
             precision    recall  f1-score   support

          0       0.69      0.96      0.80     10270
          1       0.73      0.21      0.32      5522

avg / total       0.70      0.70      0.63     15792

Accuracy :  0.695225430598


In [21]:
gb=GradientBoostingClassifier()

In [24]:
gb.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [25]:
pred=gb.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))

[[10012   258]
 [ 2355  3167]]
             precision    recall  f1-score   support

          0       0.81      0.97      0.88     10270
          1       0.92      0.57      0.71      5522

avg / total       0.85      0.83      0.82     15792

Accuracy :  0.834536474164


In [2]:
import lightgbm as lgb