In [1]:
import pandas as pd 
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [2]:
df=pd.read_excel('Stopwords_Removed_Ticket_TypeWithoutClient.xlsx',sheet_name="Sheet1",encoding='utf8')

In [3]:
df['Ticket_Type'].value_counts()

Incident    51030
Request     28023
Name: Ticket_Type, dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79053 entries, 0 to 79052
Data columns (total 10 columns):
Incident: Number      79053 non-null int64
Opened Date           79053 non-null datetime64[ns]
Queue                 79053 non-null object
Category              79053 non-null object
Client Location       69515 non-null object
Template              47271 non-null object
Request Definition    28023 non-null object
Short Description     79053 non-null object
Ticket_Type           79053 non-null object
Description1          78956 non-null object
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 6.0+ MB


In [5]:
df.dropna(subset=["Description1"],inplace=True)

In [None]:
pipeline = Pipeline([
        ('bow', CountVectorizer(ngram_range=(1,2))),  # strings to token integer counts
        ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
        #('Normalize', preprocessing.Normalizer()),
        ('c',LinearSVC()),  
    ])

In [6]:
pipeline = Pipeline([
        ('bow', CountVectorizer(ngram_range=(1,2))),  # strings to token integer counts
        ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
        #('Normalize', preprocessing.Normalizer()),
        ('c',svm.SVC()),  
    ])

In [None]:
pipeline = Pipeline([
        ('bow', CountVectorizer(ngram_range=(1,2))),  # strings to token integer counts
        ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
        #('Normalize', preprocessing.Normalizer()),
        ('c',LogisticRegression()),  
    ])

In [None]:
X_train, X_test, y_train, y_test =  train_test_split(df['Description1'], df['Ticket_Type'], test_size=0.20, random_state=42)

In [None]:
A_train, A_test, b_train, b_test =  train_test_split(df['Short Description'], df['Request Definition'], test_size=0.20, random_state=42)

In [None]:
M_train, M_test, n_train, n_test =  train_test_split(df['Short Description'], df['Template'], test_size=0.20, random_state=42)

In [7]:
X_train=df['Description1']
y_train=df['Ticket_Type']

In [None]:
pipeline.fit(X_train,y_train)

In [None]:
pred=pipeline.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy : ",accuracy_score(pred,y_test))

In [8]:
param_grid = {'c__probability':[True,False],'c__degree':[3,4,5],'c__kernel':['linear',],'c__C': [0.1,1, 10, 100, 1000], 'c__gamma':[0.1,1, 10, 100, 1000]} 

In [9]:
grid_search = GridSearchCV(pipeline,param_grid,refit=True, verbose=3)

In [None]:

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
[CV] c__C=0.1, c__degree=3, c__gamma=0.1, c__kernel=linear, c__probability=True 
[CV]  c__C=0.1, c__degree=3, c__gamma=0.1, c__kernel=linear, c__probability=True, score=0.8924012158054712, total=45.5min
[CV] c__C=0.1, c__degree=3, c__gamma=0.1, c__kernel=linear, c__probability=True 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 49.2min remaining:    0.0s


[CV]  c__C=0.1, c__degree=3, c__gamma=0.1, c__kernel=linear, c__probability=True, score=0.8191731894520861, total=48.3min
[CV] c__C=0.1, c__degree=3, c__gamma=0.1, c__kernel=linear, c__probability=True 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 101.1min remaining:    0.0s


[CV]  c__C=0.1, c__degree=3, c__gamma=0.1, c__kernel=linear, c__probability=True, score=0.7826202598981685, total=45.0min
[CV] c__C=0.1, c__degree=3, c__gamma=0.1, c__kernel=linear, c__probability=False 
[CV]  c__C=0.1, c__degree=3, c__gamma=0.1, c__kernel=linear, c__probability=False, score=0.8924012158054712, total=11.3min
[CV] c__C=0.1, c__degree=3, c__gamma=0.1, c__kernel=linear, c__probability=False 
[CV]  c__C=0.1, c__degree=3, c__gamma=0.1, c__kernel=linear, c__probability=False, score=0.8191731894520861, total= 9.4min
[CV] c__C=0.1, c__degree=3, c__gamma=0.1, c__kernel=linear, c__probability=False 
[CV]  c__C=0.1, c__degree=3, c__gamma=0.1, c__kernel=linear, c__probability=False, score=0.7826202598981685, total= 9.1min
[CV] c__C=0.1, c__degree=3, c__gamma=1, c__kernel=linear, c__probability=True 
[CV]  c__C=0.1, c__degree=3, c__gamma=1, c__kernel=linear, c__probability=True, score=0.8924012158054712, total=43.3min
[CV] c__C=0.1, c__degree=3, c__gamma=1, c__kernel=linear, c__pro

[CV]  c__C=0.1, c__degree=4, c__gamma=10, c__kernel=linear, c__probability=True, score=0.8191731894520861, total=38.4min
[CV] c__C=0.1, c__degree=4, c__gamma=10, c__kernel=linear, c__probability=True 
[CV]  c__C=0.1, c__degree=4, c__gamma=10, c__kernel=linear, c__probability=True, score=0.7826202598981685, total=37.3min
[CV] c__C=0.1, c__degree=4, c__gamma=10, c__kernel=linear, c__probability=False 
[CV]  c__C=0.1, c__degree=4, c__gamma=10, c__kernel=linear, c__probability=False, score=0.8924012158054712, total= 8.0min
[CV] c__C=0.1, c__degree=4, c__gamma=10, c__kernel=linear, c__probability=False 
[CV]  c__C=0.1, c__degree=4, c__gamma=10, c__kernel=linear, c__probability=False, score=0.8191731894520861, total= 9.4min
[CV] c__C=0.1, c__degree=4, c__gamma=10, c__kernel=linear, c__probability=False 
[CV]  c__C=0.1, c__degree=4, c__gamma=10, c__kernel=linear, c__probability=False, score=0.7826202598981685, total= 8.0min
[CV] c__C=0.1, c__degree=4, c__gamma=100, c__kernel=linear, c__probab

[CV]  c__C=0.1, c__degree=5, c__gamma=1000, c__kernel=linear, c__probability=True, score=0.8924012158054712, total=37.9min
[CV] c__C=0.1, c__degree=5, c__gamma=1000, c__kernel=linear, c__probability=True 
[CV]  c__C=0.1, c__degree=5, c__gamma=1000, c__kernel=linear, c__probability=True, score=0.8191731894520861, total=38.9min
[CV] c__C=0.1, c__degree=5, c__gamma=1000, c__kernel=linear, c__probability=True 
[CV]  c__C=0.1, c__degree=5, c__gamma=1000, c__kernel=linear, c__probability=True, score=0.7826202598981685, total=37.6min
[CV] c__C=0.1, c__degree=5, c__gamma=1000, c__kernel=linear, c__probability=False 
[CV]  c__C=0.1, c__degree=5, c__gamma=1000, c__kernel=linear, c__probability=False, score=0.8924012158054712, total= 7.9min
[CV] c__C=0.1, c__degree=5, c__gamma=1000, c__kernel=linear, c__probability=False 
[CV]  c__C=0.1, c__degree=5, c__gamma=1000, c__kernel=linear, c__probability=False, score=0.8191731894520861, total= 8.4min
[CV] c__C=0.1, c__degree=5, c__gamma=1000, c__kernel=

In [None]:
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
X_test.shape

In [None]:
y_test.shape

In [None]:
import joblib
joblib.dump(pipeline,open('Ticket_Type_SVM_Model_withoutClient_15march.psm','wb'))

In [None]:
prob =pipeline.predict_proba(X_test)

#Convert to list
y_list = y_test.values.tolist()
X_list = X_test.values.tolist()

prob = prob.tolist()

In [None]:
conf = [max(prob[i]) for i in range(len(prob))]

In [None]:
act_data= A_test.values.tolist()
# act_data=df1['Description']
cat=n_test.values.tolist()

In [None]:
#Create Final DataFrame
d = { 'Prediction_Template': pred,'Category':cat,'Orignal_Description':act_data,'Clean_Description':X_test}
y_df = pd.DataFrame(d)
y_df.info()

In [None]:
y_df

In [None]:
y_df["Result"] = y_df["Actual_Template"] == y_df["Prediction_Template"]

In [None]:
writer = pd.ExcelWriter('Results_Ticket_Type.xlsx')
y_df.to_excel(writer,"Sheet1")
writer.save()

In [None]:
len(y_df[y_df["Result"] == True])/len(y_df)

In [None]:
abovecf = y_df[y_df["Confidence"] >=0.1].copy()

In [None]:
len(abovecf[abovecf["Result"] == True])/len(abovecf)

In [None]:
allrecords = []
truerecords = []
falserecords = []
accuracy = []

In [None]:
for i in range(0,10):
    i = i * 0.10
    abovecf = y_df[y_df["Confidence"] >= i].copy()
    allrecords.append(len(abovecf))
    truerecords.append(len(abovecf[abovecf["Result"] == True]))
    falserecords.append(len(abovecf[abovecf["Result"] == False]))
    accuracy.append(len(abovecf[abovecf["Result"] == True])/len(abovecf))

In [None]:
d = {'Tickets count above confidence Threshold': allrecords,'Correctly predicted tickets': truerecords, 'Incorrectly  predicted tickets': falserecords ,'Accuracy after Threshhold':accuracy,'Threeshold' : [x * 0.1 for x in range(0, 10)]}
conf_data = pd.DataFrame(d)
conf_data.info()

In [None]:
conf_data["Overall Tickets for Testing"] = len(y_df)

In [None]:
conf_data["Tickets count routed  to manual Queue"] =conf_data["Overall Tickets for Testing"] - conf_data["Tickets count above confidence Threshold"]

In [None]:
writer = pd.ExcelWriter('Result_conf_WithoutStem.xlsx')
conf_data.to_excel(writer,"Sheet1")
writer.save()

In [None]:
cm=confusion_matrix(y_test,pred)

In [None]:
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
plt.plot(cm)

In [None]:
cr=classification_report(y_test,pred)

In [None]:
file=open("Classification_Report_AllCategoryExcept20 .xlsx","w")

In [None]:
file.writelines([cr])

In [None]:
a=pd.DataFrame(cm)#index=pipeline.classes_,columns=pipeline.classes_

In [None]:
a.to_excel("Confusion_matrix_AllCategoryExcept20.xlsx")