In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lda
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack, csr_matrix, coo_matrix
from nltk import classify
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from stemming.porter2 import stem

In [2]:
#clean up strings by replacing contractions, similar words

def cleanstrings(x):
    x = str(x)
    x=x.lower()
    in_strings = ["can't", "couldn't", "hasn't", "haven't", "wasn't", "weren't", "won't", "didn't", "doesn't", "don't", "'ve", "i'm"
                  , "sign-in", "sign in", "sing-on", "signon", "sign on", "365"]
    replace_strings = ["cannot", "could not", "has not", "have not", "was not", "were not", "will not", "did not", "does not",
                       "do not", " have", "I am", "signin", "signin" , "signin", "signin", "signin", "threesixtyfive"]
    for i in range(len(in_strings)):
        x = x.replace(in_strings[i], replace_strings[i])
    x = re.sub("[0-9!@#$%^&*()-_=+{[}]|\:;,.<>?/]", " ", x)
    #replace multiple spaces with one space
    x = x.replace(' +', " ")
    #stem words
    x = [" ".join(stem(word) for word in x.split(" "))]
    return x[0]

In [3]:
cleandata = 0 # if cleandata = 1, then read in raw data and clean it, otherwise read stored data

In [4]:
if cleandata==0:
    dat = pd.read_csv("E:/oneml-office365-support/TrainCleaned.csv", engine='python')
    test_dat = pd.read_csv("E:/oneml-office365-support/TestCleaned.csv", engine='python')
else:
    dat = pd.read_table('E:/oneml-office365-support/TrainingData.tsv', encoding='utf-8-sig')
    test_dat = pd.read_table('E:/oneml-office365-support/TestingData.tsv', encoding='utf-8-sig')  
    #run clean strings on ist fields, title, problem and error columns
    dat['IST_1'] = dat.IST_1.apply(cleanstrings)
    dat['IST_2'] = dat.IST_2.apply(cleanstrings)
    dat['IST_3'] = dat.IST_3.apply(cleanstrings)
    dat['title'] = dat.title.apply(cleanstrings)
    dat['problem'] = dat.problem.apply(cleanstrings)
    dat['error'] = dat.error.apply(cleanstrings)
    #run clean strings on ist fields, title, problem and error columns
    test_dat['IST_1'] = test_dat.IST_1.apply(cleanstrings)
    test_dat['IST_2'] = test_dat.IST_2.apply(cleanstrings)
    test_dat['IST_3'] = test_dat.IST_3.apply(cleanstrings)
    test_dat['title'] = test_dat.title.apply(cleanstrings)
    test_dat['problem'] = test_dat.problem.apply(cleanstrings)
    test_dat['error'] = test_dat.error.apply(cleanstrings)
    dat.to_csv("E:/oneml-office365-support/TrainCleaned.csv", index=False)
    test_dat.to_csv("E:/oneml-office365-support/TestCleaned.csv", index=False)
        

In [48]:
#part of speech tagging
postag = 1

In [None]:
'''if postag == 0:
    dat_pos = pd.read_csv("E:/oneml-office365-support/TrainPOS.csv", engine='python')
    test_pos = pd.read_csv("E:/oneml-office365-support/TestPOS.csv", engine='python')
else:
    dat = pd.read_table('E:/oneml-office365-support/TrainingData.tsv', encoding='utf-8-sig')
    test_dat = pd.read_table('E:/oneml-office365-support/TestingData.tsv', encoding='utf-8-sig')
    '''

In [5]:
#combine CST_1 and CST_2 into a label column
def getlabel(cols):
    return cols.CST_1 + '||' + cols.CST_2

dat['label'] = dat.apply(getlabel, axis=1)
#test_dat['label'] = dat.apply(getlabel, axis=1)

In [6]:
#create a dictionary to convert labels to numbers 
labeldf = dict(zip(dat.label.unique(), np.arange(len(dat.label.unique()))))

In [7]:
dat['labelind'] = [labeldf[x] for x in dat.label.values]

In [8]:
#create dictionaries from CST fields (note these aren't used later on)
cst1_df = dict(zip(dat.CST_1.unique(), np.arange(len(dat.CST_1.unique()))))
cst2_df = dict(zip(dat.CST_2.unique(), np.arange(len(dat.CST_2.unique()))))

In [9]:
#create features from the IST fields (these aren't used later on)
ist1_df = dict(zip(dat.IST_1.unique(), np.arange(len(dat.IST_1.unique()))))
ist2_df = dict(zip(dat.IST_2.unique(), np.arange(len(dat.IST_2.unique()))))
ist3_df = dict(zip(dat.IST_3.unique(), np.arange(len(dat.IST_3.unique()))))

In [10]:
dat['ist1'] = [ist1_df[x] for x in dat.IST_1.values]
dat['ist2'] = [ist2_df[x] for x in dat.IST_2.values]
dat['ist3'] = [ist3_df[x] for x in dat.IST_3.values]

In [11]:
dat['cst1'] = [cst1_df[x] for x in dat.CST_1.values]
dat['cst2'] = [cst2_df[x] for x in dat.CST_2.values]

In [12]:
# some problem fields are NaN. Replace those with empty strings
dat.problem = dat.problem.fillna('')
test_dat.problem = test_dat.problem.fillna('')

#### Get first sentence of 'problem' field and create features just based off of that

In [13]:
def firstsentence(x):
    return x.split('.')[0]

In [14]:
dat['first_sentence'] = dat.problem.apply(firstsentence)
test_dat['first_sentence'] = test_dat.problem.apply(firstsentence)

###Train logistic regression model

In [15]:
trainind = pd.read_csv("E:/oneml-office365-support/trainind.csv", header=None)
trainind = trainind.ix[:,0].values

In [16]:
#trainind = np.random.rand(len(dat)) < 0.7
train = dat[trainind]
test = dat[~trainind]

In [17]:
#pd.Series(trainind).to_csv("E:/oneml-office365-support/trainind.csv", index=False)

In [79]:
train.columns

Index(['CST_1', 'CST_2', 'IST_1', 'IST_2', 'IST_3', 'title', 'problem', 'error', 'SRId', 'label', 'labelind', 'ist1', 'ist2', 'ist3', 'cst1', 'cst2', 'first_sentence'], dtype='object')

In [80]:
#transform the data using TFidf. This function takes in a training set and a testing set. The training set is used to fit the TF
# IDF vectorizer, and then it is applied to the test set fields. Both train and test results are outputted

def tfidftransform(train, test, min_df=0.0005):
    #cv = TfidfVectorizer(stop_words='english', min_df=0.0005)
    cv = TfidfVectorizer(max_df=0.5 min_df=0.0005)
    cv.fit(train.label)
    #
    probmat = cv.transform(train.problem.fillna(''))
    probmat_test = cv.transform(test.problem.fillna(''))
    #
    probmat2= cv.fit_transform(train.problem.fillna(''))
    probmat2_test = cv.transform(test.problem.fillna(''))
    #
    #first = cv.fit_transform(train.first_sentence)
    #first_test = cv.transform(test.first_sentence)
    first = cv.fit_transform(train.problem)
    first_test = cv.transform(test.problem)
    #
    ist_1 = cv.fit_transform(train.IST_1.fillna(''))
    ist_1_test = cv.transform(test.IST_1.fillna(''))
    #
    ist_2 = cv.fit_transform(train.IST_2.fillna(''))
    ist_2_test = cv.transform(test.IST_2.fillna(''))
    #
    ist_3 = cv.fit_transform(train.IST_3.fillna(''))
    ist_3_test = cv.transform(test.IST_3.fillna(''))
    #
    title = cv.fit_transform(train.title.fillna(''))
    title_test = cv.transform(test.title.fillna(''))
    #
    error = cv.fit_transform(train.error.fillna(''))
    error_test = cv.transform(test.error.fillna(''))
    X = hstack((probmat, probmat2, first, ist_1, ist_2, ist_3, title, error))
    X_test = hstack((probmat_test, probmat2_test, first_test, ist_1_test, ist_2_test, ist_3_test, title_test, error_test))
    return X, X_test



SyntaxError: invalid syntax (<ipython-input-80-d1b74dd4d42d>, line 6)

In [81]:
X_train, X_test = tfidftransform(train, test)

In [82]:
y = train.label
y_test = test.label

In [83]:
# loop over regularization constants to tune logistic regression
#reg_const = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5]#, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
#reg_const = [0.4]
#for i in reg_const:
#    model = LogisticRegression(C=i)
#    model.fit(X,y)
#    train_acc = model.score(X, y)
#    cv_acc = model.score(X_test, y_test)
#    print(i, train_acc, cv_acc)

In [84]:
#predictions for CV with tuned regularization parameter
model = LogisticRegression(C=0.4)
model.fit(X_train,y)
train_acc = model.score(X_train, y)
cv_acc = model.score(X_test, y_test)
print(train_acc, cv_acc)
dm_preds = model.predict(X_test)

0.689826703056 0.629892798665


In [85]:
pd.Series(dm_preds).to_csv("E:/oneml-office365-support/dm_preds.csv", index=False)

In [86]:
#dm_preds = pd.read_csv("E:/oneml-office365-support/dm_preds.csv", header=None)
#dm_preds = dm_preds.ix[:,1].values
#dm_preds_new = [labeldf[x] for x in dm_preds]

In [87]:
#pd.Series(dm_preds_new).to_csv("E:/oneml-office365-support/dm_preds_new.csv", index=False)

###train on total training set

In [88]:
X, X_out = tfidftransform(dat, test_dat)
y = dat.label

In [89]:
model = LogisticRegression(C=0.4)
model.fit(X,y)
train_acc = model.score(X, y)
print(train_acc)
final_dm_preds = model.predict(X_out)

0.684533957845


In [90]:
def splitlabel(x):
    return x.split("||")

final_dm_labels = pd.Series(final_dm_preds).apply(splitlabel)

In [91]:
finaldf = pd.DataFrame()
finaldf['SRId'] = test_dat.SRId
cst1 = []
cst2 = []
for i in range(len(final_dm_labels)):
    cst1.append(final_dm_labels[i][0])
    cst2.append(final_dm_labels[i][1])
finaldf['CST_1'] = cst1
finaldf['CST_2'] = cst2

In [92]:
finaldf.to_csv("E:/oneml-office365-support/submission_codalab_09_23.tsv", index=False, sep='\t')

#### Try heiarchical clustering

In [93]:
#first train model to predict CST_1
y_1 = train.CST_1
y_1_test = test.CST_1
model = LogisticRegression(C=0.4)
model.fit(X_train, y_1)
train_acc = np.mean(model.predict(X_train)==y_1)
cv_acc = np.mean(model.predict(X_test)==y_1_test)
print(train_acc, cv_acc)

0.909748457267 0.898822947882


In [94]:
pred_class_1 = model.predict(X_test)

In [95]:
pd.Series(pred_class_1).to_csv("E:/oneml-office365-support/pred_class_1.csv")

In [96]:
X_csr = X_train.tocsr()
X_csr_test = X_test.tocsr()

In [97]:
#Train a model for each CST_1
model = LogisticRegression(C=0.4)
unique_cst_1 = dat.CST_1.unique()
tot_correct = 0
hm_preds = np.zeros(len(test))
for cst in unique_cst_1:
    index = np.where(train.CST_1==cst)[0]
    index2 = np.where(pred_class_1 == cst)[0]
    y_1 = train.labelind.values[index]
    y_1_test = test.labelind.values[index2]
    model.fit(X_csr[index], y_1)
    cv_acc = model.score(X_csr_test[index2], y_1_test)
    hm_preds[index2] = model.predict(X_csr_test[index2])
    tot_correct += cv_acc*len(index2)
    print(cst, cv_acc, len(index2))

O365 User and Domain Mgmt 0.718057022175 4735
Exchange Online 0.574814535395 15906
Office Pro Plus for O365 0.709060213844 1777
SharePoint Online 0.589852880517 7273
Lync and Skype Online 0.820388349515 2678


In [98]:
#pd.Series(hm_preds).to_csv("E:/oneml-office365-support/hm_preds.csv")

In [99]:
tot_correct/len(test)

0.62683431678457779

In [100]:
unique_cst_1

array(['O365 User and Domain Mgmt', 'Exchange Online',
       'Office Pro Plus for O365', 'SharePoint Online',
       'Lync and Skype Online'], dtype=object)

In [101]:
dm_preds_new = np.array([labeldf[x] for x in dm_preds])

In [102]:
final_preds = np.zeros(len(hm_preds))

In [103]:
replace_index = (pred_class_1 == "Exchange Online") | (pred_class_1 == "SharePoint Online")

In [104]:
final_preds[replace_index] = dm_preds_new[replace_index]

In [105]:
final_preds[~replace_index] = hm_preds[~replace_index]

In [106]:
np.mean(final_preds==test.labelind)

0.63075782384380119

In [107]:
np.mean(hm_preds==test.labelind)

0.62683431678457779

In [108]:
np.mean(dm_preds_new==test.labelind)

0.62989279866538972