In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

## Load the data

In [2]:
# Location of the raw question dataset (CSV), relative to the working directory.
DATA_PATH = 'juniorMLE_dataset.csv'
# Pass the delimiter by keyword: a positional `sep` was deprecated in
# pandas 1.3 and raises TypeError from pandas 2.0, where it is keyword-only.
question_data = pd.read_csv(DATA_PATH, sep=',')

In [3]:
# Sanity check: display the loaded frame to confirm the CSV was read.
question_data

Unnamed: 0,IsQuestionForCommunity,VerifiedBuyers,VisitsLastYear,QuestionTextLength,ProductGroup1Id,BrandId,CategoryManagementTeamBudgetingGroupId,CategoryManagementTeamId,QuestionText,ProductLifecycleDays,...,VerifiedBuyersLastYear,VerifiedBuyersWithLanguage,VerifiedBuyersWithLanguageAnswersTotalScore,VerifiedBuyersWithLanguageBestAnswers,VerifiedBuyersWithLanguageHaveAnswered,VerifiedBuyersWithLanguageLastMonth,VerifiedBuyersWithLanguageLastYear,VerifiedBuyersWithLanguageTotalAnswers,VerifiedBuyersWithLanguageTotalAnswersAsVerifiedBuyers,VisitsLastMonth
0,1,44,65724,77,1,15151,23,5,Geht die Kamera beim Klingeln an oder auch wen...,166,...,44,30,5,12,11,4,30,48,41,364
1,1,2,18,89,80,16030,18,8,Welche Häkelnadel kann man für dieses Garn bra...,1239,...,0,2,0,1,2,0,0,4,3,6
2,1,38,1324,40,80,778,46,8,Handelt es sich bei der flasche um glas?,117,...,38,33,2,1,2,29,33,11,11,961
3,0,47,145,155,388,378,71,4,Genau nach 3 Jahren habe ich einen defekten Bi...,1356,...,1,37,4,0,5,0,1,5,2,2
4,0,37,3127,139,389,5,56,3,"Überall steht, dass dieser Rechner sehr laut s...",77,...,37,32,4,7,8,10,32,22,22,1576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72427,1,251,5432,8,80,1145,46,8,Made in?,367,...,251,145,13,11,17,18,145,64,49,763
72428,1,116,8540,171,389,4,57,10,Guten Abend. Wie lang sind die Schläuche? Dies...,378,...,116,86,28,21,17,23,86,100,89,1545
72429,1,3024,4269,124,29,4771,17,11,Mir ist nicht klar was ich für 8.10 CHF erhalt...,1728,...,1525,2185,789,482,448,43,989,2305,1345,202
72430,1,59,5261,227,389,11473,31,10,Ist der tisch tatsächlich 82cm hoch?? Oder ist...,699,...,44,43,4,4,12,11,31,25,21,1434


In [4]:
# Name of the target column; it is dropped from the features and used as y below.
LABEL = 'IsQuestionForCommunity'

## One hot encoding

In [5]:
def get_one_hot(df, column):
    """Return indicator (one-hot) columns for a single column of ``df``.

    Args:
        df: Source DataFrame.
        column: Name of the column to encode.

    Returns:
        A DataFrame with one 0/1 column per distinct value, each named
        ``<column>_<value>``.
    """
    # Pin the dtype: pandas >= 2.0 defaults to bool, while earlier versions
    # produced uint8 (the 0/1 values shown in this notebook's output).
    return pd.get_dummies(df[column], prefix=column, dtype='uint8')


def to_one_hot(df, columns):
    """Replace the given columns of ``df`` with their one-hot encodings.

    Args:
        df: Source DataFrame; it is not modified.
        columns: List of column names to encode.

    Returns:
        A new DataFrame holding the remaining columns of ``df`` followed by
        the indicator columns for each entry of ``columns``, in order.
    """
    one_hot_dfs = [get_one_hot(df, column) for column in columns]
    # `columns=` keyword instead of a positional axis: every argument of
    # DataFrame.drop after `labels` is keyword-only from pandas 2.0.
    dfs = [df.drop(columns=columns)] + one_hot_dfs
    return pd.concat(dfs, axis=1)

# Expand the categorical ID columns into indicator columns, then preview
# the first rows of the widened frame.
question_data_one_hot = to_one_hot(
    question_data,
    [
        'ProductGroup1Id',
        'BrandId',
        'CategoryManagementTeamBudgetingGroupId',
        'CategoryManagementTeamId',
    ],
)
question_data_one_hot.head()

Unnamed: 0,IsQuestionForCommunity,VerifiedBuyers,VisitsLastYear,QuestionTextLength,QuestionText,ProductLifecycleDays,ProductQuestions,ProductQuestionsFractionAnswered,ProductQuestionsFractionAnsweredWithinWeek,ProductQuestionsHaveNonEmployeeAnswers,...,CategoryManagementTeamId_5,CategoryManagementTeamId_6,CategoryManagementTeamId_7,CategoryManagementTeamId_8,CategoryManagementTeamId_9,CategoryManagementTeamId_10,CategoryManagementTeamId_11,CategoryManagementTeamId_12,CategoryManagementTeamId_13,CategoryManagementTeamId_14
0,1,44,65724,77,Geht die Kamera beim Klingeln an oder auch wen...,166,1,1.0,1.0,1,...,1,0,0,0,0,0,0,0,0,0
1,1,2,18,89,Welche Häkelnadel kann man für dieses Garn bra...,1239,0,0.0,0.0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,38,1324,40,Handelt es sich bei der flasche um glas?,117,0,0.0,0.0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,47,145,155,Genau nach 3 Jahren habe ich einen defekten Bi...,1356,2,0.5,0.5,1,...,0,0,0,0,0,0,0,0,0,0
4,0,37,3127,139,"Überall steht, dass dieser Rechner sehr laut s...",77,1,1.0,1.0,1,...,0,0,0,0,0,0,0,0,0,0


## Decision Tree Classifier

In [6]:
# Feature matrix: every column except the target and the raw question text.
# `columns=` keyword instead of a positional axis: every argument of
# DataFrame.drop after `labels` is keyword-only from pandas 2.0.
# NOTE(review): features come from `question_data`, not `question_data_one_hot`,
# so the one-hot columns built above are unused and the ID columns enter the
# models as plain integers. Confirm this is intended.
# NOTE(review): the 'Unnamed: 0' column seen in the loaded frame is kept as a
# feature; it looks like a saved row index. Confirm whether it should be dropped.
data_X = question_data.drop(columns=[LABEL, 'QuestionText']).to_numpy()
# Target vector as a 1-D array.
data_y = question_data[LABEL].to_numpy()
# 80/20 split, stratified on the label, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    data_X,
    data_y,
    test_size=0.2,
    stratify=data_y,
    shuffle=True,
    random_state=37,
)

In [7]:
# Tree with depth capped at 5 and a fixed seed so the fit is repeatable.
clf = DecisionTreeClassifier(
    max_depth=5,
    random_state=37,
)
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5, random_state=37)

In [8]:
# Score the fitted classifier on the training split and display the value.
accuracy_train = clf.score(X_train, y_train)
accuracy_train

0.775632064889119

In [9]:
# Score the fitted classifier on the held-out test split and display the value.
accuracy_test = clf.score(X_test, y_test)
accuracy_test

0.7713122109477463

## SVM Classifier

In [None]:
# Pipeline: standardize the features, then fit an RBF-kernel SVC.
# This rebinds `clf`, replacing the decision tree fitted earlier.
clf = make_pipeline(
    StandardScaler(),
    svm.SVC(kernel='rbf', random_state=37),
)
clf.fit(X_train, y_train)

In [None]:
# Score the SVM pipeline on the training split (overwrites the tree's value).
accuracy_train = clf.score(X_train, y_train)
accuracy_train

In [None]:
# Score the SVM pipeline on the held-out test split (overwrites the tree's value).
accuracy_test = clf.score(X_test, y_test)
accuracy_test