# Load data for topic classification

In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [10]:
data = pd.read_csv("topic_clean.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,agency,agency_short,tag,course,rating,comment,aoi,topic,clean_comment,tokenized,no_stopwords,lemmatized,body_len,vader
0,0,Institute Of Technical Education,ITE,(BDLCC) Basic Digital Literacy: Communication ...,BDLCC1: Communicate & Collaborate with the Rig...,5,collaboration the right tools empower the pers...,n,c,collaboration the right tools empower the pers...,"['collaboration', 'the', 'right', 'tools', 'em...","['collaboration', 'right', 'tools', 'empower',...","['collaboration', 'right', 'tool', 'empower', ...",74,0.0
1,3,Institute Of Technical Education,ITE,(BDLCC) Basic Digital Literacy: Communication ...,BDLCC3: Communicate & Collaborate with Agility,5,Very informative,n,c,Very informative,"['very', 'informative']",['informative'],['informative'],15,0.0
2,4,Central Provident Fund Board,CPF,(BDLCC) Basic Digital Literacy: Communication ...,BDLCC2: Communicate & Collaborate with Etiquette,5,Useful,n,c,Useful,['useful'],['useful'],['useful'],6,0.4404
3,8,National Environment Agency,NEA,(BDLCC) Basic Digital Literacy: Communication ...,BDLCC2: Communicate & Collaborate with Etiquette,4,Good reminder,n,c,Good reminder,"['good', 'reminder', '']","['good', 'reminder', '']","['good', 'reminder', '']",12,0.4404
4,10,Housing Development Board,HDB,(BDLCC) Basic Digital Literacy: Communication ...,BDLCC1: Communicate & Collaborate with the Rig...,4,Very interesting,n,c,Very interesting,"['very', 'interesting']",['interesting'],['interesting'],15,0.4576


# Vectorization

In [12]:
#tf-idf vectorization
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(data['lemmatized'])

# splits into features: body length, vader score and each number represents a single word
# body length and vader score not required here. does not help with topic classification
X_tfidf_feat = pd.concat([pd.DataFrame(X_tfidf.toarray())], axis=1)
X_tfidf_feat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2435,2436,2437,2438,2439,2440,2441,2442,2443,2444
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1980,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Split data

In [13]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [14]:
# split data with X as features and y as the label
X_features = X_tfidf_feat
X_train, X_test, y_train, y_test = train_test_split(X_features, data['topic'], test_size=0.3, shuffle=True,
                                                   random_state=51, stratify=data.topic)

# Test different models

In [15]:
## function for printing results

def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

## 1. Naive Bayes

In [16]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb_model = gnb.fit(X_train, y_train)
y_pred = gnb_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, average='weighted')
    
print('Precision:{} / Recall:{} / Accuracy:{}'.format(round(precision,3), round(recall,3),
    round((y_pred==y_test).sum() / len(y_pred),3)))

Precision:0.632 / Recall:0.638 / Accuracy:0.638


## 2. SVC

In [17]:
from sklearn.svm import SVC

svc = SVC()
parameters = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10],
    'degree': [1,2,3],
    'probability': [True]
}

cv = GridSearchCV(svc, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

BEST PARAMS: {'C': 1, 'degree': 1, 'kernel': 'rbf', 'probability': True}

0.522 (+/-0.038) for {'C': 0.1, 'degree': 1, 'kernel': 'linear', 'probability': True}
0.385 (+/-0.014) for {'C': 0.1, 'degree': 1, 'kernel': 'rbf', 'probability': True}
0.523 (+/-0.038) for {'C': 0.1, 'degree': 1, 'kernel': 'poly', 'probability': True}
0.522 (+/-0.038) for {'C': 0.1, 'degree': 2, 'kernel': 'linear', 'probability': True}
0.385 (+/-0.014) for {'C': 0.1, 'degree': 2, 'kernel': 'rbf', 'probability': True}
0.383 (+/-0.01) for {'C': 0.1, 'degree': 2, 'kernel': 'poly', 'probability': True}
0.522 (+/-0.038) for {'C': 0.1, 'degree': 3, 'kernel': 'linear', 'probability': True}
0.385 (+/-0.014) for {'C': 0.1, 'degree': 3, 'kernel': 'rbf', 'probability': True}
0.376 (+/-0.008) for {'C': 0.1, 'degree': 3, 'kernel': 'poly', 'probability': True}
0.744 (+/-0.044) for {'C': 1, 'degree': 1, 'kernel': 'linear', 'probability': True}
0.753 (+/-0.04) for {'C': 1, 'degree': 1, 'kernel': 'rbf', 'probability': True}
0.74

In [20]:
# 75.3% accuracy
cv.best_estimator_

SVC(C=1, degree=1, probability=True)

In [21]:
Pkl_Filename = "t_SVM_Model.pkl"  
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(cv.best_estimator_, file)

## 3. Log Reg

In [22]:
from sklearn.linear_model import SGDClassifier

lr = SGDClassifier(loss = 'log')
parameters = {'alpha' : [10**(-x) for x in range(7)],
             'penalty' : ['l1', 'l2', 'elasticnet'],
             'l1_ratio' : [0.15, 0.25, 0.5, 0.75]}

cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

BEST PARAMS: {'alpha': 0.0001, 'l1_ratio': 0.15, 'penalty': 'elasticnet'}

0.353 (+/-0.001) for {'alpha': 1, 'l1_ratio': 0.15, 'penalty': 'l1'}
0.353 (+/-0.001) for {'alpha': 1, 'l1_ratio': 0.15, 'penalty': 'l2'}
0.353 (+/-0.001) for {'alpha': 1, 'l1_ratio': 0.15, 'penalty': 'elasticnet'}
0.353 (+/-0.001) for {'alpha': 1, 'l1_ratio': 0.25, 'penalty': 'l1'}
0.353 (+/-0.001) for {'alpha': 1, 'l1_ratio': 0.25, 'penalty': 'l2'}
0.353 (+/-0.001) for {'alpha': 1, 'l1_ratio': 0.25, 'penalty': 'elasticnet'}
0.353 (+/-0.001) for {'alpha': 1, 'l1_ratio': 0.5, 'penalty': 'l1'}
0.353 (+/-0.001) for {'alpha': 1, 'l1_ratio': 0.5, 'penalty': 'l2'}
0.353 (+/-0.001) for {'alpha': 1, 'l1_ratio': 0.5, 'penalty': 'elasticnet'}
0.353 (+/-0.001) for {'alpha': 1, 'l1_ratio': 0.75, 'penalty': 'l1'}
0.374 (+/-0.06) for {'alpha': 1, 'l1_ratio': 0.75, 'penalty': 'l2'}
0.353 (+/-0.001) for {'alpha': 1, 'l1_ratio': 0.75, 'penalty': 'elasticnet'}
0.353 (+/-0.001) for {'alpha': 0.1, 'l1_ratio': 0.15, 'penalty': 'l1'

In [24]:
# 74.6% accuracy
cv.best_estimator_

SGDClassifier(loss='log', penalty='elasticnet')

In [25]:
Pkl_Filename = "t_LR_Model.pkl"  
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(cv.best_estimator_, file)

## 4. Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
param = {'n_estimators': [10,50,100,150,300],
        'max_depth': [10,20,30,40,50,None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_train, y_train)
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
29,9.293742,0.086842,0.169852,0.012526,,300,"{'max_depth': None, 'n_estimators': 300}",0.71223,0.723022,0.723022,0.743682,0.722022,0.724795,0.010282,1
28,5.359812,0.155553,0.112838,0.01358,,150,"{'max_depth': None, 'n_estimators': 150}",0.697842,0.715827,0.701439,0.729242,0.750903,0.71905,0.019436,2
26,1.940761,0.120669,0.093217,0.017026,,50,"{'max_depth': None, 'n_estimators': 50}",0.697842,0.705036,0.697842,0.743682,0.732852,0.715451,0.019123,3
27,3.770647,0.135195,0.106886,0.002267,,100,"{'max_depth': None, 'n_estimators': 100}",0.690647,0.719424,0.705036,0.725632,0.732852,0.714718,0.015121,4
24,8.193935,0.355377,0.186167,0.02002,50.0,300,"{'max_depth': 50, 'n_estimators': 300}",0.701439,0.708633,0.665468,0.750903,0.729242,0.711137,0.028619,5
22,2.803002,0.151702,0.092628,0.009613,50.0,100,"{'max_depth': 50, 'n_estimators': 100}",0.708633,0.697842,0.708633,0.750903,0.685921,0.710386,0.021926,6
12,1.98105,0.028149,0.09207,0.007983,30.0,100,"{'max_depth': 30, 'n_estimators': 100}",0.690647,0.726619,0.669065,0.754513,0.707581,0.709685,0.02937,7
21,1.491142,0.140362,0.079303,0.019588,50.0,50,"{'max_depth': 50, 'n_estimators': 50}",0.683453,0.730216,0.683453,0.740072,0.707581,0.708955,0.023336,8
19,7.391132,0.267207,0.191708,0.027866,40.0,300,"{'max_depth': 40, 'n_estimators': 300}",0.683453,0.701439,0.697842,0.758123,0.693141,0.706799,0.026362,9
14,5.941183,0.192602,0.182174,0.015952,30.0,300,"{'max_depth': 30, 'n_estimators': 300}",0.701439,0.71223,0.679856,0.729242,0.693141,0.703182,0.016787,10


In [27]:
# 72.5% accuracy
final_rf = RandomForestClassifier(n_estimators=300, max_depth=None, n_jobs=-1)
final_rf_model = final_rf.fit(X_train, y_train)

In [28]:
Pkl_Filename = "t_RF_Model.pkl"  
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(final_rf_model, file)

## 5. Gradient Boosting

In [29]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
param = {'n_estimators': [10,50,100,150,300],
        'max_depth': [3,7,11,15],
        'learning_rate': [0.01, 0.1, 1]}

gs = GridSearchCV(gb, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_train, y_train)
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
52,59.239981,1.403495,0.040494,0.004206,1.0,11,100,"{'learning_rate': 1, 'max_depth': 11, 'n_estim...",0.697842,0.723022,0.715827,0.747292,0.693141,0.715425,0.019386,1
59,168.069896,4.573111,0.034342,0.006182,1.0,15,300,"{'learning_rate': 1, 'max_depth': 15, 'n_estim...",0.669065,0.733813,0.733813,0.732852,0.696751,0.713259,0.026285,2
34,269.380734,2.456461,0.087178,0.006982,0.1,11,300,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.705036,0.719424,0.690647,0.754513,0.689531,0.71183,0.023965,3
38,178.247127,0.91567,0.077327,0.008953,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.71223,0.71223,0.694245,0.758123,0.68231,0.711828,0.025784,4
28,88.250301,1.421572,0.058525,0.001672,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.683453,0.726619,0.71223,0.750903,0.67509,0.709659,0.027846,5
54,180.2642,1.157239,0.050577,0.004319,1.0,11,300,"{'learning_rate': 1, 'max_depth': 11, 'n_estim...",0.665468,0.730216,0.723022,0.736462,0.689531,0.70894,0.027111,6
44,78.098086,0.75308,0.060854,0.004074,1.0,3,300,"{'learning_rate': 1, 'max_depth': 3, 'n_estima...",0.665468,0.708633,0.719424,0.740072,0.703971,0.707514,0.024428,7
49,115.954874,1.915515,0.049551,0.005572,1.0,7,300,"{'learning_rate': 1, 'max_depth': 7, 'n_estima...",0.665468,0.733813,0.715827,0.750903,0.67148,0.707498,0.033792,8
57,80.833357,1.026563,0.041055,0.004092,1.0,15,100,"{'learning_rate': 1, 'max_depth': 15, 'n_estim...",0.665468,0.715827,0.715827,0.725632,0.711191,0.706789,0.021191,9
32,88.978435,1.485369,0.072233,0.018036,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.697842,0.705036,0.694245,0.743682,0.693141,0.706789,0.01891,9


In [30]:
# 85.3% accuracy
final_gb = GradientBoostingClassifier(n_estimators=300, max_depth=3, learning_rate=0.1)
final_gb_model = final_gb.fit(X_train, y_train)

In [None]:
Pkl_Filename = "t_GB_Model.pkl"  
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(final_gb_model, file)