# Load data for AOI classification

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [2]:
data = pd.read_csv("aoi_clean.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,agency,agency_short,tag,course,rating,comment,aoi,topic,clean_comment,tokenized,no_stopwords,lemmatized,body_len,vader
0,0,Institute Of Technical Education,ITE,(BDLCC) Basic Digital Literacy: Communication ...,BDLCC1: Communicate & Collaborate with the Rig...,5,collaboration the right tools empower the pers...,n,c,collaboration the right tools empower the pers...,"['collaboration', 'the', 'right', 'tools', 'em...","['collaboration', 'right', 'tools', 'empower',...","['collaboration', 'right', 'tool', 'empower', ...",74,0.0
1,1,National Environment Agency,NEA,(BDLCC) Basic Digital Literacy: Communication ...,BDLCC1: Communicate & Collaborate with the Rig...,5,Nil,n,,Nil,"['nil', '']","['nil', '']","['nil', '']",3,0.0
2,2,Health Sciences Authority,HSA,(BDLCC) Basic Digital Literacy: Communication ...,BDLCC1: Communicate & Collaborate with the Rig...,5,Good,n,,Good,"['good', '']","['good', '']","['good', '']",4,0.4404
3,3,Institute Of Technical Education,ITE,(BDLCC) Basic Digital Literacy: Communication ...,BDLCC3: Communicate & Collaborate with Agility,5,Very informative,n,c,Very informative,"['very', 'informative']",['informative'],['informative'],15,0.0
4,4,Central Provident Fund Board,CPF,(BDLCC) Basic Digital Literacy: Communication ...,BDLCC2: Communicate & Collaborate with Etiquette,5,Useful,n,c,Useful,['useful'],['useful'],['useful'],6,0.4404


# Perform vectorizing on data

In [3]:
#tf-idf vectorization
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(data['lemmatized'])

# splits into features: body length, vader score and each number represents a single word
X_tfidf_feat = pd.concat([data['body_len'], data['vader'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_tfidf_feat

Unnamed: 0,body_len,vader,0,1,2,3,4,5,6,7,...,2367,2368,2369,2370,2371,2372,2373,2374,2375,2376
0,74,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,3,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,4,0.4404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,15,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,6,0.4404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2095,80,-0.6478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2096,242,0.6908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2097,35,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.436504,0.0,0.0,0.0,0.0,0.0,0.0
2098,22,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


# Split data into training & validation set

In [4]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [5]:
# split data with X as features and y as the label
X_features = X_tfidf_feat
X_train, X_test, y_train, y_test = train_test_split(X_features, data['aoi'], test_size=0.3, shuffle=True,
                                                   random_state=51, stratify=data.aoi)

# Test different models

In [6]:
## function for printing results

def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

## 1. Naive Bayes

In [6]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb_model = gnb.fit(X_train, y_train)
y_pred = gnb_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, average='weighted')
    
print('Precision:{} / Recall:{} / Accuracy:{}'.format(round(precision,3), round(recall,3),
    round((y_pred==y_test).sum() / len(y_pred),3)))

Precision:0.814 / Recall:0.811 / Accuracy:0.811


## 2. Support Vector Classification

In [None]:
from sklearn.svm import SVC

svc = SVC()
parameters = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10],
    'degree': [1,2,3],
    'probability': [True]
}

cv = GridSearchCV(svc, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

In [10]:
# 87.3% accuracy
cv.best_estimator_

SVC(C=1, degree=1, kernel='linear', probability=True)

In [11]:
Pkl_Filename = "SVM_Model.pkl"  
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(cv.best_estimator_, file)

## 3. Log Reg

In [12]:
from sklearn.linear_model import SGDClassifier

lr = SGDClassifier(loss = 'log')
parameters = {'alpha' : [10**(-x) for x in range(7)],
             'penalty' : ['l1', 'l2', 'elasticnet'],
             'l1_ratio' : [0.15, 0.25, 0.5, 0.75]}

cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

BEST PARAMS: {'alpha': 0.001, 'l1_ratio': 0.25, 'penalty': 'l1'}

0.616 (+/-0.07) for {'alpha': 1, 'l1_ratio': 0.15, 'penalty': 'l1'}
0.638 (+/-0.078) for {'alpha': 1, 'l1_ratio': 0.15, 'penalty': 'l2'}
0.616 (+/-0.069) for {'alpha': 1, 'l1_ratio': 0.15, 'penalty': 'elasticnet'}
0.621 (+/-0.058) for {'alpha': 1, 'l1_ratio': 0.25, 'penalty': 'l1'}
0.631 (+/-0.073) for {'alpha': 1, 'l1_ratio': 0.25, 'penalty': 'l2'}
0.611 (+/-0.073) for {'alpha': 1, 'l1_ratio': 0.25, 'penalty': 'elasticnet'}
0.61 (+/-0.046) for {'alpha': 1, 'l1_ratio': 0.5, 'penalty': 'l1'}
0.642 (+/-0.093) for {'alpha': 1, 'l1_ratio': 0.5, 'penalty': 'l2'}
0.596 (+/-0.059) for {'alpha': 1, 'l1_ratio': 0.5, 'penalty': 'elasticnet'}
0.594 (+/-0.064) for {'alpha': 1, 'l1_ratio': 0.75, 'penalty': 'l1'}
0.641 (+/-0.068) for {'alpha': 1, 'l1_ratio': 0.75, 'penalty': 'l2'}
0.626 (+/-0.048) for {'alpha': 1, 'l1_ratio': 0.75, 'penalty': 'elasticnet'}
0.61 (+/-0.053) for {'alpha': 0.1, 'l1_ratio': 0.15, 'penalty': 'l1'}
0.635 (+/

In [13]:
# 79.7% accuracy
cv.best_estimator_

SGDClassifier(alpha=0.001, l1_ratio=0.25, loss='log', penalty='l1')

In [14]:
Pkl_Filename = "LR_Model.pkl"  
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(cv.best_estimator_, file)

## 4. Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
param = {'n_estimators': [10,50,100,150,300],
        'max_depth': [10,20,30,40,50,None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_train, y_train)
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
28,3.117449,0.19438,0.073158,0.004523,,150,"{'max_depth': None, 'n_estimators': 150}",0.87415,0.860544,0.863946,0.867347,0.853741,0.863946,0.006803,1
26,1.125906,0.051137,0.049969,0.004443,,50,"{'max_depth': None, 'n_estimators': 50}",0.863946,0.870748,0.863946,0.867347,0.843537,0.861905,0.009524,2
29,5.359234,0.123851,0.123373,0.017765,,300,"{'max_depth': None, 'n_estimators': 300}",0.870748,0.860544,0.846939,0.877551,0.846939,0.860544,0.012358,3
14,3.827953,0.524955,0.105812,0.025808,30.0,300,"{'max_depth': 30, 'n_estimators': 300}",0.880952,0.846939,0.836735,0.884354,0.846939,0.859184,0.019551,4
24,5.188733,0.335675,0.102779,0.007963,50.0,300,"{'max_depth': 50, 'n_estimators': 300}",0.87415,0.860544,0.85034,0.870748,0.833333,0.857823,0.014811,5
23,2.603857,0.160661,0.061191,0.001589,50.0,150,"{'max_depth': 50, 'n_estimators': 150}",0.870748,0.857143,0.853741,0.867347,0.833333,0.856463,0.013156,6
22,1.831323,0.141335,0.064071,0.013476,50.0,100,"{'max_depth': 50, 'n_estimators': 100}",0.880952,0.860544,0.85034,0.860544,0.829932,0.856463,0.01658,6
16,0.820519,0.149167,0.044681,0.009042,40.0,50,"{'max_depth': 40, 'n_estimators': 50}",0.880952,0.846939,0.857143,0.863946,0.829932,0.855782,0.01702,8
19,4.345977,0.120412,0.09759,0.015585,40.0,300,"{'max_depth': 40, 'n_estimators': 300}",0.867347,0.860544,0.860544,0.857143,0.833333,0.855782,0.011704,9
21,1.133811,0.085332,0.058805,0.012304,50.0,50,"{'max_depth': 50, 'n_estimators': 50}",0.863946,0.85034,0.853741,0.867347,0.840136,0.855102,0.009764,10


In [10]:
# 86.3% accuracy
final_rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)
final_rf_model = final_rf.fit(X_train, y_train)

In [11]:
Pkl_Filename = "RF_Model.pkl"  
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(final_rf_model, file)

## 5. Gradient Boosting

In [18]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
param = {'n_estimators': [10,50,100,150,300],
        'max_depth': [3,7,11,15],
        'learning_rate': [0.01, 0.1, 1]}

gs = GridSearchCV(gb, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_train, y_train)
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
24,11.882975,0.328238,0.033599,0.003057,0.1,3,300,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.852679,0.879464,0.857143,0.834821,0.84375,0.853571,0.015047,1
27,8.18875,0.175839,0.032896,0.003066,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.84375,0.866071,0.861607,0.830357,0.848214,0.85,0.012815,2
29,24.8598,0.794108,0.030116,0.000651,0.1,7,300,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.839286,0.866071,0.857143,0.830357,0.848214,0.848214,0.012627,3
28,11.847951,0.32102,0.029598,0.001192,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.830357,0.861607,0.861607,0.830357,0.852679,0.847321,0.01423,4
23,6.212799,0.260762,0.0329,0.002802,0.1,3,150,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.857143,0.866071,0.839286,0.834821,0.839286,0.847321,0.012111,4
58,24.530845,0.471385,0.029395,0.003239,1.0,15,150,"{'learning_rate': 1, 'max_depth': 15, 'n_estim...",0.84375,0.879464,0.852679,0.803571,0.857143,0.847321,0.02484,6
57,16.379718,0.648274,0.030473,0.000912,1.0,15,100,"{'learning_rate': 1, 'max_depth': 15, 'n_estim...",0.84375,0.861607,0.852679,0.821429,0.857143,0.847321,0.01423,6
22,4.336344,0.336214,0.04619,0.014326,0.1,3,100,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.852679,0.852679,0.848214,0.825893,0.852679,0.846429,0.010412,8
37,14.523495,0.351724,0.036335,0.006619,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.825893,0.857143,0.875,0.821429,0.848214,0.845536,0.019885,9
56,8.394164,0.456084,0.03238,0.005171,1.0,15,50,"{'learning_rate': 1, 'max_depth': 15, 'n_estim...",0.834821,0.857143,0.857143,0.8125,0.866071,0.845536,0.01948,9


In [19]:
# 85.3% accuracy
final_gb = GradientBoostingClassifier(n_estimators=300, max_depth=3, learning_rate=0.1)
final_gb_model = final_gb.fit(X_train, y_train)

In [20]:
Pkl_Filename = "GB_Model.pkl"  
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(final_gb_model, file)