In [37]:
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Load training data

In [38]:
# Labelled comments (already cleaned and tokenised); the TF-IDF vectorizer is fitted on these later.
train_data = pd.read_csv(filepath_or_buffer='aoi_clean.csv')
train_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,agency,agency_short,tag,course,rating,comment,aoi,topic,clean_comment,tokenized,no_stopwords,lemmatized,body_len,vader
0,0,Institute Of Technical Education,ITE,(BDLCC) Basic Digital Literacy: Communication ...,BDLCC1: Communicate & Collaborate with the Rig...,5,collaboration the right tools empower the pers...,n,c,collaboration the right tools empower the pers...,"['collaboration', 'the', 'right', 'tools', 'em...","['collaboration', 'right', 'tools', 'empower',...","['collaboration', 'right', 'tool', 'empower', ...",74,0.0
1,1,National Environment Agency,NEA,(BDLCC) Basic Digital Literacy: Communication ...,BDLCC1: Communicate & Collaborate with the Rig...,5,Nil,n,,Nil,"['nil', '']","['nil', '']","['nil', '']",3,0.0
2,2,Health Sciences Authority,HSA,(BDLCC) Basic Digital Literacy: Communication ...,BDLCC1: Communicate & Collaborate with the Rig...,5,Good,n,,Good,"['good', '']","['good', '']","['good', '']",4,0.4404
3,3,Institute Of Technical Education,ITE,(BDLCC) Basic Digital Literacy: Communication ...,BDLCC3: Communicate & Collaborate with Agility,5,Very informative,n,c,Very informative,"['very', 'informative']",['informative'],['informative'],15,0.0
4,4,Central Provident Fund Board,CPF,(BDLCC) Basic Digital Literacy: Communication ...,BDLCC2: Communicate & Collaborate with Etiquette,5,Useful,n,c,Useful,['useful'],['useful'],['useful'],6,0.4404


# Load full data

In [39]:
# Raw feedback export; it is filtered, cleaned and scored in the cells below.
full_data = pd.read_csv(filepath_or_buffer='BDL Feedback 31 May 2020.csv')
full_data.head(n=5)

Unnamed: 0,Agency,Agency_short(HR Pulse),Department,Job Function,Imported Designation,Updated Designation,Course Domain,Tag,CourseID,Course Name,Rated Date Time (SGT),Rating,Comment
0,Housing Development Board,HDB,,,Hg Administrative Executive,,,"Basic Digital Literacy: Cyber Security,Prototy...",131976,BDLQ1: Cyber & Data Security Quiz,1/6/2020 9:26,5,Awareness on Cyber Security
1,Housing Development Board,HDB,,,Hg Tech Executive (Building),,Digitalisation,Prototype Testing Space,95206,BDLCD2: Data Security,1/6/2020 9:25,5,
2,Housing Development Board,HDB,,,Hg Tech Executive (Building),,Digital,(BDLCD) Basic Digital Literacy: Cyber & Data S...,128176,BDLCD1: Cyber Security,1/6/2020 9:24,5,
3,Vital,MOF,,Digitalisation Services CS VITAL,Support Officer,,,"Basic Digital Literacy: Cyber Security,Prototy...",131976,BDLQ1: Cyber & Data Security Quiz,1/6/2020 9:24,3,
4,Housing Development Board,HDB,,,Finance Executive,,,"Basic Digital Literacy: Cyber Security,Prototy...",131976,BDLQ1: Cyber & Data Security Quiz,1/6/2020 9:23,5,Good


In [40]:
# Columns of the raw export that the rest of this notebook never reads.
unused_columns = ['Department', 'Job Function', 'Imported Designation', 'Updated Designation',
                  'Course Domain', 'CourseID', 'Rated Date Time (SGT)']

# Raw header -> short lower-case name used from here on.
column_names = {'Agency': 'agency', 'Agency_short(HR Pulse)': 'agency_short', 'Tag': 'tag',
                'Course Name': 'course', 'Rating': 'rating', 'Comment': 'comment'}

# drop irrelevant columns and rename the rest
# No inplace=True: the result is rebound instead. errors='ignore' lets the cell be re-run
# after the columns are already gone (the original raised KeyError on a second run).
full_data = (
    full_data
    .drop(columns=unused_columns, errors='ignore')
    .rename(columns=column_names)
)

## drop optional modules and quizzes
# regex=False: plain substring match. na=False: a row with a missing course name is kept
# rather than putting NaN into the mask (which breaks the `~` negation).
is_optional = full_data['course'].str.contains("Optional", regex=False, na=False)
is_quiz = full_data['course'].str.contains("Quiz", regex=False, na=False)

# drop empty comments
# .copy() gives later cells an independent frame to add columns to, instead of a slice of
# the unfiltered data. The original row labels are deliberately kept (no reset_index).
full_data = (
    full_data[~(is_optional | is_quiz)]
    .dropna(subset=['comment'])
    .copy()
)
full_data.head()

Unnamed: 0,agency,agency_short,tag,course,rating,comment
5,Housing Development Board,HDB,(BDLID) Basic Digital Literacy: Information Li...,BDLID5: Boost Your Data Visualisations,4,Good
8,Immigration & Checkpoints Authority,MHA,(BDLTL) Basic Digital Literacy: Technology Lit...,BDLTL4: How Technology has Improved the Workplace,4,Ok
11,Housing Development Board,HDB,Prototype Testing Space,BDLCD2: Data Security,4,Useful.
12,Immigration & Checkpoints Authority,MHA,(BDLTL) Basic Digital Literacy: Technology Lit...,BDLTL2: Global Technology Trends,4,"Ok,good."
15,Housing Development Board,HDB,(BDLID) Basic Digital Literacy: Information Li...,BDLID4: Presenting and Interpreting Data,4,Good


# Perform cleaning for full dataset

In [41]:
import string
import re
import nltk
from nltk.corpus import stopwords
# Rebinds the name `stopwords` (bound to nltk.corpus.stopwords by the import just above) to the
# English stop-word collection returned by .words('english'). remove_stopwords() reads this
# module-level name, so after this line `stopwords` no longer refers to the corpus reader.
stopwords = nltk.corpus.stopwords.words('english')

In [42]:
## functions for cleaning tasks

def remove_punct(text):
    """Return `text` with every character found in string.punctuation deleted.

    Characters are removed, not replaced by a space, so words separated only by
    punctuation are joined together (e.g. "Ok,good." becomes "Okgood").
    Characters outside string.punctuation (such as typographic quotes) are kept.
    """
    return ''.join(filter(lambda ch: ch not in string.punctuation, text))

# creates a list of words
def tokenize(text):
    """Split `text` on runs of non-word characters and return the pieces as a list.

    Leading or trailing non-word characters produce an empty-string token at that
    end (e.g. "nil " -> ['nil', '']); those are returned as-is, not filtered out.
    """
    # Raw string: '\W' inside a normal string literal is an invalid escape sequence, which
    # Python reports as a DeprecationWarning (a SyntaxWarning from 3.12). The compiled
    # pattern is unchanged.
    tokens = re.split(r'\W+', text)
    return tokens

# remove common words with no meaning e.g. connectors
def remove_stopwords(token_list):
    """Return the tokens of `token_list`, in order, that are not in the module-level `stopwords`."""
    kept_tokens = []
    for token in token_list:
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

# One lemmatizer instance shared by every lemmatize() call.
wn = nltk.WordNetLemmatizer()

# convert words into their root forms
def lemmatize(text):
    """Return a list with each token of `text` passed through wn.lemmatize().

    `text` is an iterable of tokens, not a raw string. No part-of-speech argument is
    passed, so the lemmatizer's default applies (in the outputs 'tools' becomes 'tool'
    while 'seen' is left unchanged).
    """
    return list(map(wn.lemmatize, text))

In [43]:
# Build the cleaning columns as a chain: each new column is derived from the previous one.
# Tokenisation lower-cases the text first; 'clean_comment' itself keeps its original casing.
full_data['clean_comment'] = full_data['comment'].apply(remove_punct)
full_data['tokenized'] = full_data['clean_comment'].apply(lambda cleaned: tokenize(cleaned.lower()))
full_data['no_stopwords'] = full_data['tokenized'].apply(remove_stopwords)
full_data['lemmatized'] = full_data['no_stopwords'].apply(lemmatize)

In [44]:
# this is to remove a bug in the code
# Round-trips the frame through 'temp1.csv'. to_csv() with default settings writes the current
# row labels (non-contiguous after the filtering above) as an unnamed first column; read_csv()
# then returns a fresh 0..n-1 index and keeps the old labels in an 'Unnamed: 0' column (see the
# output below).
# NOTE(review): the bug being worked around is not stated. Likely reasons, to be confirmed before
# replacing this with reset_index()/astype(str):
#   (a) the pd.concat(..., axis=1) in the vectorization cell aligns on the index, so full_data
#       needs a 0..n-1 index to line up with the TF-IDF frame;
#   (b) the list-valued token columns come back from CSV as their string representation, and it
#       is that string form of 'lemmatized' which is later handed to the TF-IDF vectorizer.
full_data.to_csv('temp1.csv')
full_data = pd.read_csv('temp1.csv')
full_data.head()

Unnamed: 0.1,Unnamed: 0,agency,agency_short,tag,course,rating,comment,clean_comment,tokenized,no_stopwords,lemmatized
0,5,Housing Development Board,HDB,(BDLID) Basic Digital Literacy: Information Li...,BDLID5: Boost Your Data Visualisations,4,Good,Good,['good'],['good'],['good']
1,8,Immigration & Checkpoints Authority,MHA,(BDLTL) Basic Digital Literacy: Technology Lit...,BDLTL4: How Technology has Improved the Workplace,4,Ok,Ok,['ok'],['ok'],['ok']
2,11,Housing Development Board,HDB,Prototype Testing Space,BDLCD2: Data Security,4,Useful.,Useful,['useful'],['useful'],['useful']
3,12,Immigration & Checkpoints Authority,MHA,(BDLTL) Basic Digital Literacy: Technology Lit...,BDLTL2: Global Technology Trends,4,"Ok,good.",Okgood,['okgood'],['okgood'],['okgood']
4,15,Housing Development Board,HDB,(BDLID) Basic Digital Literacy: Information Li...,BDLID4: Presenting and Interpreting Data,4,Good,Good,['good'],['good'],['good']


# Create features for full dataset

In [45]:
# body length: characters in the raw comment, with space characters not counted
full_data['body_len'] = full_data['comment'].map(lambda comment: len(comment) - comment.count(' '))

In [46]:
# in-built sentiment scorer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# One analyser instance shared by every scoring call.
analyser = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(sentence):
    """Return the 'compound' entry of the analyser's polarity scores for `sentence`."""
    return analyser.polarity_scores(sentence)['compound']

In [47]:
# Compound sentiment score of each raw (uncleaned) comment
full_data['vader'] = full_data['comment'].map(sentiment_analyzer_scores)

full_data.head()

Unnamed: 0.1,Unnamed: 0,agency,agency_short,tag,course,rating,comment,clean_comment,tokenized,no_stopwords,lemmatized,body_len,vader
0,5,Housing Development Board,HDB,(BDLID) Basic Digital Literacy: Information Li...,BDLID5: Boost Your Data Visualisations,4,Good,Good,['good'],['good'],['good'],4,0.4404
1,8,Immigration & Checkpoints Authority,MHA,(BDLTL) Basic Digital Literacy: Technology Lit...,BDLTL4: How Technology has Improved the Workplace,4,Ok,Ok,['ok'],['ok'],['ok'],2,0.296
2,11,Housing Development Board,HDB,Prototype Testing Space,BDLCD2: Data Security,4,Useful.,Useful,['useful'],['useful'],['useful'],7,0.4404
3,12,Immigration & Checkpoints Authority,MHA,(BDLTL) Basic Digital Literacy: Technology Lit...,BDLTL2: Global Technology Trends,4,"Ok,good.",Okgood,['okgood'],['okgood'],['okgood'],8,0.0
4,15,Housing Development Board,HDB,(BDLID) Basic Digital Literacy: Information Li...,BDLID4: Presenting and Interpreting Data,4,Good,Good,['good'],['good'],['good'],4,0.4404


In [48]:
# Persist the cleaned full dataset (with the body_len and vader features) so the later
# topic-classification step can start from it. The row index is written as well.
full_data.to_csv(path_or_buf='full_dataset_clean.csv')

# Vectorization

In [53]:
#for tf-idf
tfidf = TfidfVectorizer()

# Fit the vocabulary and IDF weights on the training comments only. fit() returns the
# vectorizer itself, so the result is not bound to a second name.
tfidf.fit(train_data['lemmatized'])

# Transform the full dataset using the training vocabulary (sparse matrix, one row per comment).
X_tfidf = tfidf.transform(full_data['lemmatized'])

# pd.concat(axis=1) aligns on row labels, so the TF-IDF frame is built on full_data's own index.
# That keeps the three pieces lined up row for row whatever index full_data carries.
tfidf_frame = pd.DataFrame(X_tfidf.toarray(), index=full_data.index)
X_tfidf_feat = pd.concat([full_data['body_len'], full_data['vader'], tfidf_frame], axis=1)

# NOTE(review): the number and order of these columns must match what RF_Model.pkl was trained
# on. The vectorizer is re-fitted here from aoi_clean.csv rather than loaded from the training
# run, so a change in that file or in the tokenisation changes the vocabulary. An earlier
# workaround for a "2 words lost" mismatch appended all-zero columns named '2078' and '2079';
# it is disabled. If the mismatch returns, persist and reload the fitted vectorizer instead of
# padding columns.

X_tfidf_feat

Unnamed: 0,body_len,vader,0,1,2,3,4,5,6,7,...,2367,2368,2369,2370,2371,2372,2373,2374,2375,2376
0,4,0.4404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.2960,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,0.4404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.4404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23334,13,0.3818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23335,164,0.6369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23336,26,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23337,4,0.4404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Load model

In [54]:
import pickle

# Load the trained classifier from disk.
# NOTE: unpickling can execute arbitrary code -- only load model files from a trusted source.
with open('RF_Model.pkl', mode='rb') as model_file:
    model = pickle.load(model_file)

model

RandomForestClassifier(n_estimators=150, n_jobs=-1)

In [55]:
# Predict the `aoi` label for every row of the feature frame (one label per comment)
y_pred = model.predict(X=X_tfidf_feat)
print(y_pred)

['n' 'n' 'n' ... 'y' 'n' 'y']


In [56]:
# Attach the predicted label to its comment row (y_pred is in the same row order as full_data)
full_data = full_data.assign(aoi=y_pred)
full_data

Unnamed: 0.1,Unnamed: 0,agency,agency_short,tag,course,rating,comment,clean_comment,tokenized,no_stopwords,lemmatized,body_len,vader,aoi
0,5,Housing Development Board,HDB,(BDLID) Basic Digital Literacy: Information Li...,BDLID5: Boost Your Data Visualisations,4,Good,Good,['good'],['good'],['good'],4,0.4404,n
1,8,Immigration & Checkpoints Authority,MHA,(BDLTL) Basic Digital Literacy: Technology Lit...,BDLTL4: How Technology has Improved the Workplace,4,Ok,Ok,['ok'],['ok'],['ok'],2,0.2960,n
2,11,Housing Development Board,HDB,Prototype Testing Space,BDLCD2: Data Security,4,Useful.,Useful,['useful'],['useful'],['useful'],7,0.4404,n
3,12,Immigration & Checkpoints Authority,MHA,(BDLTL) Basic Digital Literacy: Technology Lit...,BDLTL2: Global Technology Trends,4,"Ok,good.",Okgood,['okgood'],['okgood'],['okgood'],8,0.0000,n
4,15,Housing Development Board,HDB,(BDLID) Basic Digital Literacy: Information Li...,BDLID4: Presenting and Interpreting Data,4,Good,Good,['good'],['good'],['good'],4,0.4404,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23334,172584,Temasek Polytechnic,TP,(BDLTL) Basic Digital Literacy: Technology Lit...,BDLTL2: Global Technology Trends,4,Clear & concise,Clear concise,"['clear', 'concise']","['clear', 'concise']","['clear', 'concise']",13,0.3818,n
23335,172585,Maritime And Port Authority Of Singapore,MPA,(BDLTL) Basic Digital Literacy: Technology Lit...,BDLTL3: Digital Tools that Support My Work,4,I am more of a daily operational officer. I do...,I am more of a daily operational officer I don...,"['i', 'am', 'more', 'of', 'a', 'daily', 'opera...","['daily', 'operational', 'officer', 'dont', 'i...","['daily', 'operational', 'officer', 'dont', 'i...",164,0.6369,n
23336,172586,Maritime And Port Authority Of Singapore,MPA,(BDLTL) Basic Digital Literacy: Technology Lit...,BDLTL2: Global Technology Trends,3,Haven't seen enough to review.,Havent seen enough to review,"['havent', 'seen', 'enough', 'to', 'review']","['havent', 'seen', 'enough', 'review']","['havent', 'seen', 'enough', 'review']",26,0.0000,y
23337,172587,Ministry Of Education,MOE,(BDLTL) Basic Digital Literacy: Technology Lit...,BDLTL3: Digital Tools that Support My Work,5,Good,Good,['good'],['good'],['good'],4,0.4404,n


In [57]:
# Remove the leftover 'Unnamed: 0' column (the old row labels picked up by the temp1.csv
# round trip) before saving. It is dropped by name and without inplace=True so the cell can be
# re-run safely. The original positional drop of columns[0] removed a further real column
# ('agency', then 'agency_short', ...) on every re-run.
full_data = full_data.drop(columns=['Unnamed: 0'], errors='ignore')
full_data.to_csv('aoi_predictions.csv')