In [1]:
# Import Dependencies
%matplotlib inline

# Begin Python Imports
import datetime, warnings, scipy
warnings.filterwarnings("ignore")

# Data Manipulation
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so use whichever spelling this install provides
# (falling back to the default style if neither is available).
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(whitegrid_style)

In [2]:
# Import packages related to Text Analytics

import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the NLTK resources this notebook relies on. Each call reports
# "already up-to-date" when the package is present locally; the value of the
# last call is what the cell displays.
nltk.download('stopwords')      # English stop-word list used for filtering
nltk.download('punkt')          # tokenizer models - presumably for word_tokenize; TODO confirm still needed
nltk.download('wordnet')        # lexical database used by WordNetLemmatizer
nltk.download('vader_lexicon')  # lexicon used by SentimentIntensityAnalyzer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aniq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aniq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aniq\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aniq\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
# Import data for Words of Closure
#
# Parse the workbook a single time: passing a list to `sheet_name` makes
# read_excel return a {sheet name: DataFrame} dict, instead of re-opening and
# re-parsing the same file once per batch.
woc_sheets = pd.read_excel('data_WOC.xlsx', sheet_name=['Batch 7', 'Batch 8', 'Batch 9'])

woc7 = woc_sheets['Batch 7']
woc8 = woc_sheets['Batch 8']
woc9 = woc_sheets['Batch 9']

In [4]:
# Stack Batch 7-9 into one frame with a fresh 0..n-1 index, then discard the
# 'Name' and 'Image?' columns, which none of the cells below read.
batches = [woc7, woc8, woc9]
woc = pd.concat(batches, ignore_index=True, sort=False).drop(columns=['Name', 'Image?'])
woc.head()

Unnamed: 0,Semester,Session,Batch,Matric No.,Text
0,1,2020/2021,7,17218788,"Thank you for the second chance!\nHi Dr, i thi..."
1,1,2020/2021,7,S2000128,A Wonderful Experience for Starter.\nGood Day ...
2,1,2020/2021,7,S2018106,Thank you for the teaching Dr. Salimah.\n\nI w...
3,1,2020/2021,7,S2000949,Great Start to the Data Science Journey.\nHell...
4,1,2020/2021,7,S2016012,Wonderful Starter for Data Science.\n \nDear D...


In [5]:
# Store 'Semester' and 'Batch' as strings so they behave as labels rather
# than as numbers (later filters compare 'Semester' against the string "1").
woc = woc.astype({'Semester': str, 'Batch': str})

woc.dtypes

Semester      object
Session       object
Batch         object
Matric No.    object
Text          object
dtype: object

In [6]:
# Lower-case each feedback and tokenize it in a single pass.
# A token is any run of characters that is neither whitespace nor one of
# . , ! ? / : ; "
tok = RegexpTokenizer(r'[^.,!?/:;\"\s]+')
woc['text'] = woc['Text'].apply(lambda raw: tok.tokenize(raw.lower()))
woc.head()

Unnamed: 0,Semester,Session,Batch,Matric No.,Text,text
0,1,2020/2021,7,17218788,"Thank you for the second chance!\nHi Dr, i thi...","[thank, you, for, the, second, chance, hi, dr,..."
1,1,2020/2021,7,S2000128,A Wonderful Experience for Starter.\nGood Day ...,"[a, wonderful, experience, for, starter, good,..."
2,1,2020/2021,7,S2018106,Thank you for the teaching Dr. Salimah.\n\nI w...,"[thank, you, for, the, teaching, dr, salimah, ..."
3,1,2020/2021,7,S2000949,Great Start to the Data Science Journey.\nHell...,"[great, start, to, the, data, science, journey..."
4,1,2020/2021,7,S2016012,Wonderful Starter for Data Science.\n \nDear D...,"[wonderful, starter, for, data, science, dear,..."


In [7]:
# Reduce every token to its WordNet lemma

lem = WordNetLemmatizer()

def lemmatize_text(txt):
    """Return a new list holding the lemma of each token in ``txt``.

    ``lem.lemmatize`` is called with the token only, i.e. with its default
    part-of-speech setting.
    """
    lemmas = []
    for token in txt:
        lemmas.append(lem.lemmatize(token))
    return lemmas

# NOTE(review): tokens such as 'wa', 'ha' and 'u' appear in the custom
# stop-word list further down - presumably lemmatizer artefacts of
# 'was' / 'has' / 'us'. Confirm whether stop words should be removed before
# lemmatizing instead.
woc['text'] = woc['text'].apply(lemmatize_text)
woc.head()

Unnamed: 0,Semester,Session,Batch,Matric No.,Text,text
0,1,2020/2021,7,17218788,"Thank you for the second chance!\nHi Dr, i thi...","[thank, you, for, the, second, chance, hi, dr,..."
1,1,2020/2021,7,S2000128,A Wonderful Experience for Starter.\nGood Day ...,"[a, wonderful, experience, for, starter, good,..."
2,1,2020/2021,7,S2018106,Thank you for the teaching Dr. Salimah.\n\nI w...,"[thank, you, for, the, teaching, dr, salimah, ..."
3,1,2020/2021,7,S2000949,Great Start to the Data Science Journey.\nHell...,"[great, start, to, the, data, science, journey..."
4,1,2020/2021,7,S2016012,Wonderful Starter for Data Science.\n \nDear D...,"[wonderful, starter, for, data, science, dear,..."


In [8]:
# Remove common stopwords from text

stop_words = stopwords.words('english')
# Set copy for constant-time membership tests. `stop_words` itself stays a
# list because it is concatenated with another list further down.
stop_word_lookup = set(stop_words)

# 'text2' holds the surviving tokens re-joined into one space-separated string.
woc['text2'] = woc['text'].apply(
    lambda txt: ' '.join(word for word in txt if word not in stop_word_lookup)
)
woc.head()

Unnamed: 0,Semester,Session,Batch,Matric No.,Text,text,text2
0,1,2020/2021,7,17218788,"Thank you for the second chance!\nHi Dr, i thi...","[thank, you, for, the, second, chance, hi, dr,...",thank second chance hi dr think remember well ...
1,1,2020/2021,7,S2000128,A Wonderful Experience for Starter.\nGood Day ...,"[a, wonderful, experience, for, starter, good,...",wonderful experience starter good day dr salim...
2,1,2020/2021,7,S2018106,Thank you for the teaching Dr. Salimah.\n\nI w...,"[thank, you, for, the, teaching, dr, salimah, ...",thank teaching dr salimah would like express g...
3,1,2020/2021,7,S2000949,Great Start to the Data Science Journey.\nHell...,"[great, start, to, the, data, science, journey...",great start data science journey hello dr sali...
4,1,2020/2021,7,S2016012,Wonderful Starter for Data Science.\n \nDear D...,"[wonderful, starter, for, data, science, dear,...",wonderful starter data science dear dr salimah...


In [9]:
# Count how often each remaining word occurs across all feedback

word_matrix = woc['text2'].str.split(expand=True)   # one column per word position
word_counts = word_matrix.stack().value_counts()     # NaN padding from shorter rows is not counted
new_text = word_counts.reset_index()
new_text.columns = ['Word', 'Frequency']
new_text.head(50)

Unnamed: 0,Word,Frequency
0,data,305
1,science,237
2,dr,234
3,course,231
4,thank,184
5,salimah,162
6,class,148
7,u,126
8,learning,91
9,knowledge,91


In [10]:
# Incorporate new stopwords into the list after word analysis & remove them from text

stop_words2 = ['data', 'science', 'dr', 'salimah', 'u', 'wa', 'would', 'ha', 'r', 'course', 'class', 'quiz', 'also',
              'would', 'semester', 'one', 'take', 'hi', 'dear', 'could', ')', 'shiny', '&', '14', 'app', 'iâ€™m',
              'um', 'itâ€™s', 'md', 'prof', '3', '-', 'ðŸ˜Š', 'allah', 'quizizz', 'â€', 'p', 'via', 'odl', 'serf']

# Combined stop-word list, plus a set copy for constant-time membership tests.
new_stop_words = stop_words + stop_words2
new_stop_word_lookup = set(new_stop_words)

def _drop_stop_words(txt):
    """Join the tokens of ``txt`` that are not stop words into one string.

    ``txt`` is a token list the first time this cell runs. If the cell is run
    again, ``woc['text']`` already holds the joined string, so it is split on
    whitespace first; otherwise the filter would iterate over single
    characters and corrupt the column.
    """
    tokens = txt.split() if isinstance(txt, str) else txt
    return ' '.join(word for word in tokens if word not in new_stop_word_lookup)

woc['text'] = woc['text'].apply(_drop_stop_words)
woc.head()

Unnamed: 0,Semester,Session,Batch,Matric No.,Text,text,text2
0,1,2020/2021,7,17218788,"Thank you for the second chance!\nHi Dr, i thi...",thank second chance think remember well due cu...,thank second chance hi dr think remember well ...
1,1,2020/2021,7,S2000128,A Wonderful Experience for Starter.\nGood Day ...,wonderful experience starter good day first li...,wonderful experience starter good day dr salim...
2,1,2020/2021,7,S2018106,Thank you for the teaching Dr. Salimah.\n\nI w...,thank teaching like express gratitude working ...,thank teaching dr salimah would like express g...
3,1,2020/2021,7,S2000949,Great Start to the Data Science Journey.\nHell...,great start journey hello first like thank tim...,great start data science journey hello dr sali...
4,1,2020/2021,7,S2016012,Wonderful Starter for Data Science.\n \nDear D...,wonderful starter thank teaching guidance alon...,wonderful starter data science dear dr salimah...


In [11]:
# 'text2' only fed the word-frequency table above, so it is no longer needed

woc = woc.drop(columns='text2')
woc.head()

Unnamed: 0,Semester,Session,Batch,Matric No.,Text,text
0,1,2020/2021,7,17218788,"Thank you for the second chance!\nHi Dr, i thi...",thank second chance think remember well due cu...
1,1,2020/2021,7,S2000128,A Wonderful Experience for Starter.\nGood Day ...,wonderful experience starter good day first li...
2,1,2020/2021,7,S2018106,Thank you for the teaching Dr. Salimah.\n\nI w...,thank teaching like express gratitude working ...
3,1,2020/2021,7,S2000949,Great Start to the Data Science Journey.\nHell...,great start journey hello first like thank tim...
4,1,2020/2021,7,S2016012,Wonderful Starter for Data Science.\n \nDear D...,wonderful starter thank teaching guidance alon...


In [12]:
# Score each cleaned feedback with VADER; every row receives the analyzer's
# full result dict (its 'compound' entry is extracted in the next step)

sia = SentimentIntensityAnalyzer()

# NOTE(review): the analyzer is fed the lower-cased, punctuation-stripped,
# stop-word-filtered 'text' column rather than the raw 'Text' column -
# confirm that scoring the pre-processed text is intended.
woc['Multiple Scores'] = woc['text'].apply(sia.polarity_scores)
woc.head()

Unnamed: 0,Semester,Session,Batch,Matric No.,Text,text,Multiple Scores
0,1,2020/2021,7,17218788,"Thank you for the second chance!\nHi Dr, i thi...",thank second chance think remember well due cu...,"{'neg': 0.103, 'neu': 0.303, 'pos': 0.594, 'co..."
1,1,2020/2021,7,S2000128,A Wonderful Experience for Starter.\nGood Day ...,wonderful experience starter good day first li...,"{'neg': 0.056, 'neu': 0.546, 'pos': 0.398, 'co..."
2,1,2020/2021,7,S2018106,Thank you for the teaching Dr. Salimah.\n\nI w...,thank teaching like express gratitude working ...,"{'neg': 0.046, 'neu': 0.502, 'pos': 0.452, 'co..."
3,1,2020/2021,7,S2000949,Great Start to the Data Science Journey.\nHell...,great start journey hello first like thank tim...,"{'neg': 0.025, 'neu': 0.546, 'pos': 0.429, 'co..."
4,1,2020/2021,7,S2016012,Wonderful Starter for Data Science.\n \nDear D...,wonderful starter thank teaching guidance alon...,"{'neg': 0.0, 'neu': 0.658, 'pos': 0.342, 'comp..."


In [13]:
# Keep just the 'compound' entry of each score dict as its own column

woc['Score'] = [scores['compound'] for scores in woc['Multiple Scores']]
woc.head()

Unnamed: 0,Semester,Session,Batch,Matric No.,Text,text,Multiple Scores,Score
0,1,2020/2021,7,17218788,"Thank you for the second chance!\nHi Dr, i thi...",thank second chance think remember well due cu...,"{'neg': 0.103, 'neu': 0.303, 'pos': 0.594, 'co...",0.9756
1,1,2020/2021,7,S2000128,A Wonderful Experience for Starter.\nGood Day ...,wonderful experience starter good day first li...,"{'neg': 0.056, 'neu': 0.546, 'pos': 0.398, 'co...",0.9975
2,1,2020/2021,7,S2018106,Thank you for the teaching Dr. Salimah.\n\nI w...,thank teaching like express gratitude working ...,"{'neg': 0.046, 'neu': 0.502, 'pos': 0.452, 'co...",0.9874
3,1,2020/2021,7,S2000949,Great Start to the Data Science Journey.\nHell...,great start journey hello first like thank tim...,"{'neg': 0.025, 'neu': 0.546, 'pos': 0.429, 'co...",0.9911
4,1,2020/2021,7,S2016012,Wonderful Starter for Data Science.\n \nDear D...,wonderful starter thank teaching guidance alon...,"{'neg': 0.0, 'neu': 0.658, 'pos': 0.342, 'comp...",0.9601


In [14]:
# Label the scores according to sentiments

def score_func(score):
    """Map a compound sentiment score to a sentiment label.

    Returns 'Positive' for scores of 0.1 or more, 'Negative' for scores of
    -0.1 or less, and 'Neutral' for everything else (including NaN, for
    which both comparisons are false).
    """
    if score >= 0.1:
        return 'Positive'
    if score <= -0.1:
        return 'Negative'
    return 'Neutral'

# Label each row from its 'Score' alone. Applying over the column avoids
# materialising every row as a Series (as axis=1 does) and still produces a
# single column when the frame is empty.
woc['Sentiment'] = woc['Score'].apply(score_func)
woc.head()

Unnamed: 0,Semester,Session,Batch,Matric No.,Text,text,Multiple Scores,Score,Sentiment
0,1,2020/2021,7,17218788,"Thank you for the second chance!\nHi Dr, i thi...",thank second chance think remember well due cu...,"{'neg': 0.103, 'neu': 0.303, 'pos': 0.594, 'co...",0.9756,Positive
1,1,2020/2021,7,S2000128,A Wonderful Experience for Starter.\nGood Day ...,wonderful experience starter good day first li...,"{'neg': 0.056, 'neu': 0.546, 'pos': 0.398, 'co...",0.9975,Positive
2,1,2020/2021,7,S2018106,Thank you for the teaching Dr. Salimah.\n\nI w...,thank teaching like express gratitude working ...,"{'neg': 0.046, 'neu': 0.502, 'pos': 0.452, 'co...",0.9874,Positive
3,1,2020/2021,7,S2000949,Great Start to the Data Science Journey.\nHell...,great start journey hello first like thank tim...,"{'neg': 0.025, 'neu': 0.546, 'pos': 0.429, 'co...",0.9911,Positive
4,1,2020/2021,7,S2016012,Wonderful Starter for Data Science.\n \nDear D...,wonderful starter thank teaching guidance alon...,"{'neg': 0.0, 'neu': 0.658, 'pos': 0.342, 'comp...",0.9601,Positive


In [15]:
# Give the two text columns their presentation names and discard the raw
# VADER score dicts

woc = (
    woc
    .rename(columns={'Text': 'Feedback', 'text': 'Feedback 2'})
    .drop(columns='Multiple Scores')
)

woc.head()

Unnamed: 0,Semester,Session,Batch,Matric No.,Feedback,Feedback 2,Score,Sentiment
0,1,2020/2021,7,17218788,"Thank you for the second chance!\nHi Dr, i thi...",thank second chance think remember well due cu...,0.9756,Positive
1,1,2020/2021,7,S2000128,A Wonderful Experience for Starter.\nGood Day ...,wonderful experience starter good day first li...,0.9975,Positive
2,1,2020/2021,7,S2018106,Thank you for the teaching Dr. Salimah.\n\nI w...,thank teaching like express gratitude working ...,0.9874,Positive
3,1,2020/2021,7,S2000949,Great Start to the Data Science Journey.\nHell...,great start journey hello first like thank tim...,0.9911,Positive
4,1,2020/2021,7,S2016012,Wonderful Starter for Data Science.\n \nDear D...,wonderful starter thank teaching guidance alon...,0.9601,Positive


In [16]:
# Linearly rescale 'Score' from [old_min, old_max] = [-1, 1] onto a star
# rating in [new_min, new_max] = [1, 5], rounded to two decimals:
#   rating = (score - old_min) * new_range / old_range + new_min

old_min, old_max = -1, 1
new_min, new_max = 1, 5

old_range = old_max - old_min
new_range = new_max - new_min

woc['Rating'] = (
    woc['Score']
    .sub(old_min)
    .mul(new_range)
    .div(old_range)
    .add(new_min)
    .round(2)
)

woc.head()

Unnamed: 0,Semester,Session,Batch,Matric No.,Feedback,Feedback 2,Score,Sentiment,Rating
0,1,2020/2021,7,17218788,"Thank you for the second chance!\nHi Dr, i thi...",thank second chance think remember well due cu...,0.9756,Positive,4.95
1,1,2020/2021,7,S2000128,A Wonderful Experience for Starter.\nGood Day ...,wonderful experience starter good day first li...,0.9975,Positive,5.0
2,1,2020/2021,7,S2018106,Thank you for the teaching Dr. Salimah.\n\nI w...,thank teaching like express gratitude working ...,0.9874,Positive,4.97
3,1,2020/2021,7,S2000949,Great Start to the Data Science Journey.\nHell...,great start journey hello first like thank tim...,0.9911,Positive,4.98
4,1,2020/2021,7,S2016012,Wonderful Starter for Data Science.\n \nDear D...,wonderful starter thank teaching guidance alon...,0.9601,Positive,4.92


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the frame
woc.describe()

Unnamed: 0,Score,Rating
count,173.0,173.0
mean,0.920508,4.840578
std,0.143297,0.286453
min,0.0,3.0
25%,0.9342,4.87
50%,0.9661,4.93
75%,0.9828,4.97
max,0.9976,5.0


In [18]:
# Count feedback per sentiment for Semester 1 of Session 2021/2022, ordered by
# sentiment label descending (input table for the donut chart)

in_scope = (woc['Semester'] == "1") & (woc['Session'] == "2021/2022")

woc_01 = (
    woc[in_scope]
    .groupby('Sentiment', as_index=False)
    .agg({"Feedback": "count"})
    .rename(columns={'Feedback': 'Frequency'})
    .sort_values(by='Sentiment', ascending=False)
)

woc_01.head()

Unnamed: 0,Sentiment,Frequency
1,Positive,50
0,Neutral,1


In [19]:
# Export main tables to be incorporated into the dashboard.
# Writes every column of `woc` to 'data_sentiment.csv' (path relative to the
# working directory) with a header row and without the index column.
# NOTE(review): the exported frame still contains 'Matric No.' - confirm that
# this identifier is meant to reach the dashboard.
woc.to_csv('data_sentiment.csv', index = False, header = True)