# Unmoderated feedback extraction for Universities

#### Project Contributors: Arham Anwar, Arnav Gupta, Ethan Pirso, Jatin Suri


In [None]:
import pandas as pd
import numpy as np
%pip install zstandard
import zstandard as zstd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import time
import json
import requests

pd.set_option('display.max_columns', None)

Collecting zstandard
  Downloading zstandard-0.22.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: zstandard
Successfully installed zstandard-0.22.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## r/CollegeRant submissions

In this section, we delve into the data sourced from r/CollegeRant. The code is designed to explore and preprocess the data, ensuring its consistency and suitability for our subsequent analysis.


##### Data Preprocessing Steps

In [None]:
# Path to the zstd-compressed, newline-delimited JSON dump of r/CollegeRant submissions
zst_file_path = 'CollegeRant_submissions.zst'

# Stream-decompress and parse one record per line.
# Iterating a text stream splits only on real newlines (str.splitlines() also splits on
# characters such as U+2028 that may appear unescaped inside a JSON string) and avoids
# holding the whole decompressed dump in memory as a single string.
import io

with open(zst_file_path, 'rb') as compressed:
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_reader(compressed) as reader:
        with io.TextIOWrapper(reader, encoding='utf-8') as text_stream:
            # Blank lines carry no record and would make json.loads fail
            data = [json.loads(line) for line in text_stream if line.strip()]

# Convert the list of dictionaries to a DataFrame
df_college = pd.DataFrame(data)

# Display the first few rows of the DataFrame to verify
df_college.head()

Unnamed: 0,archived,author,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,author_fullname,can_gild,can_mod_post,category,content_categories,contest_mode,created_utc,distinguished,domain,edited,gilded,hidden,hide_score,id,is_crosspostable,is_meta,is_original_content,is_reddit_media_domain,is_self,is_video,link_flair_background_color,link_flair_css_class,link_flair_richtext,link_flair_template_id,link_flair_text,link_flair_text_color,link_flair_type,locked,media,media_embed,media_only,no_follow,num_comments,num_crossposts,over_18,parent_whitelist_status,permalink,pinned,pwls,quarantine,removal_reason,retrieved_on,score,secure_media,secure_media_embed,selftext,send_replies,spoiler,stickied,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_subscribers,subreddit_type,suggested_sort,thumbnail,thumbnail_height,thumbnail_width,title,url,whitelist_status,wls,author_cakeday,gildings,is_robot_indexable,author_patreon_flair,post_hint,preview,crosspost_parent,crosspost_parent_list,all_awardings,total_awards_received,media_metadata,allow_live_comments,discussion_type,author_premium,awarders,removed_by,removed_by_category,treatment_tags,poll_data,is_created_from_ads_ui,name,retrieved_utc,top_awarded_type,upvote_ratio,url_overridden_by_dest,gallery_data,is_gallery,banned_by,view_count,call_to_action
0,False,Jayhawker2019,1523663000.0,#dadada,,"[{'e': 'text', 't': 'Current Student/Senior'}]",870b9634-99f1-11e8-9e7c-0edfe28ae1b2,Current Student/Senior,dark,richtext,t2_16uyaec5,True,False,,,False,1533600847,,self.CollegeRant,1533602510.0,0,False,False,956pt0,True,False,False,False,True,False,,,[],,,dark,text,False,,{},False,True,0,0,False,,/r/CollegeRant/comments/956pt0/first_post_to_r...,False,,False,,1536719000.0,6,,{},This subreddit is meant for users to post any ...,True,False,False,CollegeRant,t5_muxm6,r/CollegeRant,289,public,,self,,,First Post to r/CollegeRant!,https://www.reddit.com/r/CollegeRant/comments/...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,False,Jayhawker2019,1523663000.0,#dadada,,"[{'e': 'text', 't': 'Current Student/Senior'}]",870b9634-99f1-11e8-9e7c-0edfe28ae1b2,Current Student/Senior,dark,richtext,t2_16uyaec5,True,False,,,False,1533602687,moderator,self.CollegeRant,1533602943.0,0,False,False,956xxs,True,False,False,False,True,False,,,[],,,dark,text,False,,{},False,True,6,0,False,,/r/CollegeRant/comments/956xxs/looking_for_mod...,False,,False,,1536720000.0,3,,{},"Considering this is a new subreddit, I'm looki...",True,False,True,CollegeRant,t5_muxm6,r/CollegeRant,289,public,,self,,,Looking for moderators,https://www.reddit.com/r/CollegeRant/comments/...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,False,Jayhawker2019,1523663000.0,#dadada,,"[{'e': 'text', 't': 'Current Student/Senior'}]",870b9634-99f1-11e8-9e7c-0edfe28ae1b2,Current Student/Senior,dark,richtext,t2_16uyaec5,True,False,,,False,1533614552,moderator,self.CollegeRant,1533615733.0,0,False,False,958bot,True,False,False,False,True,False,,,[],,,dark,text,False,,{},False,True,0,0,False,,/r/CollegeRant/comments/958bot/rcollegerant_gu...,False,,False,,1536720000.0,1,,{},The nature of this subreddit is for current st...,True,False,False,CollegeRant,t5_muxm6,r/CollegeRant,289,public,,self,,,r/CollegeRant Guidelines and Rules,https://www.reddit.com/r/CollegeRant/comments/...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,False,RainbowDragQueen,1531689000.0,#dadada,,"[{'e': 'text', 't': 'Current Student'}]",870b9634-99f1-11e8-9e7c-0edfe28ae1b2,Current Student,dark,richtext,t2_1s68u4to,True,False,,,False,1533615946,,self.CollegeRant,False,0,False,False,958hqc,True,False,False,False,True,False,#dadada,,"[{'e': 'text', 't': 'Current Student'}]",578e467c-99f1-11e8-8fde-0e2fe6aaf746,Current Student,dark,richtext,False,,{},False,True,11,0,False,,/r/CollegeRant/comments/958hqc/can_i_rant_abou...,False,,False,,1536720000.0,19,,{},"*on mobile, dont care about grammer right now*...",True,False,False,CollegeRant,t5_muxm6,r/CollegeRant,289,public,,self,,,Can i rant about college mental health counsel...,https://www.reddit.com/r/CollegeRant/comments/...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,False,blue-wizardA,1507665000.0,#dadada,,"[{'e': 'text', 't': 'Future Student'}]",0c019c6c-9abf-11e8-a108-0e6bf6e7a2e6,Future Student,dark,richtext,t2_gwh9cef,True,False,,,False,1533920965,moderator,self.CollegeRant,False,0,False,False,968tf1,True,False,False,False,True,False,,,[],,,dark,text,False,,{},False,True,6,0,False,,/r/CollegeRant/comments/968tf1/updated_special...,False,,False,,1536737000.0,2,,{},We now have special user flairs for (almost) e...,True,False,True,CollegeRant,t5_muxm6,r/CollegeRant,290,public,,self,,,Updated Special User Flairs.,https://www.reddit.com/r/CollegeRant/comments/...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
df_college.shape

(14741, 102)

KernelInterrupted: Execution interrupted by the Jupyter kernel.

In [None]:
# Keep only title, selftext, url and author.
# .copy() makes this an independent DataFrame so that adding columns later
# does not trigger pandas' SettingWithCopyWarning.
df_college_new = df_college[['title', 'selftext', 'url', 'author']].copy()
df_college_new.head()

Unnamed: 0,title,selftext,url,author
0,First Post to r/CollegeRant!,This subreddit is meant for users to post any ...,https://www.reddit.com/r/CollegeRant/comments/...,Jayhawker2019
1,Looking for moderators,"Considering this is a new subreddit, I'm looki...",https://www.reddit.com/r/CollegeRant/comments/...,Jayhawker2019
2,r/CollegeRant Guidelines and Rules,The nature of this subreddit is for current st...,https://www.reddit.com/r/CollegeRant/comments/...,Jayhawker2019
3,Can i rant about college mental health counsel...,"*on mobile, dont care about grammer right now*...",https://www.reddit.com/r/CollegeRant/comments/...,RainbowDragQueen
4,Updated Special User Flairs.,We now have special user flairs for (almost) e...,https://www.reddit.com/r/CollegeRant/comments/...,blue-wizardA


In [None]:
# Create a new column 'post' by concatenating title and selftext.
# assign() returns a new DataFrame, so nothing is written into a slice of df_college
# (the in-place column assignment raised SettingWithCopyWarning).
df_college_new = df_college_new.assign(
    post=df_college_new['title'] + ' ' + df_college_new['selftext']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_college_new['post'] = df_college_new['title'] + ' ' + df_college_new['selftext']


In [None]:
df_college_new.head()

Unnamed: 0,title,selftext,url,author,post
0,First Post to r/CollegeRant!,This subreddit is meant for users to post any ...,https://www.reddit.com/r/CollegeRant/comments/...,Jayhawker2019,First Post to r/CollegeRant! This subreddit is...
1,Looking for moderators,"Considering this is a new subreddit, I'm looki...",https://www.reddit.com/r/CollegeRant/comments/...,Jayhawker2019,Looking for moderators Considering this is a n...
2,r/CollegeRant Guidelines and Rules,The nature of this subreddit is for current st...,https://www.reddit.com/r/CollegeRant/comments/...,Jayhawker2019,r/CollegeRant Guidelines and Rules The nature ...
3,Can i rant about college mental health counsel...,"*on mobile, dont care about grammer right now*...",https://www.reddit.com/r/CollegeRant/comments/...,RainbowDragQueen,Can i rant about college mental health counsel...
4,Updated Special User Flairs.,We now have special user flairs for (almost) e...,https://www.reddit.com/r/CollegeRant/comments/...,blue-wizardA,Updated Special User Flairs. We now have speci...


In [None]:
df_college_new['post'].head(15)

0     First Post to r/CollegeRant! This subreddit is...
1     Looking for moderators Considering this is a n...
2     r/CollegeRant Guidelines and Rules The nature ...
3     Can i rant about college mental health counsel...
4     Updated Special User Flairs. We now have speci...
5     I keep getting awful grades despite studying S...
6     envious of my friends I am 18 year old and I a...
7     So my college suspensionis a bit complicated S...
8     Widthdrawing I recently started college about ...
9     Humanities professor is killing me with her cl...
10                               I hate my RA [deleted]
11               Computer class is ridiculous [deleted]
12    It's Day 11 and I already feel like I'm on the...
13    So, it's my third semester, and I am crashing ...
14    Senior year stress and parents making me feel ...
Name: post, dtype: object

In [None]:
df_college_new.shape

(14741, 5)

In [None]:
# Date range of the dataset: (earliest, latest) submission time as ISO-8601 strings
tuple(
    pd.to_datetime(ts, unit='s').isoformat()
    for ts in (df_college['created_utc'].min(), df_college['created_utc'].max())
)

('2018-08-07T00:14:07', '2022-12-31T18:12:46')

In [None]:
# Combine 'title' and 'selftext' into one text column.
# assign() returns a new DataFrame and so avoids SettingWithCopyWarning.
df_college_new = df_college_new.assign(
    text=df_college_new['title'] + ' ' + df_college_new['selftext']
)

# Bag-of-words counts: drop English stop words, terms that occur in more than 95% of
# the posts, and terms that occur in fewer than 2 posts
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(df_college_new['text'].values.astype('U'))

# Fit a 10-topic LDA model (fixed seed)
lda = LatentDirichletAllocation(n_components=10, random_state=0)
lda.fit(X)

# Print the 10 highest-weighted words of every topic, strongest first.
# argsort() is ascending, so [:-11:-1] walks the last ten entries backwards;
# the previous slice [:-10] listed every word EXCEPT the top ten.
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic #{topic_idx + 1}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-11:-1]]))

Topic #6:
Topic #7:
Topic #8:
Topic #9:
Topic #10:


In [None]:
# Show every topic as its ten highest-weighted vocabulary terms, strongest first
for topic_idx, topic in enumerate(lda.components_):
    strongest_first = topic.argsort()[::-1][:10]
    print(f"Topic #{topic_idx + 1}:")
    print(" ".join(feature_names[i] for i in strongest_first))

Topic #1:
school college degree students major university want student job people
Topic #2:
removed deleted user college math feel don cheating like semester
Topic #3:
exam class grade final just test got professor paper exams
Topic #4:
class just classes time like work semester don online know
Topic #5:
pay money college aid tuition financial https help paying need
Topic #6:
room just campus don roommate day like home time dorm
Topic #7:
people college friends like just don make fucking amp ve
Topic #8:
just like college feel don school know want year really
Topic #9:
deleted college semester classes class advisor fuck graduate credits program
Topic #10:
class professor just group like work don students professors project


In [None]:
df_college_new.columns

Index(['title', 'selftext', 'url', 'author', 'post', 'text'], dtype='object')

In [None]:
# New single-column DataFrame holding an independent copy of the combined 'text'
df_college_text = df_college_new.loc[:, ['text']].copy()

In [None]:
df_college_text

Unnamed: 0,text
0,First Post to r/CollegeRant! This subreddit is...
1,Looking for moderators Considering this is a n...
2,r/CollegeRant Guidelines and Rules The nature ...
3,Can i rant about college mental health counsel...
4,Updated Special User Flairs. We now have speci...
...,...
14736,Desk WHOMEVER decided to look at those tiny a...
14737,Online Job Opportunity Ringle Tutoring for Stu...
14738,4 hour gap between classes [deleted]
14739,Just followed my college music directors insta...


#### Label everything as an 'issue' and then export it to a csv

In [None]:
df_college_text['issue'] = 1

In [None]:
df_college_text.head()

Unnamed: 0,text,issue
0,First Post to r/CollegeRant! This subreddit is...,1
1,Looking for moderators Considering this is a n...,1
2,r/CollegeRant Guidelines and Rules The nature ...,1
3,Can i rant about college mental health counsel...,1
4,Updated Special User Flairs. We now have speci...,1


#### Export the final dataframe to a csv

In [None]:
df_college_text.to_csv('reddit_posts.csv', index=False)

## r/mcgill submissions

The code in this section is designed to explore and preprocess the data sourced from r/mcgill. This is done to ensure the data's consistency, thereby making it suitable for our subsequent analysis.

##### Data Preprocessing Steps

In [None]:
# Set configuration to display all columns
pd.set_option('display.max_columns', None)

# Path to the zstd-compressed, newline-delimited JSON dump of r/mcgill submissions
zst_file_path = 'mcgill_submissions.zst'

# Stream-decompress and parse one record per line.
# Iterating a text stream splits only on real newlines (str.splitlines() also splits on
# characters such as U+2028 that may appear unescaped inside a JSON string) and avoids
# holding the whole decompressed dump in memory as a single string.
import io

with open(zst_file_path, 'rb') as compressed:
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_reader(compressed) as reader:
        with io.TextIOWrapper(reader, encoding='utf-8') as text_stream:
            # Blank lines carry no record and would make json.loads fail
            data = [json.loads(line) for line in text_stream if line.strip()]

# Convert the list of dictionaries to a DataFrame
df_mcgill = pd.DataFrame(data)

# Display the first few rows of the DataFrame to verify
df_mcgill.head()

Unnamed: 0,archived,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_text_color,author_flair_type,brand_safe,can_gild,contest_mode,created_utc,distinguished,domain,edited,gilded,hidden,hide_score,id,is_crosspostable,is_reddit_media_domain,is_self,is_video,link_flair_css_class,link_flair_richtext,link_flair_text,link_flair_text_color,link_flair_type,locked,media,media_embed,no_follow,num_comments,num_crossposts,over_18,parent_whitelist_status,permalink,retrieved_on,rte_mode,score,secure_media,secure_media_embed,selftext,send_replies,spoiler,stickied,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,suggested_sort,thumbnail,thumbnail_height,thumbnail_width,title,url,whitelist_status,downs,num_reports,banned_by,name,likes,clicked,saved,ups,approved_by,selftext_html,created,report_reasons,user_reports,mod_reports,quarantine,from_id,from_kind,from,post_hint,preview,author_cakeday,view_count,approved_at_utc,banned_at_utc,can_mod_post,pinned,mod_note,mod_reason_by,mod_reason_title,crosspost_parent,crosspost_parent_list,subreddit_subscribers,media_metadata,author_flair_template_id,category,content_categories,is_original_content,media_only,post_categories,pwls,removal_reason,visited,wls,link_flair_background_color,previous_visits,link_flair_template_id,author_created_utc,author_fullname,is_meta,gildings,is_robot_indexable,author_patreon_flair,all_awardings,total_awards_received,allow_live_comments,discussion_type,author_premium,awarders,removed_by,removed_by_category,treatment_tags,poll_data,upvote_ratio,is_created_from_ads_ui,retrieved_utc,top_awarded_type,url_overridden_by_dest,gallery_data,is_gallery,call_to_action,collections
0,True,david_a_garcia,,engineering,"[{'e': 'text', 't': 'Electrical Eng '10'}]",Electrical Eng '10,,richtext,True,True,False,1265431507,,self.mcgill,False,0.0,False,False,aypo9,True,False,True,False,,[],,dark,text,False,,{},True,4,0.0,False,all_ads,/r/mcgill/comments/aypo9/is_this_working_yet/,1522891000.0,markdown,1,,{},,True,False,False,mcgill,t5_2rhkw,r/mcgill,public,,self,,,is this working yet?,https://www.reddit.com/r/mcgill/comments/aypo9...,all_ads,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,[deleted],,,,,,,,,,1299562966,,self.mcgill,False,,False,,fzkkf,,,True,,,,,,,,,{},,13,,False,,/r/mcgill/comments/fzkkf/advice_for_u0s/,,,5,,,I'm just finishing up my first year at McGill ...,,,,mcgill,t5_2rhkw,,,,default,,,Advice for U0's?,http://www.reddit.com/r/mcgill/comments/fzkkf/...,,0.0,,,t3_fzkkf,,False,False,5.0,,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",1299563000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,jakob5860,,,,,,,,,,1300316295,,self.mcgill,False,,False,,g5h9n,,,True,,,,,,,,,{},,1,,False,,/r/mcgill/comments/g5h9n/ama_request_sean_turner/,,,2,,,,,,,mcgill,t5_2rhkw,,,,self,,,AMA Request: Sean Turner,http://www.reddit.com/r/mcgill/comments/g5h9n/...,,0.0,,,t3_g5h9n,,False,False,2.0,,,1300320000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,mricon,,,,,,,,,,1301667227,,self.mcgill,False,,False,,gge78,,,True,,,,,,,,,{},,37,,False,,/r/mcgill/comments/gge78/i_work_in_mcgill_cent...,,,15,,,I am a senior IT security analyst at McGill In...,,,,mcgill,t5_2rhkw,,,,self,,,I work in McGill central IT. AMA.,http://www.reddit.com/r/mcgill/comments/gge78/...,,0.0,,,t3_gge78,,False,False,15.0,,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",1301671000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,[deleted],,,,,,,,,,1301628324,,self.mcgill,False,,False,,gg23u,,,True,,,,,,,,,{},,0,,False,,/r/mcgill/comments/gg23u/so_how_was_the_protes...,,,1,,,,,,,mcgill,t5_2rhkw,,,,default,,,So how was the protest today?,http://www.reddit.com/r/mcgill/comments/gg23u/...,,0.0,,,t3_gg23u,,False,False,1.0,,,1301632000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
df_mcgill.shape

(60899, 128)

In [None]:
# Make the timestamps numeric first; values that cannot be parsed become NaN
df_mcgill['created_utc'] = pd.to_numeric(df_mcgill['created_utc'], errors='coerce')

# Earliest and latest submission time as ISO-8601 strings
min_date, max_date = (
    pd.to_datetime(ts, unit='s').isoformat()
    for ts in (df_mcgill['created_utc'].min(), df_mcgill['created_utc'].max())
)

min_date, max_date

('2010-02-06T04:45:07', '2022-12-31T22:51:43')

In [None]:
# Keep only title, selftext, author, created_utc and url.
# .copy() makes this an independent DataFrame so that adding columns later
# does not trigger pandas' SettingWithCopyWarning.
df_mcgill_new = df_mcgill[['title', 'selftext', 'author', 'created_utc', 'url']].copy()

In [None]:
# Combine 'title' and 'selftext' into one text column.
# assign() returns a new DataFrame and so avoids SettingWithCopyWarning.
df_mcgill_new = df_mcgill_new.assign(
    text=df_mcgill_new['title'] + ' ' + df_mcgill_new['selftext']
)

# Bag-of-words counts: drop English stop words, terms that occur in more than 95% of
# the posts, and terms that occur in fewer than 2 posts
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(df_mcgill_new['text'].values.astype('U'))

# Fit a 10-topic LDA model (fixed seed)
lda = LatentDirichletAllocation(n_components=10, random_state=0)
lda.fit(X)

# Print the 10 highest-weighted words of every topic, strongest first.
# argsort() is ascending, so [:-11:-1] walks the last ten entries backwards;
# the previous slice [:-10] listed every word EXCEPT the top ten.
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic #{topic_idx + 1}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-11:-1]]))

In [None]:
# Show every topic as its ten highest-weighted vocabulary terms, strongest first
for topic_idx, topic in enumerate(lda.components_):
    strongest_first = topic.argsort()[::-1][:10]
    print(f"Topic #{topic_idx + 1}:")
    print(" ".join(feature_names[i] for i in strongest_first))

Topic #1:
just like really don feel ve time know people class
Topic #2:
mcgill student montreal know just international year time need does
Topic #3:
course math courses class taking summer taken semester classes thanks
Topic #4:
removed deleted user mcgill help question final econ admission student
Topic #5:
mcgill people ssmu students like student friends know campus club
Topic #6:
course grade email minerva does class know just transcript grades
Topic #7:
mcgill https www com ca health http reddit students insurance
Topic #8:
mcgill year program science major school courses student arts gpa
Topic #9:
amp comp mcgill engineering x200b university concordia software computer cs
Topic #10:
exam campus know mcgill exams does library just deferred open


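#### Choose one LDA topic as posts which are not issues, so that we can create some training data for the "NOT ISSUES" (0) class. Note: the code below filters on the 0-based topic index 2, which corresponds to "Topic #3" in the 1-based listing above.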

In [None]:
# Transform the documents to their topic distribution
topic_distribution = lda.transform(X)

# Index (0-based) of the most probable topic for each document
dominant_topic = np.argmax(topic_distribution, axis=1)

# Attach the dominant topic as a new column.
# assign() returns a new DataFrame and so avoids SettingWithCopyWarning.
df_mcgill_new = df_mcgill_new.assign(dominant_topic=dominant_topic)

# Keep the documents whose dominant topic has index 2.
# NOTE(review): dominant_topic is 0-based, while the topic listing is printed 1-based
# ("Topic #{topic_idx + 1}"), so index 2 is the topic printed as "Topic #3".
# Confirm which topic was intended before changing the value.
topic2_posts = df_mcgill_new[df_mcgill_new['dominant_topic'] == 2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mcgill_new['dominant_topic'] = dominant_topic


In [None]:
topic2_posts.head()

Unnamed: 0,title,selftext,author,created_utc,url,text,dominant_topic
15,"You like physical chemistry, no?",Does anyone here have a detailed physical chem...,directed_revolver,1305814292,http://www.reddit.com/r/mcgill/comments/hf1o8/...,"You like physical chemistry, no? Does anyone h...",2
27,Course wait lists.,I will be a first year student next year. I wa...,w1ldebeast,1311195621,http://www.reddit.com/r/mcgill/comments/iv6lg/...,Course wait lists. I will be a first year stud...,2
46,Taking 2 courses that are scheduled at the sam...,Hi. I'm going into U2 with a Major in Pharm an...,AmyThaliaGregCalvin,1313863416,http://www.reddit.com/r/mcgill/comments/jozoe/...,Taking 2 courses that are scheduled at the sam...,2
49,Taking a Second year course in my first year.,I was just wondering if anyone else did this a...,antantoon,1314035010,http://www.reddit.com/r/mcgill/comments/jqrwm/...,Taking a Second year course in my first year. ...,2
78,Anyone taking COMP 230 and/or MATH 240?,Those are the two courses I'm taking that I do...,ReMO451,1315262754,http://www.reddit.com/r/mcgill/comments/k5sal/...,Anyone taking COMP 230 and/or MATH 240? Those ...,2


In [None]:
# Display the full text of the first 5 selected posts (fewer if less than 5 exist,
# instead of failing with an IndexError)
for post_text in topic2_posts['text'].head(5):
    print(post_text)
    print()

You like physical chemistry, no? Does anyone here have a detailed physical chemistry syllabus? Bonus points if you're willing to help me get my hands on some actual course materials. (Specifically: CHEM 223, CHEM 243) 

For the interested: I am a biochem major who is a little disappointed with his understanding of physical chemistry after taking the bio-phys-chem cycle.  I have a summer, and time, and (more than likely foolish) ambition.

Course wait lists. I will be a first year student next year. I was looking to take First Level Chinese as an elective. The problem is that the class is full. There is no one on the wait list so I think that the chances of getting in are pretty good. What I don't know is if I should register for other classes and then drop them if I do get in? 

Taking 2 courses that are scheduled at the same time? And interesting/easy Humanities courses? Hi. I'm going into U2 with a Major in Pharm and a Minor in Neuroscience. In my second semester, I have 2 courses th

In [None]:
# Keep only the combined 'text' of the selected r/mcgill posts and
# label every one of them as a non-issue (0)
df_mcgill_text = topic2_posts[['text']].assign(issue=0)

#### Export the final dataframe to a csv

In [None]:
df_mcgill_text.to_csv('topic2_posts.csv', index=False)

## Model Building

#### Preprocessing
We have marked the posts from r/CollegeRant as posts with issues (=1) and a selected subset of posts from r/mcgill as posts with no issues (=0).

In [None]:
import pandas as pd

# Read the two labelled sets exported earlier:
# reddit_posts.csv (issue = 1) and topic2_posts.csv (issue = 0)
reddit_posts = pd.read_csv('reddit_posts.csv')
topic2_posts = pd.read_csv('topic2_posts.csv')

# Stack the two dataframes. ignore_index=True gives the result a fresh, unique
# 0..n-1 index; without it both inputs contribute overlapping labels 0, 1, 2, ...
merged = pd.concat([reddit_posts, topic2_posts], ignore_index=True)
merged.columns

Index(['text', 'issue'], dtype='object')

#### We have 8345 posts which are not issues and 14741 posts with issues.

In [None]:
merged['issue'].value_counts()

## Model Selection

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

In [None]:
# Download NLTK resources (if not already installed)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

In [None]:
# English stop words, loaded once. Calling stopwords.words('english') inside the
# per-token filter re-read the list for every single token.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


# Preprocessing function with lemmatization
def preprocess(text, lemmatize=True):
    """Tokenize *text*, optionally lemmatize it, and drop English stop words.

    Args:
        text: The document to clean (a string).
        lemmatize: When true, every token is passed through WordNetLemmatizer
            before stop words are removed.

    Returns:
        The remaining tokens joined by single spaces.
    """
    tokens = word_tokenize(text)
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # NOTE(review): the comparison is case-sensitive and tokens are not lower-cased
    # here, so capitalised stop words presumably pass through - confirm whether
    # that is intended.
    tokens = [token for token in tokens if token not in _ENGLISH_STOP_WORDS]
    return ' '.join(tokens)

# Apply preprocessing to the text (preprocess() lemmatizes by default)
merged['text_processed'] = merged['text'].apply(preprocess)

# Choose which column the models are trained on: the preprocessed text (True)
# or the raw text (False). The flag does not change how 'text_processed' is built.
lemmatize = True  # or False
text_column = 'text_processed' if lemmatize else 'text'

# Create different vectorizations: raw counts vs. TF-IDF, single words vs. word pairs
vectorizers = {
    'count_monogram': CountVectorizer(ngram_range=(1, 1)),
    'count_bigram': CountVectorizer(ngram_range=(2, 2)),
    'tfidf_monogram': TfidfVectorizer(ngram_range=(1, 1)),
    'tfidf_bigram': TfidfVectorizer(ngram_range=(2, 2)),
}

# Initialize classifiers (the random forest is seeded so reruns give the same model)
classifiers = {
    'MultinomialNB': MultinomialNB(),
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
}

# Split the dataset 80/20. A fixed seed makes the comparison reproducible, and
# stratifying keeps the issue / non-issue ratio equal in the train and test parts.
label = merged['issue']
X_train, X_test, y_train, y_test = train_test_split(
    merged[text_column], label, test_size=0.2, random_state=42, stratify=label
)

# Function to evaluate a model
def evaluate_model(vectorizer, classifier, X_train, y_train, X_test, y_test):
    """Fit one vectorizer/classifier pair and report its performance on the test split.

    The vectorizer is fitted on the training documents only and then applied to the
    test documents. A confusion-matrix heatmap is shown and the classification
    report is printed.

    Args:
        vectorizer: Object providing fit_transform() and transform().
        classifier: Object providing fit() and predict().
        X_train, y_train: Training documents and their labels.
        X_test, y_test: Test documents and their labels.

    Returns:
        A tuple (confusion matrix, classification report as a dict).
    """
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    classifier.fit(X_train_vec, y_train)
    y_pred = classifier.predict(X_test_vec)

    cm = confusion_matrix(y_test, y_pred)
    # fmt='d' prints the cell counts as plain integers; the default format
    # ('.2g') abbreviates large counts to scientific notation
    sns.heatmap(cm, annot=True, fmt='d')
    plt.show()

    print(classification_report(y_test, y_pred))
    return cm, classification_report(y_test, y_pred, output_dict=True)

# Run the evaluation once for every (vectorizer, classifier) pairing
for vec_name in vectorizers:
    vectorizer = vectorizers[vec_name]
    for clf_name in classifiers:
        classifier = classifiers[clf_name]
        print(f"Evaluating combination: {vec_name} + {clf_name}")
        evaluate_model(vectorizer, classifier, X_train, y_train, X_test, y_test)

#### Best model: Naive Bayes + Monogram Count Vectorizer

In [None]:
# Build a bag-of-words matrix from the raw 'text' column of the whole merged dataset.
# `vectorizer` is reused later to transform the unlabelled r/mcgill posts.
# NOTE(review): the vocabulary is fitted on all rows before the train/test split that
# follows, so it also contains words from the held-out rows - consider fitting on the
# training part only.
vectorizer=CountVectorizer()
spamham_countVectorizer=vectorizer.fit_transform(merged['text'])

# Print the learned vocabulary
print(vectorizer.get_feature_names_out())

In [None]:
spamham_countVectorizer.shape

label=merged['issue']
X=spamham_countVectorizer
y=label

In [None]:
# 80/20 split with a fixed seed (reproducible) and stratified on the label
# (same issue / non-issue ratio in both parts)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train the multinomial Naive Bayes classifier
NB_classifier = MultinomialNB()
NB_classifier.fit(X_train, y_train)

# Confusion matrix on the training data
y_predict_train = NB_classifier.predict(X_train)
cm = confusion_matrix(y_train, y_predict_train)

# fmt='d' prints the counts as plain integers instead of scientific notation
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
# Predict the held-out test rows
y_predict_test = NB_classifier.predict(X_test)
print(y_predict_test)

# Confusion matrix on the test data;
# fmt='d' prints the counts as plain integers instead of scientific notation
cm = confusion_matrix(y_test, y_predict_test)
sns.heatmap(cm, annot=True, fmt='d')

print(classification_report(y_test, y_predict_test))

## Model Testing

In [None]:
import pandas as pd
%pip install zstandard
import zstandard as zstd
import json

# Set configuration to display all columns
pd.set_option('display.max_columns', None)

# Path to the zstd-compressed, newline-delimited JSON dump of r/mcgill submissions
zst_file_path = 'mcgill_submissions.zst'

# Stream-decompress and parse one record per line.
# Iterating a text stream splits only on real newlines (str.splitlines() also splits on
# characters such as U+2028 that may appear unescaped inside a JSON string) and avoids
# holding the whole decompressed dump in memory as a single string.
import io

with open(zst_file_path, 'rb') as compressed:
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_reader(compressed) as reader:
        with io.TextIOWrapper(reader, encoding='utf-8') as text_stream:
            # Blank lines carry no record and would make json.loads fail
            data = [json.loads(line) for line in text_stream if line.strip()]

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(data)

# Display the first few rows of the DataFrame to verify
df.head()

In [None]:
# Combine title and selftext into one column called text.
# Missing values are replaced by '' first: string + NaN is NaN, and the fitted
# CountVectorizer used afterwards cannot transform NaN documents.
df['text'] = df['title'].fillna('') + ' ' + df['selftext'].fillna('')

In [None]:
df_new2 = df[['text']].copy()

In [None]:
df_new2

In [None]:
# Use NB_classifier on the df_new2 dataframe.
# transform() (not fit_transform) maps the posts onto the vocabulary that
# `vectorizer` already learned.
spamham_countVectorizer=vectorizer.transform(df_new2['text'])
print(spamham_countVectorizer)

# Predicted label for every post in df_new2
y_predict_test=NB_classifier.predict(spamham_countVectorizer)
print (y_predict_test)

In [None]:
df_new2['predicted_issue'] = y_predict_test

In [None]:
df_new2

In [None]:
df_new2['predicted_issue'].value_counts()

## Clustering

In [None]:
# filter out all 1 from predicted issue column
df_new3 = df_new2[df_new2['predicted_issue'] == 1]

In [None]:
df_new3

In [None]:
# Keep the predicted-issue posts whose text mentions at least one mental-health
# related keyword (case-insensitive; missing text counts as no match)
mental_health_keywords = [
    'mental health', 'stress', 'depression', 'anxiety', 'mental illness',
    'counseling', 'support', 'well-being', 'coping', 'self-care', 'psychiatry',
    'psychology', 'treatment', 'medication', 'coping strategies', 'peer support',
    'self-esteem', 'suicidal ideation', 'trauma',
]
# .copy() gives an independent DataFrame so that adding the processed-text column
# later does not trigger SettingWithCopyWarning
df_new4 = df_new3[
    df_new3['text'].str.contains('|'.join(mental_health_keywords), case=False, na=False)
].copy()

In [None]:
df_new4.shape

In [None]:
df_new4.head(10)

In [None]:
# Randomly select up to 5000 rows from df_new3 (general issues).
# The sample size is capped at the number of available rows (sample() raises when
# n exceeds it) and the seed is fixed so that reruns cluster the same posts.
df_new5 = df_new3.sample(n=min(5000, len(df_new3)), random_state=42)

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')  # Download necessary NLTK data

# Shared Porter stemmer used by preprocess_text
stemmer = PorterStemmer()

# Tokenize a document and reduce every token to its stem
def preprocess_text(text):
    """Return *text* as a space-separated string of Porter-stemmed tokens."""
    return ' '.join(map(stemmer.stem, word_tokenize(text)))

# Preprocess the synopses
df_new5['processed_synopses'] = df_new5['text'].apply(preprocess_text)

In [None]:
# Preprocess the synopses
df_new4['processed_synopses'] = df_new4['text'].apply(preprocess_text)

In [None]:
# One TF-IDF vectorizer per corpus. Calling fit_transform twice on the same
# vectorizer re-fits it, which left tfidf_matrix (general issues) with columns that
# no longer matched the vectorizer's vocabulary.
tfidf_vectorizer_general = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
tfidf_matrix = tfidf_vectorizer_general.fit_transform(df_new5['processed_synopses'])

tfidf_vectorizer_mental = TfidfVectorizer(max_features=1000)
tfidf_matrix2 = tfidf_vectorizer_mental.fit_transform(df_new4['processed_synopses'])

# Backward-compatible alias: the original name ended up fitted on the
# mental-health subset (its last fit), so it keeps pointing there.
tfidf_vectorizer = tfidf_vectorizer_mental

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(tfidf_matrix)

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Candidate numbers of clusters to evaluate on the general-issue TF-IDF matrix
k_values = range(3, 16)

# Best silhouette score seen so far and the k that produced it
best_score = -1
best_k = 3
silhouette_scores = []
wcss_scores = []

for k in k_values:
    # n_init is set explicitly so the search uses the same number of restarts
    # as the final models below, whatever the installed scikit-learn default is.
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(tfidf_matrix)

    # Silhouette score of this clustering
    sil_score = silhouette_score(tfidf_matrix, km.labels_)
    silhouette_scores.append(sil_score)

    # Within-cluster sum of squares, exposed by KMeans as inertia_
    wcss = km.inertia_
    wcss_scores.append(wcss)

    print(f'k={k}: Silhouette Score={sil_score}, WCSS={wcss}')

    # Remember the k with the highest silhouette score
    if sil_score > best_score:
        best_score = sil_score
        best_k = k

print(f'Best Silhouette Score is {best_score} for k={best_k}')

# Final model for the general-issue posts.
# NOTE: the number of clusters is fixed at 10 here; it is not taken from best_k.
km_1 = KMeans(n_clusters=10, random_state=42, n_init=10)
km_1.fit(tfidf_matrix)
clusters = km_1.labels_.tolist()

# Final model for the mental-health posts, also with a fixed 10 clusters
# (no k search is run on tfidf_matrix2).
km_2 = KMeans(n_clusters=10, random_state=42, n_init=10)
km_2.fit(tfidf_matrix2)
clusters2 = km_2.labels_.tolist()

# Left panel: silhouette score for each k
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(list(k_values), silhouette_scores, marker='o')
plt.title('Elbow Plot for Silhouette Score')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette Score')
plt.grid(True)

# Right panel: WCSS (inertia) for each k
plt.subplot(1, 2, 2)
plt.plot(list(k_values), wcss_scores, marker='o', color='red')
plt.title('Elbow Plot for WCSS')
plt.xlabel('Number of clusters (k)')
plt.ylabel('WCSS (Inertia)')
plt.grid(True)

plt.tight_layout()
plt.show()


In [None]:
from sklearn.manifold import MDS
import matplotlib.pyplot as plt

# Project the documents to 2-D with MDS, using 1 - cosine similarity as the
# precomputed dissimilarity between documents
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(1 - similarity_matrix)

# Extract the coordinates
xs, ys = pos[:, 0], pos[:, 1]

# Scatter plot of the projection, coloured by cluster assignment
plt.figure(figsize=(12, 8))
scatter = plt.scatter(xs, ys, c=clusters)

# Build the legend from the scatter itself so that every legend colour is the
# colour actually used for that cluster's points. (Empty placeholder scatters
# would take their colours from the default colour cycle, not the colormap.)
# num=None requests one entry per unique cluster id.
handles, legend_labels = scatter.legend_elements(num=None, fmt='Cluster {x:.0f}')

plt.xlabel('MDS Dimension 1')
plt.ylabel('MDS Dimension 2')
plt.title('MDS Visualization of Document Clusters')
plt.legend(handles, legend_labels, title='Clusters')
plt.show()

In [None]:
def get_top_features_cluster(tfidf_array, prediction, n_feats, feature_names=None):
    """Return, for every cluster, its ``n_feats`` highest-scoring TF-IDF terms.

    Parameters
    ----------
    tfidf_array : 2-D array with one row per document and one column per term.
    prediction : cluster labels, one per row of ``tfidf_array``.
    n_feats : number of top terms to keep per cluster.
    feature_names : optional sequence mapping column index -> term. When
        omitted, the vocabulary of the module-level ``tfidf_vectorizer`` is used.

    Returns
    -------
    list of DataFrame
        One frame per unique label, in sorted label order, with the columns
        ``features`` (term) and ``score`` (mean TF-IDF weight in the cluster).
    """
    if feature_names is None:
        feature_names = tfidf_vectorizer.get_feature_names_out()
    prediction = np.asarray(prediction)  # so that == yields an element-wise mask
    dfs = []
    for label in np.unique(prediction):
        # mean tf-idf value of every term over the documents of this cluster
        x_means = np.mean(tfidf_array[prediction == label], axis=0)
        top_idx = np.argsort(x_means)[::-1][:n_feats]  # highest means first
        best_features = [(feature_names[i], x_means[i]) for i in top_idx]
        dfs.append(pd.DataFrame(best_features, columns=['features', 'score']))
    return dfs


# tfidf_vectorizer was last fitted on df_new4, so its vocabulary only matches
# the columns of tfidf_matrix2. Rebuild the vocabulary that belongs to
# tfidf_matrix by fitting an identically configured vectorizer on the same
# df_new5 text the matrix was built from.
general_feature_names = (
    TfidfVectorizer(max_features=1000)
    .fit(df_new5['processed_synopses'])
    .get_feature_names_out()
)

top_words_per_cluster = get_top_features_cluster(
    tfidf_matrix.toarray(), km_1.labels_, 10, feature_names=general_feature_names
)  # Adjust 10 to get more or fewer words
top_words_per_cluster2 = get_top_features_cluster(tfidf_matrix2.toarray(), km_2.labels_, 10)

In [None]:
# Name every general-issue cluster after its top terms (at most ten), joined by '-'
cluster_names = []
for cluster_id, top_terms in enumerate(top_words_per_cluster):
    print(f"Cluster {cluster_id} words:")
    print(top_terms)
    joined_terms = '-'.join(top_terms['features'].values[:10])
    print(f"Named: {joined_terms}\n")
    cluster_names.append(joined_terms)

# Store, for each sampled post, the name of the cluster it was assigned to
df_new5['Cluster_Name'] = [cluster_names[assigned] for assigned in clusters]

In [None]:
# Name every mental-health cluster after its top terms (at most ten), joined by '-'
cluster_names = []
for cluster_id, top_terms in enumerate(top_words_per_cluster2):
    print(f"Cluster {cluster_id} words:")
    print(top_terms)
    joined_terms = '-'.join(top_terms['features'].values[:10])
    print(f"Named: {joined_terms}\n")
    cluster_names.append(joined_terms)

# Store, for each mental-health post, the name of the cluster it was assigned to
df_new4['Cluster_Name'] = [cluster_names[assigned] for assigned in clusters2]

In [None]:
df_new5.head()

In [None]:
df_new4.head()

In [None]:
# find number of posts in each cluster
df_new5['Cluster_Name'].value_counts()

In [None]:
# find number of posts in each cluster (related to mental health)
df_new4['Cluster_Name'].value_counts()

# Rate My Professors

In [None]:
prof = pd.read_csv('all_professor_comments.csv')

In [None]:
prof.head()

Unnamed: 0,Comment,Department,School,ProfessorName
0,Had him long ago. He was late to every class....,Economics department,McGill University,George Grantham Grantham
1,"my favourite prof at mcgill! very helpful, re...",Economics department,McGill University,George Grantham Grantham
2,Yawn.... attended class a few times but learne...,Economics department,McGill University,George Grantham Grantham
3,a very poor prof to say the least; can be undu...,Economics department,McGill University,George Grantham Grantham
4,I've taken his class three times now. Best cl...,Economics department,McGill University,George Grantham Grantham


In [None]:
prof['Comment'].isnull().sum()

In [None]:
# Drop rows with null values in the Comment column
prof = prof.dropna(subset=['Comment'])

In [None]:
prof['Comment'].isnull().sum()

In [None]:
# Use NB_classifier on the professor comments (prof['Comment'])
spamham_countVectorizer=vectorizer.transform(prof['Comment'])
print(spamham_countVectorizer)

y_predict_test=NB_classifier.predict(spamham_countVectorizer)
print (y_predict_test)

prof['predicted_issue'] = y_predict_test

In [None]:
prof.head()

In [None]:
prof['predicted_issue'].value_counts()

In [None]:
prof_new = prof[prof['predicted_issue'] == 1]

In [None]:
prof_new['Department'].value_counts()

In [None]:
# Keep the flagged professor comments that mention at least one mental-health
# term (case-insensitive regex match on the Comment column; missing values
# never match)
mental_health_pattern = (
    'mental health|stress|depression|anxiety|mental illness|counseling|support|'
    'well-being|coping|self-care|psychiatry|psychology|treatment|medication|'
    'coping strategies|peer support|self-esteem|suicidal ideation|trauma'
)
mentions_mental_health = prof_new['Comment'].str.contains(mental_health_pattern, case=False, na=False)
up = prof_new[mentions_mental_health]

In [None]:
up['Department'].value_counts()

In [None]:
import matplotlib.pyplot as plt

department_counts = up['Department'].value_counts()

# Departments with fewer than MIN_DEPARTMENT_COUNT comments are folded into a
# single "Other" slice so the chart is not cluttered with tiny wedges.
MIN_DEPARTMENT_COUNT = 8
other_count = department_counts[department_counts < MIN_DEPARTMENT_COUNT].sum()
department_counts = department_counts[department_counts >= MIN_DEPARTMENT_COUNT].copy()
if other_count > 0:  # nothing folded -> no empty 0.0% slice
    department_counts['Other'] = other_count

# Total number of comments shown in the chart (including "Other")
total_counts = department_counts.sum()

colors=[
    '#FF6347',  # Tomato
    '#4682B4',  # Steel Blue
    '#32CD32',  # Lime Green
    '#FFD700',  # Gold
    '#6A5ACD',  # Slate Blue
    '#FF69B4',  # Hot Pink
    '#8A2BE2',  # Blue Violet
    '#20B2AA',  # Light Sea Green
    '#FFA07A',  # Light Salmon
    '#DAA520',  # Goldenrod
    '#7FFF00',  # Chartreuse
    '#40E0D0'   # Turquoise
]

# Create the donut chart with a larger figure size
plt.figure(figsize=(10, 8))
ax = plt.subplot(111)

ax.pie(department_counts, startangle=90, counterclock=False, wedgeprops=dict(width=0.4), autopct='%1.1f%%', colors=colors)

# Draw a white circle at the center to create the donut hole
centre_circle = plt.Circle((0,0),0.70, fc='white')
ax.add_artist(centre_circle)

# Show the total count in the middle of the donut
plt.text(0, 0, f'Total\n{total_counts}', horizontalalignment='center', verticalalignment='center', fontsize=18)

# Legend placed outside the axes identifies each slice without cluttering the chart
ax.legend(department_counts.index, loc='upper left', bbox_to_anchor=(1, 1))

plt.title('Department Distribution of Mental Health Related Comments')

# Tight layout so the legend and title fit without overlap
plt.tight_layout()
plt.show()

In [None]:
up.head()

In [None]:
# Show full Comment column
pd.set_option('display.max_colwidth', None)

up.head()

## Department Clustering

In [None]:
prof_new.head()

In [None]:
prof_eng = prof_new[prof_new['Department'] == 'Engineering department']

In [None]:
prof_eng.shape

In [None]:
# Change column name from 'Comment' to 'text'
prof_eng = prof_eng.rename(columns={'Comment': 'text'})

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')  # Download necessary NLTK data

# Initialize stemmer
stemmer = PorterStemmer()

def preprocess_text(text):
    """Tokenize *text* with NLTK and return its stemmed tokens joined by spaces.

    Every token produced by ``word_tokenize`` is passed through the
    module-level ``stemmer``; the stems are re-joined with single spaces.
    """
    return ' '.join(map(stemmer.stem, word_tokenize(text)))

# Tokenize and stem the engineering-department comments
prof_eng['processed_synopses'] = prof_eng['text'].apply(preprocess_text)

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
tfidf_matrix = tfidf_vectorizer.fit_transform(prof_eng['processed_synopses'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(tfidf_matrix)

In [None]:
from sklearn.cluster import KMeans

# Cluster the engineering comments into a fixed number of groups.
# random_state makes the cluster assignment reproducible across runs, and
# n_init is pinned so the result does not depend on the installed scikit-learn
# default; both match the settings of the earlier KMeans models in this notebook.
num_clusters = 5
km = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [None]:
from sklearn.manifold import MDS
import matplotlib.pyplot as plt

# Project the documents to 2-D with MDS, using 1 - cosine similarity as the
# precomputed dissimilarity between documents
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(1 - similarity_matrix)

# Extract the coordinates
xs, ys = pos[:, 0], pos[:, 1]

# Scatter plot of the projection, coloured by cluster assignment
plt.figure(figsize=(12, 8))
scatter = plt.scatter(xs, ys, c=clusters)

# Build the legend from the scatter itself so that every legend colour is the
# colour actually used for that cluster's points. (Empty placeholder scatters
# would take their colours from the default colour cycle, not the colormap.)
# num=None requests one entry per unique cluster id.
handles, legend_labels = scatter.legend_elements(num=None, fmt='Cluster {x:.0f}')

plt.xlabel('MDS Dimension 1')
plt.ylabel('MDS Dimension 2')
plt.title('MDS Visualization of Document Clusters')
plt.legend(handles, legend_labels, title='Clusters')
plt.show()

In [None]:
def get_top_features_cluster(tfidf_array, prediction, n_feats, feature_names=None):
    """Return, for every cluster, its ``n_feats`` highest-scoring TF-IDF terms.

    Parameters
    ----------
    tfidf_array : 2-D array with one row per document and one column per term.
    prediction : cluster labels, one per row of ``tfidf_array``.
    n_feats : number of top terms to keep per cluster.
    feature_names : optional sequence mapping column index -> term. When
        omitted, the vocabulary of the module-level ``tfidf_vectorizer`` is used.

    Returns
    -------
    list of DataFrame
        One frame per unique label, in sorted label order, with the columns
        ``features`` (term) and ``score`` (mean TF-IDF weight in the cluster).
    """
    if feature_names is None:
        # Looked up once, not per cluster: the vocabulary does not change in the loop
        feature_names = tfidf_vectorizer.get_feature_names_out()
    prediction = np.asarray(prediction)  # so that == yields an element-wise mask
    dfs = []
    for label in np.unique(prediction):
        # mean tf-idf value of every term over the documents of this cluster
        x_means = np.mean(tfidf_array[prediction == label], axis=0)
        top_idx = np.argsort(x_means)[::-1][:n_feats]  # highest means first
        best_features = [(feature_names[i], x_means[i]) for i in top_idx]
        dfs.append(pd.DataFrame(best_features, columns=['features', 'score']))
    return dfs

top_words_per_cluster = get_top_features_cluster(tfidf_matrix.toarray(), km.labels_, 10)  # Adjust 10 to get more or fewer words

In [None]:
# Name every engineering cluster after its top terms (at most ten), joined by '-'
cluster_names = []
for cluster_id, top_terms in enumerate(top_words_per_cluster):
    print(f"Cluster {cluster_id} words:")
    print(top_terms)
    joined_terms = '-'.join(top_terms['features'].values[:10])
    print(f"Named: {joined_terms}\n")
    cluster_names.append(joined_terms)

# Store, for each engineering comment, the name of the cluster it was assigned to
prof_eng['Cluster_Name'] = [cluster_names[assigned] for assigned in clusters]

In [None]:
# find number of posts in each cluster
prof_eng['Cluster_Name'].value_counts()

## Sentiment Analysis

In [None]:
prof.head()

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download the vader_lexicon
nltk.download('vader_lexicon')

# Initialize the VADER sentiment intensity analyzer
sid = SentimentIntensityAnalyzer()

# Function to get the sentiment score of text
def get_sentiment_score(text):
    """Return the VADER polarity scores computed by ``sid`` for *text*.

    The result is whatever ``sid.polarity_scores`` returns; later cells read
    its ``'compound'`` entry to classify the comment.
    """
    return sid.polarity_scores(text)

# Apply the function to the Comment column
prof['sentiment'] = prof['Comment'].apply(get_sentiment_score)

# Display the DataFrame
print(prof.head())

In [None]:
prof.head()

In [None]:
def classify_sentiment(score, threshold=0.0):
    """Classify a sentiment score dict by its ``'compound'`` value.

    Parameters
    ----------
    score : mapping with a numeric ``'compound'`` entry.
    threshold : half-width of the neutral band around zero. With the default
        of 0.0 only a compound score of exactly 0 is neutral.

    Returns
    -------
    str
        ``'Positive'`` if compound > threshold, ``'Negative'`` if
        compound < -threshold, otherwise ``'Neutral'``.
    """
    compound = score['compound']
    if compound > threshold:
        return 'Positive'
    if compound < -threshold:
        return 'Negative'
    # A score inside the band carries no detectable polarity; counting it as
    # 'Negative' would inflate any negative-comment ratio built on this label.
    return 'Neutral'

# Apply the function to the sentiment column
prof['sentiment_class'] = prof['sentiment'].apply(classify_sentiment)

# Display the DataFrame
print(prof.head())

In [None]:
prof.head()

In [None]:
# Find the 10 departments with the most comments
top_departments = prof['Department'].value_counts().nlargest(10).index

# Restrict the analysis to comments from those departments
prof_top_departments = prof[prof['Department'].isin(top_departments)]

# Comments whose sentiment_class is 'Negative'
negative_comments = prof_top_departments[prof_top_departments['sentiment_class'] == 'Negative']

# Total number of comments in each department
total_counts = prof_top_departments['Department'].value_counts()

# Number of negative comments in each department. Reindexing on total_counts
# gives a department without any negative comment an explicit 0; without it the
# department would be missing here and its ratio below would become NaN.
negative_counts = negative_comments['Department'].value_counts().reindex(total_counts.index, fill_value=0)

# Ratio of negative to total comments per department
negative_ratio = negative_counts / total_counts

# Sort the departments by the negative ratio in descending order
ranking = negative_ratio.sort_values(ascending=False)

print(ranking)

# Importing matplotlib for plotting
import matplotlib.pyplot as plt

# Plotting the ranking as a bar chart
plt.figure(figsize=(10, 6))
ranking.plot(kind='bar', color='skyblue')
plt.title('Negative Comment Ratio by Department')
plt.xlabel('Department')
plt.ylabel('Ratio of Negative Comments')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()  # Adjusts the plot to ensure everything fits without overlapping
plt.show()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=893dd5c2-ff0a-49e9-afe5-f4b2b61fffff' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>