In [None]:
"""
GCP and Colab: This notebook contains the code for analyzing the sentiment scores: temporal analysis, topic modeling, and word-cloud generation.
It uses PySpark to process the scores.
"""

In [None]:
# Environment bootstrap: each line below is an IPython `!` shell escape, not Python.
# The Spark tarball is unpacked into the current working directory; SPARK_HOME is
# later set to /content/spark-3.1.1-bin-hadoop3.2, so this cell presumably runs from /content.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null                                     # install java
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz     # download spark
!tar xf spark-3.1.1-bin-hadoop3.2.tgz                                                       # untar spark                   
!pip install -q findspark                                                                   # install findspark                                       
!pip install emoji                                                                          # install emoji

In [None]:
import os
# Point findspark/PySpark at the JDK and the unpacked Spark distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",    # java home
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",  # spark home
})

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime,
# so this cell is expected to fail elsewhere — confirm for the GCP setup.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# import required libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Topic modeling helper (scikit-learn LDA over a bag-of-words matrix)
def LDA(preprocessed_posts, num_topics=10, num_top_words=10, random_state=42):
    """Fit an LDA topic model on the given posts and print the top words per topic.

    Parameters
    ----------
    preprocessed_posts : sequence of str
        One already-preprocessed document per element.
    num_topics : int, default 10
        Number of topics (``n_components``) to fit.
    num_top_words : int, default 10
        How many of the highest-weighted words to report for each topic.
    random_state : int, default 42
        Seed passed to ``LatentDirichletAllocation`` for reproducibility.

    Returns
    -------
    list of (int, list of str)
        ``(topic_index, top_words)`` pairs, in topic order. Empty when no
        documents were supplied.
    """
    print(f"Preprocessed {len(preprocessed_posts)} records..")

    # CountVectorizer cannot build a vocabulary from zero documents; bail out
    # early instead of failing deep inside scikit-learn.
    if len(preprocessed_posts) == 0:
        print("No records to model; skipping LDA")
        return []

    # Create a document-term matrix (sparse) using CountVectorizer
    vectorizer = CountVectorizer()
    doc_term_matrix = vectorizer.fit_transform(preprocessed_posts)

    print("LDA Checkpoint 1")

    # Apply LDA topic modeling
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=random_state)
    lda_model.fit(doc_term_matrix)

    print("LDA Checkpoint 2")

    # For every topic, pick the words with the largest component weights
    feature_names = vectorizer.get_feature_names_out()
    most_common_topics = []
    for topic_idx, topic in enumerate(lda_model.components_):
        # argsort is ascending: reverse it, then keep the first `num_top_words`
        top_indices = topic.argsort()[::-1][:num_top_words]
        top_words = [feature_names[i] for i in top_indices]
        most_common_topics.append((topic_idx, top_words))

    # Print the most common topics
    for topic in most_common_topics:
        print(topic)

    return most_common_topics

In [None]:
# import required libraries
import findspark
findspark.init()
from pyspark.sql import SparkSession
import datetime
from pyspark import AccumulatorParam
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import json

# Build (or reuse) a SparkSession whose master is "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# class to accumulate dictionaries
class DictParam(AccumulatorParam):
    """Accumulator parameter that merges dictionaries by adding values per key."""

    def zero(self, value=None):
        """Return the identity element for the merge: a fresh, empty dict.

        `value` is accepted for interface compatibility and ignored. Returning
        the passed-in value (as before) hands workers a non-empty starting
        point whenever the accumulator already holds data, so those totals get
        merged back into the driver a second time. A fresh dict also avoids
        sharing one mutable default between calls.
        """
        return {}

    def addInPlace(self, dict1, dict2):
        """Merge `dict2` into `dict1` (mutating `dict1`) and return `dict1`.

        Keys present in both are combined with `+`; new keys are copied over.
        """
        for k, v in dict2.items():
            if k in dict1:
                dict1[k] = dict1[k] + v
            else:
                dict1[k] = v
        return dict1

# Driver-side accumulators, each starting from an empty dict and merged via DictParam:
#   month_acc - per-key sum of the score field (filled by mapAndAggregate)
#   count_acc - per-key record count (filled by mapAndAggregate)
#   word_acc  - word -> occurrence count (filled by collect_word_freq)
month_acc, count_acc, word_acc = (
    spark.sparkContext.accumulator(dict(), DictParam()) for _ in range(3)
)

# map and aggregate
def mapAndAggregate(rec):
    """Split one raw line on "DELIM" and add its score to the per-month accumulators.

    Field 1 is parsed as a Unix timestamp (converted with the local timezone)
    and field 3 as a float score. The score is added to `month_acc` and a
    count of 1 to `count_acc`, both keyed by calendar month (1-12).

    Records whose timestamp or score cannot be parsed are reported with an
    "ERROR" line and skipped for aggregation. The split record is returned in
    every case, so malformed records still flow downstream.
    """
    global month_acc
    global count_acc
    rec = rec.split("DELIM")
    try:
        # Parse both fields before touching the accumulators so a bad score
        # never leaves a half-applied update.
        k = datetime.datetime.fromtimestamp(float(rec[1])).month  # .strftime("%Y-%m") for year-month keys
        score = float(rec[3])
    except (IndexError, ValueError, OverflowError, OSError):
        # Only parsing problems are expected here; anything else (for example
        # an accumulator failure) should surface instead of being swallowed.
        print("ERROR", rec)
        return rec
    month_acc += {k: score}
    count_acc += {k: 1}
    return rec

# read data
# The map has accumulator side effects, and Spark re-runs transformations for
# every action on an uncached RDD. Cache the mapped records so the later
# filter/count/collect/foreach steps do not push the same records into
# month_acc / count_acc again.
data = spark.sparkContext.textFile("Data/University-Scores/combined-scores").map(mapAndAggregate).cache()
length = data.count()

# Work on copies: `.value` hands back the accumulator's own dict, and averaging
# it in place would overwrite the accumulated sums.
acad_year = dict(month_acc.value)
post_count = dict(count_acc.value)

# Persist the raw per-key score sums and record counts.
# NOTE(review): these files are keyed by calendar month and hold sums, while the
# plotting cells that read month-acc.json cast the keys to 'Year' and plot a
# 0.2-0.5 range — confirm which aggregation those cells expect.
with open("month-acc.json", "w") as fi:
    json.dump(acad_year, fi)
with open("count-acc.json", "w") as fi:
    json.dump(post_count, fi)

# Average score per key = sum of scores / number of records
acad_year = {month: total / post_count[month] for month, total in acad_year.items()}

acad_year = sorted(acad_year.items())
# zip(*[]) cannot be unpacked into two names; fall back to empty series when
# no record was parsed successfully.
months, values = zip(*acad_year) if acad_year else ((), ())

# Plot the monthly values
plt.plot(months, values)
plt.xlabel('Month')  # keys are calendar months (1-12), matching the ticks below
plt.ylabel('Value')
plt.title('Monthly Values')
plt.xticks(range(1, 13))  # Set x-axis ticks for each month
# plt.xticks(range(2008, 2024))  # alternative ticks when aggregating per year
plt.show()

def _is_stressed(rec, threshold=0.7):
    """Return True when the record's score field (index 3) is >= `threshold`.

    mapAndAggregate passes malformed records through unchanged, so a missing
    or non-numeric score must count as "not stressed" rather than abort the job.
    """
    try:
        return float(rec[3]) >= threshold
    except (IndexError, ValueError):
        return False


print("Checkpoint1: ", length)
stressData = data.filter(_is_stressed)
print("Checkpoint2, Stress data: ", stressData.count())

# Topic modeling: field 2 of each record is the text handed to LDA
TMdata = stressData.map(lambda x: x[2]).collect()
print("Collected stress data, calling LDA")
LDA(TMdata)

# Generate word frequencies
def collect_word_freq(rec):
    """Count whitespace-separated tokens in field 2 of `rec` and add them to `word_acc`."""
    global word_acc
    counts = {}
    for token in rec[2].split():
        counts[token] = counts.get(token, 0) + 1
    word_acc += counts

# WordCloud input: count the words of every stress record into word_acc,
# then persist the totals as JSON.
stressData.foreach(collect_word_freq)
print("Word frequencies done")
word_freq = word_acc.value
with open("word-acc.json", "w") as out_file:
    json.dump(word_freq, out_file)

# The word cloud itself is rendered in the following cell, which reloads
# word-acc.json and removes stop words first.

# stop spark session
spark.stop()


In [None]:
# import required libraries
import json
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords

# English stop words shipped with NLTK (the corpus is downloaded on demand)
nltk.download('stopwords')
nltk_stop_words = stopwords.words('english')

from stop_words import get_stop_words

# English stop words from the `stop_words` package
py_stop_words = get_stop_words('en')

# Extra filler words specific to this corpus
custom_stop_words = [
    "anyone", "get", "really", "im", "going", "sure", "y’all", "also", "keep",
    "one", "like", "else", "i’m", "it’s", "got", "say", "go", "come", "dont",
    "don’t", "take", "day", "thought", "two", "getting", "you’re", "since",
    "even", "cant", "taken", "know", "someone", "i’ve", "ive", "well",
    "making", "thing", "didn’t", "haven’t", "want", "yet", "wanted", "make",
    "already", "coming",
]

# Union of the three sources, as a set for fast membership tests
combined = set().union(nltk_stop_words, py_stop_words, custom_stop_words)

with open("word-acc.json", "r") as fi:
    word_freq = json.load(fi)

# Keep only informative words: longer than 2 characters and not a stop word.
# The comparison is case-insensitive because the custom stop-word list is
# written in lowercase, while the counted tokens are taken verbatim from the
# text and may be capitalised.
word_freq = {
    word: count
    for word, count in word_freq.items()
    if len(word) > 2 and word.lower() not in combined
}

# Build the cloud image from the filtered word -> count mapping
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate_from_frequencies(word_freq)

# Show it on a single axes with the frame and ticks hidden
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
plt.show()

In [None]:
"""
Fig 3: Analysis of stress scores of college Reddit communities over the past 10 years
"""
# Create area chart using Plotly
import json

import pandas as pd  # needed here: this cell runs before the cell that first imports pandas
import plotly.express as px
import plotly.graph_objects as go

# Inclusive range of years drawn as the highlighted "Covid Years" overlay
COVID_FIRST_YEAR, COVID_LAST_YEAR = 2020, 2022

with open('/content/month-acc.json') as json_file:
    data = json.load(json_file)
# NOTE(review): the keys are treated as years and the values as average scores
# (see the axis ranges below) — confirm that the file was produced that way.

# Convert the data dictionary to a DataFrame with columns 'Year' and 'Value'
df = pd.DataFrame.from_dict(data, orient='index', columns=['Value'])
df = df.reset_index().rename(columns={'index': 'Year'})
# JSON keys are strings: cast first, then sort, so the order is numeric
# rather than lexicographic.
df['Year'] = df['Year'].astype(int)
df = df.sort_values('Year').reset_index(drop=True)

highlight_mask = (df['Year'] >= COVID_FIRST_YEAR) & (df['Year'] <= COVID_LAST_YEAR)

# Base area: the full series, filled down to zero with no line or markers
fig = go.Figure(data=[
    go.Scatter(x=df['Year'],
               y=df['Value'],
               fill='tozeroy',
               mode='none',
               fillcolor='rgb(36, 142, 124)',
               opacity=0.01
               )
])

# Overlay: the highlighted years, drawn as a hatched area with an outline.
# (The former update_traces call with selector {'Year': 2010} matched no trace
# and has been removed.)
fig.add_trace(go.Scatter(
    x=df['Year'][highlight_mask],
    y=df['Value'][highlight_mask],
    mode='lines',
    fill='tozeroy',
    fillpattern=dict(fgcolor='rgb(153, 153, 255)', fillmode='replace', shape="x"),
    line=dict(color='rgb(153, 153, 255)'),
    name='Covid Years',
    opacity=.5
))

# Single layout call holding the final values of the earlier incremental updates
fig.update_layout(
    title='Analysis of stress scores of college reddit communities over the past 10 years',
    xaxis=dict(
        title='Year',
        tickmode='array',
        tickvals=df['Year'],
        ticktext=df['Year'],
        range=[2012, 2022]
    ),
    yaxis=dict(title='Average Stress scores', range=[0.2, 0.5]),
    showlegend=False
)

fig.show()

In [None]:
import pandas as pd
import plotly.express as px  # used for the colour scale below
import plotly.graph_objects as go

# Load the data from University Stress Scores CSV file into a Pandas DataFrame
# (the file is read as header-less, with the two columns named here)
data = pd.read_csv('/content/university_stress_scores.csv', names=['subreddit','scores'])

# data.drop(data[data['subreddit'] == 'colum'].index, inplace = True)
# Express every score as its deviation from the mean over all subreddits
data['scores'] = data['scores'] - data['scores'].mean()
# Extract the university subreddit names and stress scores from the DataFrame
university_subreddits = data['subreddit']
stress_scores = data['scores']

# Create a color scale using an increasing color gradient
color_scale = px.colors.sequential.Reds

# Create the bar graph using Plotly. A colorscale only takes effect when
# marker.color is a numeric array, so the bars are coloured by their score.
fig = go.Figure(data=[go.Bar(x=university_subreddits,
                             y=stress_scores,
                             marker=dict(color=stress_scores, colorscale=color_scale)
                             )])

fig.update_layout(
    xaxis=dict(
        title='University Subreddit',
        tickmode='linear',                    # one tick per category
        tickangle=-45,                        # slanted labels
        # Alphabetical category order. The former categoryarray setting was
        # dropped: it is only honoured when categoryorder is 'array'.
        categoryorder='category ascending'
    ),
    yaxis=dict(title='Deviation from Avg. Stress Scores')
)
# fig.update_layout(xaxis_range=[0,0.5])
fig.show()

In [None]:
"""
Fig 4: Temporal analysis over an academic year
"""
import plotly.express as px
import calendar

# Load the data from JSON file into a pandas DataFrame (one row per month key)
# NOTE(review): no cell shown in this notebook writes acad-year-acc.json —
# confirm where the file comes from.
data = pd.read_json('/content/acad-year-acc.json', orient='index')
data = data.reset_index()
data.columns = ['month', 'score']

# Make the month numeric before sorting and before the calendar lookup, so the
# result does not depend on how read_json inferred the index type.
data['month'] = data['month'].astype(int)
# Sort the data by month in ascending order
data = data.sort_values('month')
# Replace the month number with its abbreviated name
data['month'] = data['month'].map(lambda m: calendar.month_abbr[m])

# Create the line chart using Plotly
fig = px.line(data, x='month', y='score')

# Customize the chart layout
fig.update_layout(xaxis_title='Months',
                  yaxis_title='Average Stress Score')

# Show the chart
fig.show()

In [None]:
import json

# NOTE(review): this cell repeats the Fig 3 cell earlier in the notebook;
# consider keeping only one of them.
with open('/content/month-acc.json') as json_file:
    data = json.load(json_file)

# Convert the data dictionary to a DataFrame with columns 'Year' and 'Value'
df = pd.DataFrame.from_dict(data, orient='index', columns=['Value'])
df = df.reset_index().rename(columns={'index': 'Year'})
# The JSON keys are strings; cast to int before sorting so the order is numeric
df['Year'] = df['Year'].astype(int)
df = df.sort_values('Year').reset_index(drop=True)

# Rows falling in the highlighted "Covid Years" window (2020-2022 inclusive)
highlight_mask = df['Year'].between(2020, 2022)

base_area = go.Scatter(
    x=df['Year'],
    y=df['Value'],
    fill='tozeroy',
    mode='none',
    fillcolor='rgb(36, 142, 124)',
    opacity=0.01
)
# Hatched overlay for the highlighted window. The former update_traces call
# with selector {'Year': 2010} matched no trace and has been removed.
covid_area = go.Scatter(
    x=df['Year'][highlight_mask],
    y=df['Value'][highlight_mask],
    mode='lines',
    fill='tozeroy',
    fillpattern=dict(fgcolor='rgb(153, 153, 255)', fillmode='replace', shape="x"),
    line=dict(color='rgb(153, 153, 255)'),
    name='Covid Years',
    opacity=.5
)

fig = go.Figure(data=[base_area, covid_area])

# Final layout, set once instead of through several overlapping updates
fig.update_layout(
    title='Analysis of stress scores of college reddit communities over the past 10 years',
    xaxis=dict(
        title='Year',
        tickmode='array',
        tickvals=df['Year'],
        ticktext=df['Year'],
        range=[2012, 2022]
    ),
    yaxis=dict(title='Average Stress scores', range=[0.2, 0.5]),
    showlegend=False
)

fig.show()