In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
!pip install textblob
from textblob import TextBlob



In [3]:
# import Beautiful Soup, NumPy and Pandas, etc
import bs4 as bs
import numpy as np
import pandas as pd
import re
import hashlib
 
# download NLTK classifiers - these are cached locally on your machine
import nltk
# Fetch each required NLTK resource in turn (tagger, WordNet, stopwords, tokenizer).
for _nltk_pkg in ('averaged_perceptron_tagger', 'wordnet', 'stopwords', 'punkt'):
    nltk.download(_nltk_pkg)

# import ml classifiers
from nltk.tokenize import sent_tokenize # tokenizes sentences
from nltk.stem import PorterStemmer     # parsing/stemmer
from nltk.tag import pos_tag            # parts-of-speech tagging
from nltk.corpus import wordnet         # sentiment scores
from nltk.stem import WordNetLemmatizer # stem and context
from nltk.corpus import stopwords       # stopwords
from nltk.util import ngrams            # ngram iterator

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /opt/conda/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /opt/conda/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /opt/conda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /opt/conda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Module-level NLP helpers shared by reviewcleaner below.
# English stopwords are kept in a set so membership tests stay cheap.
eng_stopwords = set(stopwords.words('english'))
wnl = WordNetLemmatizer()  # lemmatizer (default word normaliser in reviewcleaner)
ps = PorterStemmer()       # stemmer (alternative word normaliser)

## NLP Code to clean texts, and lemmatize prior to finding polarity scores
## taken from HW3 Solution Video
def reviewcleaner(review, lemmatize = True, stem = False):
    """Clean a single piece of text prior to sentiment scoring.

    Steps: strip HTML markup, collect emoticons, replace every non-letter
    character with a space, lower-case and split into words, drop English
    stopwords, then lemmatize or stem each remaining word.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML.
    lemmatize : bool, default True
        Lemmatize each kept word with the module-level ``wnl``.
    stem : bool, default False
        Stem each kept word with the module-level ``ps``; only applied when
        ``lemmatize`` is false.

    Returns
    -------
    str
        The cleaned words joined by single spaces, followed by any emoticons
        found in the text.

    Raises
    ------
    RuntimeError
        If both ``lemmatize`` and ``stem`` are true.
    """
    if lemmatize and stem:
        raise RuntimeError("May not pass both as true")

    #Remove HTML Tags. The parser is named explicitly so bs4 does not warn and
    #the result does not depend on which optional parsers happen to be installed.
    review = bs.BeautifulSoup(review, "html.parser").text

    #use regex to find emoticons (must happen before punctuation is stripped).
    #Raw string: \) and \( are regex escapes, not valid Python string escapes.
    #Note that each match includes the leading space required by the pattern.
    emoticons = re.findall(r' (?::|;|=)(?:-)?(?:\)|\(|D|P)', review)

    #Remove punctuation (anything that is not an ASCII letter becomes a space)
    review = re.sub("[^a-zA-Z]", ' ', review)

    #Tokenize into words (all lower case)
    words = review.lower().split()

    #remove stopwords, lemmatize, stem
    clean_review = []
    for word in words:
        if word in eng_stopwords:
            continue
        if lemmatize:
            word = wnl.lemmatize(word)
        elif stem:
            word = ps.stem(word)
        clean_review.append(word)

    #join the review to one sentence and hand it back to the caller
    review_processed = ' '.join(clean_review + emoticons)
    return review_processed

In [5]:
#Function to deal with missing values in text of social media posts
def fillnaf(tbl):
    """Replace missing ``Body`` values with the ``Title`` of the same row.

    The ``Body`` column of ``tbl`` is overwritten in place, and the same
    DataFrame object is returned for convenience.

    Parameters
    ----------
    tbl : pandas.DataFrame
        Frame with at least the columns ``"Body"`` and ``"Title"``.

    Returns
    -------
    pandas.DataFrame
        ``tbl`` itself, with gaps in ``"Body"`` filled from ``"Title"``.
    """
    # fillna with a Series aligns on the index, so each gap takes its own row's title.
    tbl["Body"] = tbl["Body"].fillna(tbl["Title"])
    return tbl

## Calculate Polarities per University

In [6]:
#Load in Berkeley Data (pre-pandemic); posts with no Body fall back to their Title
berk = pd.read_csv('berkprepandemic.csv')
berk = fillnaf(berk)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in berk["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
berk["Polarity"] = polscores
berk['Subjectivity'] = subscores
berk['University'] = 'UC Berkeley'

In [9]:
#Load in UCLA Data (pre-pandemic); posts with no Body fall back to their Title
ucla = pd.read_csv('uclaprepandemic.csv')
ucla = fillnaf(ucla)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in ucla["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
ucla["Polarity"] = polscores
ucla['Subjectivity'] = subscores
ucla['University'] = 'UCLA'

In [12]:
#Load in Cornell Data (pre-pandemic); posts with no Body fall back to their Title
cornell = pd.read_csv('cornellprepandemic.csv')
cornell = fillnaf(cornell)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in cornell["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
cornell["Polarity"] = polscores
cornell['Subjectivity'] = subscores
cornell['University'] = 'Cornell'

In [13]:
#Load in UC San Diego Data (pre-pandemic); posts with no Body fall back to their Title
ucsd = pd.read_csv('ucsdprepandemic.csv')
ucsd = fillnaf(ucsd)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in ucsd["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
ucsd["Polarity"] = polscores
ucsd['Subjectivity'] = subscores
ucsd['University'] = 'UC San Diego'

In [14]:
#Load in UC Santa Barbara Data (pre-pandemic); posts with no Body fall back to their Title
ucsb = pd.read_csv('ucsbprepandemic.csv')
ucsb = fillnaf(ucsb)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in ucsb["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
ucsb["Polarity"] = polscores
ucsb['Subjectivity'] = subscores
ucsb['University'] = 'UC Santa Barbara'

In [15]:
#Load in UC Irvine Data (pre-pandemic); posts with no Body fall back to their Title
irvine = pd.read_csv('irvineprepandemic.csv')
irvine = fillnaf(irvine)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in irvine["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
irvine["Polarity"] = polscores
irvine['Subjectivity'] = subscores
irvine['University'] = 'UC Irvine'

In [16]:
#Load in UC Santa Cruz Data (pre-pandemic); posts with no Body fall back to their Title
ucsc = pd.read_csv('ucscprepandemic.csv')
ucsc = fillnaf(ucsc)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in ucsc["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
ucsc["Polarity"] = polscores
ucsc['Subjectivity'] = subscores
ucsc['University'] = 'UC Santa Cruz'

In [17]:
#Load in UC Davis Data (pre-pandemic); posts with no Body fall back to their Title
davis = pd.read_csv('ucdprepandemic.csv')
davis = fillnaf(davis)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in davis["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
davis["Polarity"] = polscores
davis['Subjectivity'] = subscores
davis['University'] = 'UC Davis'

In [18]:
#Load in NYU Data (pre-pandemic); posts with no Body fall back to their Title
nyu = pd.read_csv('nyuprepandemic.csv')
nyu = fillnaf(nyu)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in nyu["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
nyu["Polarity"] = polscores
nyu['Subjectivity'] = subscores
nyu['University'] = 'NYU'

In [19]:
#Load in UC Riverside Data (pre-pandemic); posts with no Body fall back to their Title
riverside = pd.read_csv('riversideprepandemic.csv')
riverside = fillnaf(riverside)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in riverside["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
riverside["Polarity"] = polscores
riverside['Subjectivity'] = subscores
riverside['University'] = 'UC Riverside'

In [20]:
#Load in Stanford Data (pre-pandemic); posts with no Body fall back to their Title
stanford = pd.read_csv('stanfordprepandemic.csv')
stanford = fillnaf(stanford)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in stanford["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
stanford["Polarity"] = polscores
stanford['Subjectivity'] = subscores
stanford['University'] = 'Stanford'

In [21]:
#Load in Harvard Data (pre-pandemic); posts with no Body fall back to their Title
harvard = pd.read_csv('harvardprepandemic.csv')
harvard = fillnaf(harvard)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in harvard["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
harvard["Polarity"] = polscores
harvard['Subjectivity'] = subscores
harvard['University'] = 'Harvard'

In [22]:
#Load in MIT Data (pre-pandemic); posts with no Body fall back to their Title
mit = pd.read_csv('mitprepandemic.csv')
mit = fillnaf(mit)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in mit["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
mit["Polarity"] = polscores
mit['Subjectivity'] = subscores
mit['University'] = 'MIT'

In [23]:
#Load in UC Merced Data (pre-pandemic); posts with no Body fall back to their Title
merced = pd.read_csv('ucmprepandemic.csv')
merced = fillnaf(merced)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in merced["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
merced["Polarity"] = polscores
merced['Subjectivity'] = subscores
merced['University'] = 'UC Merced'

In [24]:
#Load in Yale Data (pre-pandemic); posts with no Body fall back to their Title
yale = pd.read_csv('yaleprepandemic.csv')
yale = fillnaf(yale)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in yale["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
yale["Polarity"] = polscores
yale['Subjectivity'] = subscores
yale['University'] = 'Yale'

In [25]:
#Load in Princeton Data (pre-pandemic); posts with no Body fall back to their Title
princeton = pd.read_csv('princetonprepandemic.csv')
princeton = fillnaf(princeton)
# NOTE: the reviewcleaner preprocessing is not applied here; TextBlob scores the raw text.

#Calculates polarity and subjectivity scores for each post.
# Iterate over the column itself instead of a hard-coded row count, so the cell
# keeps working if the CSV changes size, and analyse each post only once.
polscores = []
subscores = []
for body in princeton["Body"]:
    sentiment = TextBlob(body).sentiment
    polscores.append(sentiment.polarity)
    subscores.append(sentiment.subjectivity)
princeton["Polarity"] = polscores
princeton['Subjectivity'] = subscores
princeton['University'] = 'Princeton'

## Combining All Frames to One

In [26]:
#Combining all dataframes to one
# Stack the per-university frames row-wise; ignore_index=True discards each
# frame's own row labels and renumbers the combined rows from 0.
frames = [
    berk, ucla, ucsd, merced,
    ucsc, ucsb, riverside, davis,
    irvine, stanford, mit, yale,
    princeton, cornell, harvard, nyu,
]
alluniversitypolarity = pd.concat(objs=frames, ignore_index=True)

In [27]:
#Drop Unnecessary Columns and Convert Dataframe to Csv
# All unneeded columns are removed in one in-place call; what remains is
# written out (row index included) for later use.
unneeded_columns = [
    'Post ID', 'Url', 'Author', 'Permalink', 'Flair',
    'Title', 'Score',
    'Body',
]
alluniversitypolarity.drop(columns=unneeded_columns, inplace=True)
alluniversitypolarity.to_csv('prepandemicpolarity.csv')