In [1]:
import os

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import wordcloud
import nltk
import string
import textblob

In [3]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
sns.set_style('whitegrid')

In [6]:
#Reading in the data
corpus = pd.read_csv('corpus.csv', header=None, names=['text'])

In [7]:
#Splitting message into individual tokens
corpus['tokens'] = corpus['text'].str.split(' ')

In [8]:
#Appending every individual word in the corpus to a list
words = []
for i in range(corpus.shape[0]):
    message_length = len(corpus['tokens'].iloc[i])
    for j in range(message_length):
        words.append(corpus['tokens'].iloc[i][j])

In [9]:
#Removing capitalization and punctuation from the list of all individual words
cleanwords = []
for word in words:
    word = word.lower()
    word = word.translate(str.maketrans('', '', string.punctuation))
    cleanwords.append(word)

In [10]:
#Message length in tokens
corpus['length'] = corpus['tokens'].apply(len)

In [11]:
#Message length in characters
corpus['charlength'] = corpus['text'].apply(len)

In [12]:
#Functions to get textblob subjectivity and polarity for each message

def get_subjectivity(text):
    return textblob.TextBlob(text).sentiment.subjectivity

def get_polarity(text):
    return textblob.TextBlob(text).sentiment.polarity

In [13]:
corpus['subjectivity'] = corpus['text'].apply(get_subjectivity)
corpus['polarity'] = corpus['text'].apply(get_polarity)

In [14]:
#Using polarity value to determine message sentiment judgment

def get_analysis(score):
    if score < 0:
        return 'negative'
    elif score == 0:
        return 'neutral'
    else:
        return 'positive'

In [15]:
corpus['analysis'] = corpus['polarity'].apply(get_analysis)

In [16]:
#Positive, negative, neutral messages
pos = corpus[corpus['analysis'] == 'positive']
neg = corpus[corpus['analysis'] == 'negative']
neutral = corpus[corpus['analysis'] == 'neutral']

In [18]:
#Percentages for each category
posp = round((pos.shape[0]/4000)*100, 1)
negp = round((neg.shape[0]/4000)*100, 1)
neutralp = round((neutral.shape[0]/4000)*100, 1)

percentages = f"positive: {posp}%, negative: {negp}%, neutral: {neutralp}%"
print(percentages)

positive: 20.5%, negative: 9.8%, neutral: 69.7%


In [19]:
pos.describe()

Unnamed: 0,length,charlength,subjectivity,polarity
count,822.0,822.0,822.0,822.0
mean,4.246959,19.287105,0.592239,0.4731731
std,2.778214,12.843774,0.246484,0.2460475
min,1.0,2.0,0.1,5.5511150000000004e-17
25%,2.0,9.0,0.45,0.25
50%,4.0,17.0,0.6,0.5
75%,6.0,27.0,0.7,0.6875
max,14.0,55.0,1.0,1.0


In [20]:
neg.describe()

Unnamed: 0,length,charlength,subjectivity,polarity
count,391.0,391.0,391.0,391.0
mean,4.872123,22.143223,0.623429,-0.372415
std,2.729519,12.591815,0.298433,0.234659
min,1.0,2.0,0.0,-1.0
25%,3.0,13.0,0.4,-0.5
50%,5.0,21.0,0.6,-0.333333
75%,6.0,29.0,1.0,-0.166667
max,15.0,59.0,1.0,-0.00625


In [21]:
neutral.describe()

Unnamed: 0,length,charlength,subjectivity,polarity
count,2787.0,2787.0,2787.0,2787.0
mean,3.374596,14.958378,0.017086,0.0
std,2.330463,11.063725,0.106189,0.0
min,1.0,1.0,0.0,0.0
25%,1.0,6.0,0.0,0.0
50%,3.0,12.0,0.0,0.0
75%,5.0,21.0,0.0,0.0
max,16.0,64.0,1.0,0.0


In [22]:
#Generating a word cloud which can be found in the visualizations folder
cleanwords2 = ' '.join([word for word in cleanwords])
wc = wordcloud.WordCloud(width=1000, height=1000, random_state=24, 
              max_font_size=125, background_color='white').generate(cleanwords2)

In [23]:
#Using vaderSentiment

In [24]:
#Dropping the values we just generated with textblob
corpus_vader = corpus.drop(columns=['subjectivity', 'polarity', 'analysis'])

In [26]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [27]:
corpus_vader['scores'] = corpus_vader['text'].apply(analyzer.polarity_scores)

In [28]:
#Determining sentiment judgment from analyzer's scores

def get_analysis(score):
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

In [29]:
#Compound score indicates overall polarity of sentiment as judged by analyzer
#We must apply the function above to this compound score

compounds = []
for i in range(4000):
    compounds.append(corpus_vader['scores'].iloc[i]['compound'])
corpus_vader['compounds'] = compounds

corpus_vader['analysis'] = corpus_vader['compounds'].apply(get_analysis)

In [30]:
#Positive, negative, neutral messages
vpos = corpus_vader[corpus_vader['analysis'] == 'positive']
vneg = corpus_vader[corpus_vader['analysis'] == 'negative']
vneu = corpus_vader[corpus_vader['analysis'] == 'neutral']

In [31]:
#Percentages for each category
vp = round((vpos.shape[0]/4000)*100, 1)
vn = round((vneg.shape[0]/4000)*100, 1)
vne = round((vneu.shape[0]/4000)*100, 1)

vpercentages = f"positive: {vp}%, negative: {vn}%, neutral: {vne}%"
print(vpercentages)

positive: 29.1%, negative: 13.2%, neutral: 57.7%


In [32]:
vpos.describe()

Unnamed: 0,length,charlength,compounds
count,1166.0,1166.0,1166.0
mean,3.92024,17.602916,0.434348
std,2.728892,12.666984,0.154157
min,1.0,2.0,0.0516
25%,2.0,7.0,0.3612
50%,3.0,15.0,0.4215
75%,5.0,25.0,0.5106
max,14.0,58.0,0.9571


In [33]:
vneg.describe()

Unnamed: 0,length,charlength,compounds
count,526.0,526.0,526.0
mean,4.328897,19.587452,-0.371499
std,2.536627,12.38259,0.162504
min,1.0,2.0,-0.8555
25%,2.0,10.0,-0.4767
50%,4.0,17.0,-0.3612
75%,6.0,27.0,-0.296
max,13.0,59.0,-0.0516


In [34]:
vneu.describe()

Unnamed: 0,length,charlength,compounds
count,2308.0,2308.0,2308.0
mean,3.445841,15.326256,8.8e-05
std,2.37238,11.14816,0.002553
min,1.0,1.0,-0.0296
25%,2.0,6.0,0.0
50%,3.0,13.0,0.0
75%,5.0,22.0,0.0
max,16.0,64.0,0.0387


In [35]:
#vS can better interpret online speech nuances
#Numerical/statistical features of each group remain largely the same between analyzers

In [36]:
#Finding instances where the two analyzers differ in their assessment

corpus2 = corpus.copy()
corpus_vader2 = corpus_vader.copy()

corpus2['analyzer'] = 'textblob'
corpus_vader2['analyzer'] = 'vaderSentiment'

corpus2.drop(columns=['polarity'], inplace=True)
corpus_vader2.drop(columns=['scores'], inplace=True)

#Indicating which analyzer returned the sentiment judgment in the column
corpus2.rename(columns={'analysis': 'textblob'}, inplace=True)
corpus_vader2.rename(columns={'analysis': 'vaderSentiment'}, inplace=True)

In [37]:
#Finding indices where judgments differ between the 2 analyzers
#Iterating over textblob, VADER judgments simultaneously, noting where they aren't the same

indices = []
for i in range(4000):
    if corpus2.iloc[i]['textblob'] != corpus_vader2.iloc[i]['vaderSentiment']:
        indices.append(i)

In [38]:
#Dropping the rows which do NOT differ
corpus2.drop(axis=0, index=[r for r in np.arange(0,4000) if r not in indices], inplace=True)
corpus_vader2.drop(axis=0, index=[r for r in np.arange(0,4000) if r not in indices], inplace=True)

In [39]:
#Adding to the textblob df an indication of which sentiment VADER chose for each message
corpus2['vaderSentiment'] = corpus_vader2['vaderSentiment']

In [40]:
#Looking at only the messages themselves
analyses = corpus2.drop(columns=['subjectivity', 'length', 'charlength', 'analyzer'])

In [41]:
analyses.head(20)

Unnamed: 0,text,tokens,textblob,vaderSentiment
1,youre right,"[youre, right]",positive,neutral
8,nothing really i was gonna afk a bit,"[nothing, really, i, was, gonna, afk, a, bit]",positive,neutral
15,yeah we eat you,"[yeah, we, eat, you]",neutral,positive
17,lolol,[lolol],positive,neutral
18,also playing is my fav kesha song,"[also, playing, is, my, fav, kesha, song]",neutral,positive
20,wild,[wild],positive,neutral
22,depressed,[depressed],neutral,negative
25,yes!,[yes!],neutral,positive
26,that was my problem,"[that, was, my, problem]",neutral,negative
27,it took me checking like 30 hqs!,"[it, took, me, checking, like, 30, hqs!]",neutral,positive


In [42]:
#Looking at most frequent tokens

In [43]:
#you and u are two of the most frequently occurring tokens
#I'm going to look at them from two different perspectives

In [44]:
#Treating you and u as different tokens
corpus_df = pd.DataFrame(cleanwords[0].split(), columns=['token'])
counts = corpus_df.value_counts().rename_axis('token').reset_index(name='counts')

In [45]:
#Treating you and u as the same token
corpus_df2 = corpus_df.copy()
corpus_df2.replace({'u':'you'}, inplace=True)
counts2 = corpus_df2.value_counts().rename_axis('token').reset_index(name='counts')

In [46]:
#Generated many plots and visuals which are located in the visualization folder 