In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import spacy
import unidecode
import numpy as np
import pandas as pd
import nltk
import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Path to the JSON export holding the comments to analyse
# (the file stem looks like a video id — TODO confirm).
file = './app/data/6tOKJU9WevI.json'

# One row per comment; later cells read the `publishedAt` and `textOriginal` columns.
df = pd.read_json(file)

In [None]:
# Drop the metadata columns that none of the cells below read.
# `errors='ignore'` keeps this cell re-runnable: a second execution finds the
# columns already gone instead of raising KeyError the way `del df[col]` does.
UNUSED_COLUMNS = [
    'authorDisplayName',
    'authorProfileImageUrl',
    'authorChannelUrl',
    'canRate',
    'viewerRating',
    'authorChannelId',
    'updatedAt',
    'textDisplay',
    'likeCount',
]
df = df.drop(columns=UNUSED_COLUMNS, errors='ignore')

In [None]:
# Preview the columns that remain after the metadata columns were removed.
df.head()

In [None]:
def parse_date(text):
    """Return the calendar date of a timestamp string.

    Everything from the first 'T' onwards is discarded and the remaining
    prefix is parsed as ``YYYY-MM-DD``.

    Args:
        text: Timestamp string whose date part precedes the first 'T'.

    Returns:
        A ``datetime.date``.

    Raises:
        ValueError: If the date part does not match ``%Y-%m-%d``.
    """
    date_part, _, _ = text.partition('T')
    parsed = datetime.datetime.strptime(date_part, "%Y-%m-%d")
    return parsed.date()

In [None]:
# Derive a `date` column (datetime.date objects) from the `publishedAt` strings,
# then drop the raw timestamp column.
df["date"] = df["publishedAt"].apply(parse_date)
del df['publishedAt']

In [None]:
def replace_newline(text):
    """Return ``text`` with every newline character replaced by ``'. '``.

    Args:
        text: The string to rewrite.

    Returns:
        The rewritten string; each ``'\\n'`` becomes a period followed by a space.
    """
    lines = text.split('\n')
    return '. '.join(lines)

In [None]:
# Rewrite line breaks inside each comment as '. ' (overwrites `textOriginal` in place).
df["textOriginal"] = df["textOriginal"].apply(replace_newline)

In [None]:
# Split every comment into a list of sentences with NLTK's `sent_tokenize`.
df['sentences'] = df['textOriginal'].apply(sent_tokenize)

In [None]:
def clean_numbers(text):
    """Remove every word-character run that contains at least one digit.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with tokens such as ``"123"`` or ``"a1b"`` deleted; the
        surrounding whitespace is left untouched.
    """
    # Raw string: '\w' and '\d' are invalid escapes in a normal literal and
    # trigger a warning on current Python versions.
    return re.sub(r'\w*\d\w*', '', text)

In [None]:
sid_obj = SentimentIntensityAnalyzer()


def score_sentiment(sentences):
    """Classify one comment from the VADER scores of its sentences.

    Each sentence is scored with ``sid_obj.polarity_scores``. The comment label
    comes from the mean ``compound`` score (>= 0.05 positive, <= -0.05
    negative, otherwise neutral). The intensity is the mean of the ``pos``
    score of positive sentences and the ``neg`` score of negative sentences;
    sentences with a compound score between the two thresholds contribute
    nothing to it.

    Args:
        sentences: Iterable of sentence strings belonging to one comment.

    Returns:
        A ``(sentiment, intensity)`` tuple where ``sentiment`` is
        ``"Positive"``, ``"Negative"`` or ``"Neutral"`` and ``intensity`` is
        ``0.0`` when no sentence was positive or negative.
    """
    compounds = []
    intensities = []
    for sentence in sentences:
        scores = sid_obj.polarity_scores(sentence)
        compound = scores['compound']
        compounds.append(compound)
        if compound >= 0.05:
            intensities.append(scores["pos"])
        elif compound <= -0.05:
            intensities.append(scores["neg"])

    # Guard the empty cases explicitly: averaging an empty list emits a
    # "Mean of empty slice" RuntimeWarning and produces NaN.
    comment_compound = np.average(compounds) if compounds else 0.0
    intensity_average = np.average(intensities) if intensities else 0.0

    if comment_compound >= 0.05:
        sentiment = "Positive"
    elif comment_compound <= -0.05:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"

    return sentiment, intensity_average

In [None]:
# score_sentiment returns one (label, intensity) pair per comment; zip(*...)
# transposes those pairs into one tuple of labels and one tuple of intensities.
df['sentiment'], df['intensity'] = zip(*df['sentences'].apply(score_sentiment))

In [None]:
# Bar chart: number of comments per sentiment label whose intensity exceeds 0.75.
df[df["intensity"] > 0.75].groupby('sentiment').intensity.count().plot(kind="bar", xlabel="Sentiment", ylabel="Number of high intensity comments")

In [None]:
# Pie chart of the share of comments per sentiment label.
labels = ["Positive", "Negative", "Neutral"]
# Named `total_count` rather than `sum`: binding `sum` at notebook level hides
# the builtin for every cell executed afterwards.
total_count = df['sentiment'].count()
neg_count = (df['sentiment'] == 'Negative').sum()
neu_count = (df['sentiment'] == 'Neutral').sum()
pos_count = (df['sentiment'] == 'Positive').sum()
# Same order as `labels`.
sizes = [pos_count/total_count, neg_count/total_count, neu_count/total_count]

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')

plt.show()

In [None]:
def clean_accented_chars(text):
    """Return ``text`` passed through ``unidecode.unidecode``.

    Args:
        text: The string to transliterate.

    Returns:
        The string produced by ``unidecode.unidecode(text)``.
    """
    return unidecode.unidecode(text)

In [None]:
def clean_punctuation(text):
    """Strip digits and punctuation from ``text``.

    Args:
        text: Value to clean; it is converted with ``str()`` first, so
            non-string values (numbers, NaN, ...) are accepted.

    Returns:
        The text with all digit runs removed and every character that is
        neither a word character nor whitespace removed. Underscores survive
        because ``\\w`` matches them.
    """
    # Coerce before the first substitution: re.sub raises TypeError on
    # non-string input, so converting afterwards came too late.
    text = str(text)
    text = re.sub(r'\d+', '', text)
    return re.sub(r'[^\w\s]', '', text)

In [None]:
# Load the `en_core_web_lg` spaCy pipeline; `nlp` is reused by the cleaning helpers below.
nlp = spacy.load('en_core_web_lg')
# STOPWORDS is bound to the pipeline's default stop-word set itself (no copy),
# so the add() below also changes `nlp.Defaults.stop_words`.
STOPWORDS = nlp.Defaults.stop_words
STOPWORDS.add('like')
def remove_stopwords(text):
    """Drop stop words and one-character tokens from a space-separated string.

    Args:
        text: String whose tokens are separated by single spaces.

    Returns:
        The surviving tokens joined by single spaces. A token survives when it
        is longer than one character and is not in ``STOPWORDS``.
    """
    kept = []
    for word in text.split(' '):
        if len(word) > 1 and word not in STOPWORDS:
            kept.append(word)
    return ' '.join(kept)

In [None]:
def lemmatization(text):
    """Replace every token of ``text`` with its spaCy lemma.

    Args:
        text: The string to process with the ``nlp`` pipeline.

    Returns:
        The lemmas of all tokens, joined by single spaces.
    """
    lemmas = [token.lemma_ for token in nlp(text)]
    return ' '.join(lemmas)

In [None]:
def remove_adjectives_adverbs_verbs(text):
    """Keep only tokens that are not verbs, adverbs or adjectives.

    Tokens whose text is exactly ``"nt"`` or ``"ve"`` are dropped as well
    (presumably contraction leftovers once apostrophes are stripped — confirm).

    Args:
        text: The string to tag with the ``nlp`` pipeline.

    Returns:
        The text of the surviving tokens, joined by single spaces.
    """
    dropped_pos = ('VERB', 'ADV', 'ADJ')
    dropped_texts = ('nt', 've')
    kept = []
    for token in nlp(text):
        if token.pos_ in dropped_pos or token.text in dropped_texts:
            continue
        kept.append(token.text)
    return ' '.join(kept)

In [None]:
# Build the `keywords` column: lowercase -> strip digits/punctuation -> drop
# verbs/adverbs/adjectives -> drop stop words -> lemmatize -> tokenize.
df['cleanSentence'] = df['textOriginal'].str.lower()
df['cleanSentence'] = df['cleanSentence'].apply(clean_punctuation)
df['cleanSentence'] = df['cleanSentence'].apply(remove_adjectives_adverbs_verbs)
df['cleanSentence'] = df['cleanSentence'].apply(remove_stopwords)
df['cleanSentence'] = df['cleanSentence'].apply(lemmatization)
# Assign the result back instead of `df[col].replace(..., inplace=True)`:
# the in-place call acts on a temporary Series, which pandas warns about and
# which leaves `df` unchanged under Copy-on-Write.
df['cleanSentence'] = df['cleanSentence'].replace('', float("NaN"))
# Comments with no surviving keyword are removed from `df` entirely.
df.dropna(subset=['cleanSentence'], inplace=True)
df['keywords'] = df['cleanSentence'].apply(word_tokenize)
del df['cleanSentence']

In [None]:
# Preview the frame with the new `sentiment`, `intensity` and `keywords` columns.
df.head()

In [None]:
def create_token_array(values):
    """Flatten an iterable of token sequences into one list of tokens.

    Args:
        values: Iterable whose items are themselves iterables of tokens.

    Returns:
        A list with all tokens in their original order.
    """
    return [token for sentence_tokens in values for token in sentence_tokens]

In [None]:
def create_frequency_map(tokens, top_n=50):
    """Return the most frequent items of ``tokens`` with their counts.

    Args:
        tokens: Iterable of hashable items (words, bigram tuples, ...). It is
            consumed once, so a generator is exhausted afterwards.
        top_n: How many of the most common items to return. Defaults to 50,
            the value that was previously hard-coded.

    Returns:
        A list of ``(item, count)`` pairs as produced by
        ``nltk.FreqDist.most_common``.
    """
    return nltk.FreqDist(tokens).most_common(top_n)

In [None]:
# Keyword tokens of all comments, flattened per sentiment label.
pos_tokens = create_token_array(df[df['sentiment'] == 'Positive']['keywords'].values)
neg_tokens = create_token_array(df[df['sentiment'] == 'Negative']['keywords'].values)
neu_tokens = create_token_array(df[df['sentiment'] == 'Neutral']['keywords'].values)
# nltk.bigrams returns a one-shot generator. Materialise it so the *_bigrams
# names are still usable after the frequency maps below have consumed them.
# NOTE(review): the bigrams are built over the flattened token list, so the
# last keyword of one comment is paired with the first keyword of the next.
pos_bigrams = list(nltk.bigrams(pos_tokens))
neg_bigrams = list(nltk.bigrams(neg_tokens))
neu_bigrams = list(nltk.bigrams(neu_tokens))
# Most common single keywords and bigrams per sentiment label.
pos_freq = create_frequency_map(pos_tokens)
neg_freq = create_frequency_map(neg_tokens)
neu_freq = create_frequency_map(neu_tokens)
pos_bigram_freq = create_frequency_map(pos_bigrams)
neg_bigram_freq = create_frequency_map(neg_bigrams)
neu_bigram_freq = create_frequency_map(neu_bigrams)

In [None]:
def context_sentiment(pos_freq, neg_freq, neu_freq):
    """Merge three frequency lists into one per-word count table.

    Args:
        pos_freq: Iterable of ``(word, count)`` pairs from positive comments.
        neg_freq: Iterable of ``(word, count)`` pairs from negative comments.
        neu_freq: Iterable of ``(word, count)`` pairs from neutral comments.

    Returns:
        A dict mapping each word to ``[positive, neutral, negative]`` counts;
        a word missing from one of the lists gets 0 in that slot. Note that
        the slot order (neutral second) differs from the parameter order.
    """
    result = dict()
    # Slot index in the value list: 0 = positive, 1 = neutral, 2 = negative.
    for slot, freq in enumerate((pos_freq, neu_freq, neg_freq)):
        for word, count in freq:
            # setdefault replaces the earlier bare `except:`, which would also
            # have swallowed unrelated errors such as KeyboardInterrupt.
            result.setdefault(word, [0, 0, 0])[slot] = count
    return result

In [None]:
def convert_dict_to_dataframe(dict):
    """Turn a ``word -> [positive, neutral, negative]`` mapping into a frame.

    Args:
        dict: Mapping from a word (or bigram tuple) to a three-item list of
            counts. The parameter name shadows the builtin inside this
            function; it is kept so keyword callers keep working.

    Returns:
        A DataFrame with the columns ``word``, ``frequencies``, ``positive``,
        ``neutral`` and ``negative``, one row per key. An empty mapping yields
        an empty frame with those five columns.
    """
    count_columns = ['positive', 'neutral', 'negative']
    frame = pd.DataFrame(list(dict.items()), columns=['word', 'frequencies'])
    # Naming the columns here makes the assignment below work for an empty
    # mapping too; without names the split frame has zero columns and pandas
    # raises "Columns must be same length as key".
    split = pd.DataFrame(frame['frequencies'].tolist(), index=frame.index, columns=count_columns)
    frame[count_columns] = split
    return frame

In [None]:
# Per-bigram [positive, neutral, negative] counts; arguments follow the
# (pos, neg, neu) parameter order of context_sentiment.
bicontext_sentiment_dict = context_sentiment(pos_bigram_freq, neg_bigram_freq, neu_bigram_freq)
df_bigram = convert_dict_to_dataframe(bicontext_sentiment_dict)

In [None]:
# Per-word [positive, neutral, negative] counts; arguments follow the
# (pos, neg, neu) parameter order of context_sentiment.
context_sentiment_dict = context_sentiment(pos_freq, neg_freq, neu_freq)
df_words = convert_dict_to_dataframe(context_sentiment_dict)

In [None]:
def normalize(frequencies):
    """Collapse ``[positive, neutral, negative]`` counts into one score.

    The counts are scaled to unit Euclidean length and then combined with the
    weights +1 (positive), +0.01 (neutral) and -1 (negative). A word seen only
    in neutral comments therefore scores exactly 0.01, which
    ``score_context_sentiment`` uses as its "Neutral" marker.

    Args:
        frequencies: Sequence of three counts in the order positive, neutral,
            negative.

    Returns:
        The weighted score as a float; ``0.0`` when all counts are zero.
    """
    v = np.asarray(frequencies, dtype=float)
    norm = np.linalg.norm(v)
    if norm == 0:
        # An all-zero vector has no direction; dividing would produce NaN.
        return 0.0
    # True division instead of `v * (1 / norm)`: n * (1 / n) is not exactly
    # 1.0 for every n (e.g. n = 49), which made pure-neutral scores miss 0.01.
    v = v / norm
    return v[0] * 1 + v[1] * 0.01 + v[2] * (-1)

In [None]:
# Collapse each [positive, neutral, negative] count list into a single score.
df_bigram['score'] = df_bigram['frequencies'].apply(normalize)
df_words['score'] = df_words['frequencies'].apply(normalize)

In [None]:
def score_context_sentiment(compound):
    """Map a score from ``normalize`` to a descriptive label.

    Args:
        compound: The numeric score to classify.

    Returns:
        ``"Neutral"`` when the score is 0.01 (within floating-point
        tolerance); otherwise ``"Mostly positive"`` (> 0.5),
        ``"Slightly positive"`` (> 0.1), ``"Controversial"`` (> -0.1),
        ``"Slightly negative"`` (> -0.5) or ``"Mostly negative"``. NaN fails
        every comparison and therefore ends up as ``"Mostly negative"``.
    """
    # Tolerance instead of `== 0.01`: the score is the result of float
    # arithmetic and can be off by a rounding error.
    if abs(compound - 0.01) < 1e-9:
        return "Neutral"
    if compound > 0.5:
        return "Mostly positive"
    if compound > 0.1:
        return "Slightly positive"
    if compound > -0.1:
        return "Controversial"
    if compound > -0.5:
        return "Slightly negative"
    return "Mostly negative"

In [None]:
# Translate each numeric score into its descriptive label.
df_bigram['context_sentiment'] = df_bigram['score'].apply(score_context_sentiment)
df_words['context_sentiment'] = df_words['score'].apply(score_context_sentiment)

In [None]:
# Show the first two words of every context-sentiment label.
df_words.groupby("context_sentiment").head(2)

In [None]:
# Show the first two bigrams of every context-sentiment label.
df_bigram.groupby("context_sentiment").head(2)

In [None]:
# Convert the `date` column (datetime.date objects from parse_date) to pandas
# datetimes before the time-series plots below.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

In [None]:
# Line chart: number of Positive and Negative comments posted per day.
fig, ax = plt.subplots(figsize=(8,4))
for sentiment_label, line_color in (('Positive', 'green'), ('Negative', 'red')):
    daily_counts = df.loc[df['sentiment'] == sentiment_label, ['date', 'sentiment']].groupby('date').count()
    daily_counts.plot(ax=ax, color=line_color)
ax.set_xlabel("Date")
ax.set_ylabel("Number of comments posted")
# Legend entries follow the plotting order above.
ax.legend(['Positive', 'Negative'])

In [None]:
# Daily mean intensity of the Positive/Negative comments.
# The earlier `.intensity.apply().mean()` raised TypeError because
# SeriesGroupBy.apply() needs a function; `.mean()` on the group is the
# aggregation the y-axis label describes.
polar_comments = df.loc[df['sentiment'].isin(['Positive', 'Negative'])]
polar_comments.groupby('date').intensity.mean().plot(
    # Seven evenly spaced ticks across the full date range of all comments.
    xticks=pd.date_range(df['date'].min(), df['date'].max(), periods=7),
    xlabel="Date of comments posted",
    ylabel="Average intensity of comments",
    yticks=np.arange(0, 1, 0.1),
)

In [None]:
# Positive/Negative comments posted on days that have more than three such comments.
df.loc[(df['sentiment'] == 'Positive') | (df['sentiment'] == 'Negative')].groupby('date').filter(lambda x: x['intensity'].count() > 3)

In [None]:
# Lowest intensity per day among Positive/Negative comments, restricted to
# days with more than one such comment.
plt.figure(figsize=(10,5))
(
    df.loc[(df['sentiment'] == 'Positive') | (df['sentiment'] == 'Negative')]
    .groupby('date')
    .filter(lambda x: x['intensity'].count() > 1)
    .groupby('date')
    .intensity.min()
    # The aggregation is min(), so the axis label says "Minimum"; it used to
    # read "Average", which did not match the plotted values.
    # NOTE(review): if a daily average was intended, switch min() to mean().
    .plot(xlabel="Date of comments posted", ylabel="Minimum intensity of comments", yticks=np.arange(0, 1, 0.1))
)
plt.show()