In [1]:
# Standard data manipulation/visualization libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

# Custom text cleaning tool
import text_cleaner as cln

# Word clound library tools
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image

# Sentiment analysis libraries
from textblob import TextBlob
import flair

# Interactive dashboard library
import streamlit

import csv_to_sqlite 

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\uddin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\uddin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
input_file = "../data/raw_data/comments.csv"
raw_comments = pd.read_csv(input_file, lineterminator='\n')
raw_comments_replies = pd.read_csv('../data/raw_data/comment_replies.csv', lineterminator='\n')

In [14]:
data = raw_comments.drop(['author'], axis=1)
reply_data = raw_comments_replies.drop(['author'], axis=1)

# Preprocessing

In [15]:
cleaner = cln.TextCleaner()
cleaned_comments = cleaner.clean(data['text'].astype(str))
cleaned_replies = cleaner.clean(reply_data['replyText'].astype(str))

In [16]:
cleaned_data = data.copy()
cleaned_data['cleaned_comments'] = cleaned_comments

cleaned_reply_data = reply_data.copy()
cleaned_reply_data['cleaned_replies'] = cleaned_replies

In [17]:
cleaned_data.to_json('../data/cleaned_comments.json')
cleaned_reply_data.to_json('../data/cleaned_replies.json')

# Sentiment Analysis

In [None]:
sentiment_analysis = cleaned_data.copy()

## TextBlob pre-trained model

In [None]:
textblob_results = {"positive":0,"neutral":0,"negative":0}
sentiment_polarity = []
sentiment_subj = []
for x in cleaned_data['cleaned_comments']: 
    res = TextBlob(x).sentiment
    sentiment_polarity.append(res[0])
    sentiment_subj.append(res[1])
    if res[0] == 0.0: 
        textblob_results["neutral"] +=1
    elif res[0] > 0.0:
        textblob_results["positive"] +=1
    else:
        textblob_results["negative"] +=1
print(textblob_results)

In [None]:
labels = textblob_results.keys()
percentages = textblob_results.values()
explode = (0.1, 0, 0)

fig, ax = plt.subplots()
ax.pie(percentages, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax.axis('equal')

plt.show()

In [None]:
sentiment_analysis['textblob_polarity'] = sentiment_polarity
sentiment_analysis['textblob_subjectivity'] = sentiment_subj

## Flair pre-built model

In [None]:
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

In [None]:
flair_results = {"positive":0, "negative":0}
sentiment_val = []
sentiment_score = []
for x in cleaned_data['cleaned_comments']: 
    sentiment = flair.data.Sentence(x)
    flair_sentiment.predict(sentiment)
    res = sentiment.labels
    res = str(res[0]).replace('(', '').replace(')', '').split(' ')
    sentiment_val.append(res[0])
    sentiment_score.append(res[1])
    if res[0] == "POSITIVE":
        flair_results["positive"] +=1
    else:
        flair_results["negative"] +=1
print(flair_results)

In [None]:
labels = flair_results.keys()
percentages = flair_results.values()
explode = (0.1, 0)

fig, ax = plt.subplots()
ax.pie(percentages, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax.axis('equal')

plt.show()

In [None]:
sentiment_analysis['flair_sentiment'] = sentiment_val
sentiment_analysis['flair_score'] = sentiment_score

In [None]:
sentiment_analysis.to_json('../data/sentiment_analysis.json')

In [None]:
model_results = pd.DataFrame()
model_results = model_results.append(pd.DataFrame(data=[textblob_results.values()], columns=['positive', 'neutral', 'negative'], index=['textblob']))
model_results = model_results.append(pd.DataFrame(data=[flair_results.values()], columns=['positive', 'negative'], index=['flair']))
model_results = model_results.T
model_results.to_csv('../data/model_results.csv')

# Creating SQLite Database File

In [3]:
df = pd.read_json("../data/sentiment_analysis.json")

In [4]:
df.to_csv('../data/sentiment_analysis.csv', index=False)
options = csv_to_sqlite.CsvOptions(typing_style="full", encoding="utf-8") 
csv_to_sqlite.write_csv(['../data/sentiment_analysis.csv'], "sentiment_analysis_db.sqlite", options)

Written 171781 rows into 1 tables in 2.936 seconds


# Word Blob

In [None]:
df = pd.read_json("../data/sentiment_analysis.json")

In [None]:
mask = np.array(Image.open('../images/reeves.png'))

In [None]:
words = ''
for x in df['cleaned_comments']:
    words += "".join(str(x).strip("[]").replace("'", "").replace(",", ""))

In [None]:
wordcloud = WordCloud(background_color="white", max_words=10000, random_state=42, mask=mask).generate(words)

In [None]:
image_colors = ImageColorGenerator(mask)
plt.figure(figsize=[15,15])
plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation='bilinear')
plt.axis("off")
plt.show()

In [5]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sqlite3

In [50]:
vid_table = pd.read_csv('../data/raw_data/videos.csv')

In [67]:
vid.head()

Unnamed: 0,channelId,videoId,categoryId,title,viewCount,likeCount,dislikeCount,commentCount,publishedAt,description
0,UC4zyoIAzmdsgpDZQfO1-lSA,aWbQ5WcB0m4,20,Announcement Trailer - Cyber Up Your PC! Cyber...,551793,14324,740,1492,2020-04-07T14:45:17Z,Design the Cyberpunk 2077 PC Case of Your Drea...
1,UC4zyoIAzmdsgpDZQfO1-lSA,mrZC1Jcv0dw,20,Grimes – 4ÆM,2447237,104973,1958,7342,2019-12-19T15:06:33Z,"oo-\naphrodite, i wrote your constellation\nin..."
2,UC4zyoIAzmdsgpDZQfO1-lSA,Q4ZdkEiYvK0,20,Cyberpunk 2077 – Grimes performing 4ÆM live at...,3618884,34040,962,2758,2019-12-13T04:51:20Z,Watch Grimes – who will be also voicing Lizzy ...
3,UC4zyoIAzmdsgpDZQfO1-lSA,aZ_ARLDWK9Y,20,Cyberpunk 2077 – Behind The Music,413220,23218,340,2702,2019-12-13T02:57:19Z,Meet some of the artists behind Cyberpunk 2077...
4,UC4zyoIAzmdsgpDZQfO1-lSA,cgFvZmfjTYc,20,Cyberpunk 2077 — Official E3 2019 Cinematic Tr...,1015116,67731,465,3945,2019-09-11T14:01:45Z,CD PROJEKT RED and Goodbye Kansas present: the...


In [149]:
con = sqlite3.connect("sentiment_analysis_db.sqlite")
index = pd.read_sql_query("SELECT videoId "
                          "FROM sentiment_analysis GROUP BY videoId", con)
table1 = pd.read_sql_query("SELECT COUNT(textblob_polarity) as TextBlob_Positive "
                          "FROM sentiment_analysis WHERE textblob_polarity > 0 GROUP BY videoId", con)
table2 = pd.read_sql_query("SELECT COUNT(textblob_subjectivity) as TextBlob_Neutral "
                          "FROM sentiment_analysis WHERE textblob_subjectivity == 0 GROUP BY videoId", con)
table3 = pd.read_sql_query("SELECT COUNT(textblob_polarity) as TextBlob_Negative "
                          "FROM sentiment_analysis WHERE textblob_polarity < 0 GROUP BY videoId", con)

table4 = pd.read_sql_query("SELECT COUNT(flair_score) as Flair_Positive "
                          "FROM sentiment_analysis WHERE flair_sentiment == \"POSITIVE\" GROUP BY videoId", con)
table5 = pd.read_sql_query("SELECT COUNT(flair_score) as Flair_Negative "
                          "FROM sentiment_analysis WHERE flair_sentiment == \"NEGATIVE\" GROUP BY videoId", con)
con.close()

In [150]:
frames = [index, table1, table2, table3, table4, table5]
frames = pd.concat(frames, axis=1)

In [152]:
frames

Unnamed: 0,videoId,TextBlob_Positive,TextBlob_Neutral,TextBlob_Negative,Flair_Positive,Flair_Negative
0,8X2kIfS6fb8,6743,11967,4932,10806,13757
1,AN1RJF55NXI,589,632,274,744,790
2,FknHjl7eQ6o,3298,4047,2813,3871,6686
3,Igq3d6XA75Y,2175,2565,1100,2819,3244
4,P99qJGrPNLs,8611,11228,7022,10182,17633
5,Q4ZdkEiYvK0,588,568,332,707,822
6,SVAryZ0GLwE,526,474,302,551,801
7,aWbQ5WcB0m4,323,416,227,351,636
8,aZ_ARLDWK9Y,532,779,377,709,1027
9,cGmWwFpNIHg,238,385,182,321,515


In [158]:
table = frames.merge(vid_table, left_on='videoId', right_on='videoId').drop(['categoryId', 'channelId', 'description'],
                                                                           axis=1).sort_values(['publishedAt']).set_index('videoId')
table.head(2)

Unnamed: 0_level_0,TextBlob_Positive,TextBlob_Neutral,TextBlob_Negative,Flair_Positive,Flair_Negative,title,viewCount,likeCount,dislikeCount,commentCount,publishedAt
videoId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
cGmWwFpNIHg,238,385,182,321,515,Cyberpunk 2077 title reveal,1128543,18364,227,1622,2012-10-19T08:13:43Z
P99qJGrPNLs,8611,11228,7022,10182,17633,Cyberpunk 2077 Teaser Trailer,16793930,274059,4852,39153,2013-01-10T20:58:07Z


In [139]:
for x,y in zip(range())

In [159]:
df = pd.read_csv('../data/model_results.csv')

In [162]:
df['Sentiment'].drop(1)

0    Positive
2    Negative
Name: Sentiment, dtype: object

track comment count engagement
positive/negative by video
like count by video
