In [1]:
from schrutepy import schrutepy
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import streamlit as st
from matplotlib.pyplot import figure
import nltk
from nltk.corpus import stopwords
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from PIL import Image
import numpy as np
import collections
import pandas as pd
pd.set_option('display.max_rows',10000)
pd.set_option('display.max_columns',100)
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode,iplot
import plotly.graph_objects as go
import cufflinks as cf

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/traffic/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
st.title('Sentimental Analysis')

2022-03-09 00:13:43.440 
  command:

    streamlit run /home/traffic/.local/lib/python3.8/site-packages/ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator(_root_container=0, _provided_cursor=None, _parent=None, _block_type=None, _form_data=None)

# Data Prep

In [3]:
#loading the data
df = pd.read_csv('analisis_comments_tiktok.csv')

#### Inspecting Data

In [4]:
print(df.shape)

(1096, 10)


In [5]:
df.head()

Unnamed: 0,date_post,date_extraction,influencer,post_type,post_url,platform,comment,comment_likecount,replies,BU;;;;;;;;;
0,"2021-9-12,2021-12-01,espdaniella,VIDEO,https:/...",,,,,,,,,
1,2021-9-12,2021-12-01,espdaniella,VIDEO,https://www.tiktok.com/@espdaniella/video/7007...,TikTok,it’s not good chegg apparently rats out on stu...,14.0,,CS;;;;;;;;;
2,"2021-9-11,2021-12-01,themccartys,VIDEO,https:/...",,,,,,,,,
3,2021-9-11,2021-12-01,themccartys,VIDEO,https://www.tiktok.com/@themccartys/video/7006...,TikTok,I hope Audri looks different at that age,71.0,,CS;;;;;;;;;
4,2021-9-11,2021-12-01,themccartys,VIDEO,https://www.tiktok.com/@themccartys/video/7006...,TikTok,I absolutely love it!,26.0,,CS;;;;;;;;;


In [6]:
df.describe()

Unnamed: 0,date_post,date_extraction,influencer,post_type,post_url,platform,comment,comment_likecount,replies,BU;;;;;;;;;
count,1096,1008,997,998,993,984,979,975,20,970
unique,144,19,34,11,51,9,896,38,19,7
top,2021-11-12,2021-12-01,sydneyserena,VIDEO,https://www.youtube.com/watch?v=V467rq8OVsc,TikTok,Hi,0,"CS"";;;;;;;;",CS;;;;;;;;;
freq,217,720,212,970,188,722,13,621,2,930


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096 entries, 0 to 1095
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   date_post          1096 non-null   object
 1   date_extraction    1008 non-null   object
 2   influencer         997 non-null    object
 3   post_type          998 non-null    object
 4   post_url           993 non-null    object
 5   platform           984 non-null    object
 6   comment            979 non-null    object
 7   comment_likecount  975 non-null    object
 8   replies            20 non-null     object
 9   BU;;;;;;;;;        970 non-null    object
dtypes: object(10)
memory usage: 85.8+ KB


#### Handling Data

In [8]:
df = df.loc[~df.comment.isna(),:]
print(df.shape[0])

979


VADER’s SentimentIntensityAnalyzer() takes in a string and returns a dictionary of scores in each of four categories:
negative
neutral
positive
compound (computed by normalizing the scores above
Let us analyze some random statements through our sentimental analyzer
a = 'This was a good movie.'

In [9]:
sid = SentimentIntensityAnalyzer()

df['scores'] = df['comment'].apply(lambda comment: sid.polarity_scores(comment))
df['compound']  = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['comp_score'] = df['compound'].apply(lambda c: 'pos' if c >=0 else 'neg')
df.loc[:,['comment','comp_score']]

Unnamed: 0,comment,comp_score
1,it’s not good chegg apparently rats out on stu...,neg
3,I hope Audri looks different at that age,pos
4,I absolutely love it!,pos
5,so this is what Audri will look like as a Fres...,pos
6,Oh my I hope not 😂,pos
7,Math is hard 😳😳😳,neg
8,This was me. I cannot MATH,pos
9,I can history all day long but I cannot math o...,pos
10,This is so neat! @Chegg for the win!,pos
11,That is a 7th grade problem😑 Won't work on my ...,pos


# EDA

In [32]:
init_notebook_mode(connected=True)
#labels
lab = df["comp_score"].value_counts().keys().tolist()
#values
val = df["comp_score"].value_counts().values.tolist()
trace = go.Pie(labels=lab, 
                values=val, 
                marker=dict(colors=['red']), 
                # Seting values to 
                hoverinfo="value"
              )
data = [trace]
layout = go.Layout(title="Sentiment Distribution")
fig = go.Figure(data = data,layout = layout)
iplot(fig)

### Sentiment Distribution per Influencer

In [39]:
for i in df.influencer.unique().tolist():
    dfi = df.loc[df.influencer==i,:].copy()
    lab = dfi["comp_score"].value_counts().keys().tolist()
    #values
    val = dfi["comp_score"].value_counts().values.tolist()
    trace = go.Pie(labels=lab, 
                    values=val, 
                    marker=dict(colors=['red']), 
                    # Seting values to 
                    hoverinfo="value"
                  )
    data = [trace]
    layout = go.Layout(title=f"{i}'s Sentiment Distribution")
    fig = go.Figure(data = data,layout = layout)
    iplot(fig)

### Influencer Rank by Sentiment

In [46]:
df_i = df.groupby(['influencer']).agg({'compound':'mean'}).reset_index()
df_i = df_i.sort_values('compound',ascending=False)
df_i

Unnamed: 0,influencer,compound
9,iamalilstitious,0.52858
18,ninjanerdscience,0.493969
1,as usual,0.4019
16,modern.day.classic,0.330317
26,sydneyserena,0.291964
7,espdaniella,0.229247
25,sarahrav,0.220103
15,medstudebt,0.2004
2,build joy,0.19824
10,itssozer,0.184557


### Focus: Negative Comments

In [49]:
df_n = df.loc[df.compound<0,:]

In [20]:
# defining data
trace = go.Histogram(x=df['platform'],nbinsx=40,histnorm='percent')
data = [trace]
# defining layout
layout = go.Layout(title="platform Distribution")
# defining figure and plotting
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [21]:
import streamlit as st
import plotly.figure_factory as ff


# Plot!
st.plotly_chart(fig, use_container_width=True)

DeltaGenerator(_root_container=0, _provided_cursor=None, _parent=None, _block_type=None, _form_data=None)

In [22]:
# defining data
trace = go.Histogram(x=df['post_type'],nbinsx=40,histnorm='percent')
data = [trace]
# defining layout
layout = go.Layout(title="post_type Distribution")
# defining figure and plotting
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [23]:
# defining data
trace = go.Histogram(x=df['compound'],nbinsx=40,histnorm='percent')
data = [trace]
# defining layout
layout = go.Layout(title="compound Distribution")
# defining figure and plotting
fig = go.Figure(data = data,layout = layout)
iplot(fig)

In [24]:
st.plotly_chart(fig, use_container_width=True)

DeltaGenerator(_root_container=0, _provided_cursor=None, _parent=None, _block_type=None, _form_data=None)

# Topic Modelling