### Loading all needed packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline  
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer, PorterStemmer
from wordcloud import WordCloud, STOPWORDS
from textblob import TextBlob

### Loading the dataset with YouTube comments

In [2]:
# Read the Excel workbook of YouTube comments into a dataframe
yt_comments = pd.read_excel(io="YT_comments_clean_only_tweets.xlsx")

### Exploring the dataset

In [3]:
# Dimensions of the dataframe as (rows, columns): 9280 rows and 1 column
yt_comments.shape

(9280, 1)

In [4]:
# Overview of the dataset: display the first rows
yt_comments.head()

Unnamed: 0,Comments
0,"I cant wait for the release in Italy,"
1,"TÃ¼rkiye ye gelsin artÄ±k,"
2,"I have Disney+ Iâ€™m Saudi,"
3,"Us : Whatâ€™s next ?\nDisney : Nostalgia, that..."
4,"I would only buy for marvel,"


In [5]:
# Inspect the dtype of each column
yt_comments.dtypes

Comments    object
dtype: object

In [6]:
# Make sure every comment is a string.
# Missing cells are replaced with an empty string first: a bare astype(str)
# would turn NaN into the literal text 'nan', which the later steps would
# then process as if it were a real comment.
yt_comments['Comments'] = yt_comments['Comments'].fillna('').astype(str)

### Step 1 - Lowercase all comments for the sentiment analysis

In [7]:
# Lowercase each whitespace-separated token and re-join the tokens with single spaces
# (split() with no argument also collapses runs of whitespace)
yt_comments['Comments'] = yt_comments['Comments'].apply(
    lambda comment: " ".join(token.lower() for token in comment.split())
)

In [8]:
# Lowercased comment stored under index label 0
yt_comments.loc[0, 'Comments']

'i cant wait for the release in italy,'

In [9]:
# Work on an independent copy so that the following steps do not modify yt_comments.
# A plain assignment (df = yt_comments) only binds a second name to the same
# dataframe, so every later change to df would also show up in yt_comments.
df = yt_comments.copy()

### Step 2 - Removing punctuation for the sentiment analysis

In [10]:
# Remove every character that is neither a word character nor whitespace.
# regex=True is passed explicitly: since pandas 2.0, str.replace treats the
# pattern as a literal string by default, in which case nothing would be removed.
# The raw string keeps \w and \s from being parsed as (invalid) string escapes.
df['Comments'] = df['Comments'].str.replace(r'[^\w\s]', '', regex=True)

In [11]:
# Comment under index label 0 after punctuation removal
df.loc[0, 'Comments']

'i cant wait for the release in italy'

### Step 3 - Removing stopwords for the sentiment analysis

In [12]:
# NLTK's English stopword list. `stop` is kept as the original list; the set
# below exists only so that each token is checked in constant time instead of
# scanning the whole list.
stop = stopwords.words('english')
stop_lookup = set(stop)
# NOTE(review): punctuation was stripped in the previous step, so any stopword
# entry that contains an apostrophe can no longer match a token — confirm
# that this is intended.
df['Comments'] = df['Comments'].apply(
    lambda comment: " ".join(token for token in comment.split() if token not in stop_lookup)
)

In [13]:
# Comment under index label 0 after stopword removal
df.loc[0, 'Comments']

'cant wait release italy'

### Step 4 - Stemming (reducing each word to its root form) for the sentiment analysis

In [14]:
# Stem every whitespace-separated token with NLTK's Porter stemmer and
# re-join the stems with single spaces
st = PorterStemmer()
df['Comments'] = df['Comments'].apply(
    lambda comment: " ".join(map(st.stem, comment.split()))
)

In [15]:
# Comment under index label 0 after stemming
df.loc[0, 'Comments']

'cant wait releas itali'

In [16]:
# Work on an independent copy so that adding the sentiment column does not modify df.
# A plain assignment (df2 = df) only binds a second name to the same dataframe.
df2 = df.copy()

### Step 5 - Running the sentiment classification

In [17]:
def senti(x):
    """Build a TextBlob from the text ``x`` and return its ``sentiment`` attribute.

    Later cells of this notebook unpack the returned value into a Polarity
    and a Subjectivity score.
    """
    blob = TextBlob(x)
    return blob.sentiment


# Score every preprocessed comment and store the result in a new column
df2['senti_score'] = df2['Comments'].apply(senti)

In [18]:
# Overview of the dataset with the sentiment score; each entry is a (Polarity, Subjectivity) pair
df2.head()

Unnamed: 0,Comments,senti_score
0,cant wait releas itali,"(0.0, 0.0)"
1,tã¼rkiy ye gelsin artäk,"(0.0, 0.0)"
2,disney iâm saudi,"(0.0, 0.0)"
3,us whatâ next ndisney nostalgia thatâ specialti,"(0.0, 0.0)"
4,would buy marvel,"(0.0, 0.0)"


In [19]:
# Put the senti_score column, i.e. the (Polarity, Subjectivity) pairs, into its
# own single-column dataframe and show the last rows
sentiment = df2['senti_score'].to_frame()
sentiment.tail()

Unnamed: 0,senti_score
9275,"(0.5833333333333334, 0.23333333333333334)"
9276,"(0.0, 0.25)"
9277,"(0.0, 0.0)"
9278,"(0.23714285714285716, 0.5404761904761906)"
9279,"(0.0, 0.0)"


In [20]:
# Unpack each (Polarity, Subjectivity) pair into two separate numeric columns
df_sentiment = pd.DataFrame(
    sentiment['senti_score'].tolist(),
    columns=['Polarity', 'Subjectivity'],
)
df_sentiment.tail()

Unnamed: 0,Polarity,Subjectivity
9275,0.583333,0.233333
9276,0.0,0.25
9277,0.0,0.0
9278,0.237143,0.540476
9279,0.0,0.0


In [21]:
# Counting the number of comments for each category of subjectivity
# Most of the comments have a score of zero, meaning they are not subjective (i.e. objective)
#
# The bands are written as explicit comparisons instead of
# Series.between(..., inclusive=True/False): boolean values for `inclusive`
# were deprecated in pandas 1.3 and are rejected by pandas 2.x.
# The bounds are the same as before, so each score falls into exactly one band.
subj = df_sentiment["Subjectivity"]
subjectivity = {
    'Not subjective': [(subj == 0).sum()],
    'Slightly subjective': [((subj > 0) & (subj < 0.3333)).sum()],      # open interval (0, 0.3333)
    'Quite subjective': [((subj >= 0.3333) & (subj <= 0.6666)).sum()],  # closed interval [0.3333, 0.6666]
    'Subjective': [((subj > 0.6666) & (subj < 1)).sum()],               # open interval (0.6666, 1)
    'Highly subjective': [(subj == 1).sum()],
}

# Single-row summary table, one column per band
df_subjectivity = pd.DataFrame(subjectivity, columns = ['Not subjective', 'Slightly subjective', 'Quite subjective', 'Subjective', 'Highly subjective'])

df_subjectivity

Unnamed: 0,Not subjective,Slightly subjective,Quite subjective,Subjective,Highly subjective
0,5772,722,1626,698,462


In [22]:
# Number of comments per polarity class (sign of the Polarity score).
# Most comments get a polarity of exactly 0, i.e. the algorithm could not
# classify them as a positive or a negative opinion.
polarity = {
    'Positive': [(df_sentiment["Polarity"] > 0).sum()],
    'Negative': [(df_sentiment["Polarity"] < 0).sum()],
    'Neutral': [(df_sentiment["Polarity"] == 0).sum()],
}

# Single-row summary table, one column per class
df_polarity = pd.DataFrame(polarity, columns=['Positive', 'Negative', 'Neutral'])

df_polarity

Unnamed: 0,Positive,Negative,Neutral
0,2532,757,5991
