In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import seaborn as sns
import plotly.express as ex
import plotly.graph_objs as go
import plotly.offline as pyo
import random

from plotly.subplots import make_subplots
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from wordcloud import WordCloud, STOPWORDS
from pandas.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from nltk.util import ngrams
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Notebook-wide setup: initialise Plotly's offline notebook mode, download the
# VADER lexicon required by the sentiment analyser, and set the default
# Matplotlib figure size used by every plot below.
pyo.init_notebook_mode()
nltk.download('vader_lexicon')

plt.rcParams['figure.figsize'] = (17, 13)

In [2]:
# Load the raw tweets and normalise the `text` column for sentiment analysis.
# NOTE(review): the CSV path is an absolute, machine-specific location; consider
# reading it from a configurable data directory instead.
f_data = pd.read_csv("C:\\Users\\USER\\Downloads\\archive\\vaccination_tweets.csv")


def _clean_tweet(text):
    """Return ``text`` lower-cased with handles, hashtags, URLs and punctuation removed.

    Words stay separated by single spaces: the sentiment scorer, the
    ``str.split`` based word counts and the word clouds further down all need
    real word boundaries, so nothing here may glue neighbouring words together.
    """
    text = text.lower()
    text = re.sub(r'@[^\s]+', '', text)             # drop @mentions
    text = re.sub(r'\B#\S+', '', text)              # drop #hashtags
    text = re.sub(r'http\S+', '', text)             # drop URLs
    text = ' '.join(re.findall(r'\w+', text))       # keep word characters only
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)     # drop stray single letters
    return re.sub(r'\s+', ' ', text).strip()        # collapse repeated whitespace


f_data.text = f_data.text.apply(_clean_tweet)

In [None]:
# Score every cleaned tweet with VADER and store the positive / neutral /
# negative components as separate columns of `f_data`.
sid = SIA()

# Small constant added to every score so that no stored value is exactly zero.
score_offset = 10 ** -6

# Keep the raw score dictionaries in a local Series rather than a temporary
# DataFrame column: nothing has to be dropped afterwards and the cell can be
# re-run safely.
sentiment_scores = f_data['text'].apply(
    lambda x: sid.polarity_scores(' '.join(re.findall(r'\w+', x.lower())))
)

f_data['Positive Sentiment'] = sentiment_scores.apply(lambda s: s['pos'] + score_offset)
f_data['Neutral Sentiment'] = sentiment_scores.apply(lambda s: s['neu'] + score_offset)
f_data['Negative Sentiment'] = sentiment_scores.apply(lambda s: s['neg'] + score_offset)

# Exploratory Data Analysis

In [None]:
# Compare the three sentiment scores: estimated densities in the top panel,
# cumulative distributions in the bottom panel.
sentiment_columns = ['Negative Sentiment', 'Positive Sentiment', 'Neutral Sentiment']

plt.subplot(2, 1, 1)
plt.title('Distribution of sentiments', fontsize=19, fontweight='bold')
for column in sentiment_columns:
    # `bw_method` replaces the deprecated `bw` keyword of seaborn.kdeplot.
    sns.kdeplot(f_data[column], bw_method=0.1, label=column)
plt.xlabel('Sentiment Value', fontsize=19)
plt.legend()  # without labels the three curves cannot be told apart

plt.subplot(2, 1, 2)
plt.title('Sentiments Across Tweets', fontsize=19, fontweight='bold')
for column in sentiment_columns:
    sns.kdeplot(f_data[column], bw_method=0.1, cumulative=True, label=column)
plt.xlabel('Sentiment Value', fontsize=19)
plt.legend()

plt.show()

# Analyze Sentiments

In [None]:
# Order the tweets chronologically and derive calendar features on a copy, so
# that `f_data` itself keeps its original `date` column.
f_data = f_data.sort_values(by='date')
ft_data = f_data.copy()

# Parse the timestamps once and reuse them for every derived column.
tweet_dates = pd.to_datetime(f_data['date'])

ft_data['date'] = tweet_dates.dt.date
ft_data['year'] = tweet_dates.dt.year
ft_data['month'] = tweet_dates.dt.month
ft_data['day'] = tweet_dates.dt.day
ft_data['day_of_year'] = tweet_dates.dt.dayofyear
# NOTE(review): the column key keeps its original spelling ('quater') in case
# other cells read it; the pandas attribute itself is `quarter`.
ft_data['quater'] = tweet_dates.dt.quarter

def _plot_sentiment_cutoff(ax, scores, cutoff, fill_color, note, note_xy, note_xytext,
                           line_color=None):
    """Draw the KDE of ``scores`` on ``ax`` and highlight the tail above ``cutoff``.

    Parameters
    ----------
    ax : matplotlib Axes to draw on (expected to be empty).
    scores : pandas Series of sentiment scores.
    cutoff : x value above which the area under the density is shaded.
    fill_color : colour of the shaded tail.
    note, note_xy, note_xytext : text, arrow tip and text position of the
        annotation that labels the cut-off.
    line_color : colour of the density curve (seaborn default when ``None``).
    """
    sns.kdeplot(scores, bw_method=0.1, color=line_color, ax=ax)
    pdf_line = ax.lines[0]  # the density curve is the first line on the axes
    kde_x, kde_y = pdf_line.get_data()
    ax.fill_between(kde_x, kde_y, where=(kde_x > cutoff), interpolate=True, color=fill_color)

    # Text styling belongs to annotate itself, not to the arrow properties.
    ax.annotate(note, xy=note_xy, xytext=note_xytext,
                arrowprops=dict(facecolor='red', shrink=0.05),
                fontsize=16, fontweight='bold')

    mean_value = scores.mean()
    median_value = scores.median()
    mean_line = ax.axvline(mean_value, color='r', linestyle='--')
    median_line = ax.axvline(median_value, color='tab:orange', linestyle='-')

    # Pass the handles explicitly so each label is bound to the right artist
    # regardless of the order in which Matplotlib lists the axes' children.
    ax.legend(
        [pdf_line, mean_line, median_line],
        ['PDF', 'Mean: {:.2f}'.format(mean_value), 'Median: {:.2f}'.format(median_value)],
    )


ax0 = plt.subplot(2, 1, 1)
ax0.set_title('A Cut-Off of Most Negative/ Positive Tweets', fontsize=10, fontweight='bold')
_plot_sentiment_cutoff(
    ax0, f_data['Negative Sentiment'], cutoff=0.25, fill_color='b',
    note='Cut-off for Most Negative Tweets', note_xy=(0.25, 0.5), note_xytext=(0.4, 2),
)

ax1 = plt.subplot(2, 1, 2)
_plot_sentiment_cutoff(
    ax1, f_data['Positive Sentiment'], cutoff=0.4, fill_color='green',
    note="Cut-Off for Most Positive Tweets", note_xy=(0.4, 0.43), note_xytext=(0.4, 2),
    line_color='green',
)
ax1.set_xlabel('Sentiment Strength', fontsize=18)

plt.show()

# Visualize Most Negative and Positive Sentiments

In [None]:
# Select the tweets beyond the sentiment cut-offs and show their vocabulary as
# word clouds. A plain `>=` is used instead of `.between(cutoff, 1)`: every
# stored score carries a small positive offset, so a score of 1.0 would sit just
# above 1 and be excluded by the upper bound.
most_positive = f_data[f_data['Positive Sentiment'] >= 0.4]
most_negative = f_data[f_data['Negative Sentiment'] >= 0.25]

# Join with a space so the last word of one tweet is not fused with the first
# word of the next.
most_positive_text = ' '.join(most_positive.text)
most_negative_text = ' '.join(most_negative.text)

pwc = WordCloud(width=600, height=400, collocations=False).generate(most_positive_text)
nwc = WordCloud(width=600, height=400, collocations=False).generate(most_negative_text)

plt.subplot(1, 2, 1)
plt.title("Common words among Positive Tweets", fontsize=16, fontweight='bold')
plt.imshow(pwc)
plt.axis('off')

plt.subplot(1, 2, 2)
plt.title("Common Words among Negative Tweets", fontsize=16, fontweight='bold')
plt.imshow(nwc)  # the negative panel shows the negative cloud
plt.axis('off')

plt.show()

In [None]:
def _count_words(text):
    """Return ``{word: count}`` for the non-stopword tokens of ``text``.

    The dictionary is ordered from most to least frequent; words with equal
    counts keep their first-appearance order because ``sorted`` is stable.
    """
    counts = {}
    for word in text.split():  # split() already discards surrounding whitespace
        if word not in STOPWORDS:
            counts[word] = counts.get(word, 0) + 1
    return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))


# Word frequencies of the most positive / most negative tweets.
wl_dict = _count_words(most_positive_text)
w2_dict = _count_words(most_negative_text)

# The dictionaries are already sorted, so the first ten keys are the top ten words.
top_10_pos = list(wl_dict)[:10]
top_10_neg = list(w2_dict)[:10]

# Render the ten most frequent words of each group side by side. The words are
# joined with a space so WordCloud sees ten separate tokens rather than one
# fused string.
plt.subplot(1, 2, 1)
w_c = WordCloud(width=600, height=400, collocations=False,
                colormap='nipy_spectral').generate(' '.join(top_10_pos))
plt.title('Top 10 words in Most Positive tweets', fontsize=19, fontweight='bold')
plt.imshow(w_c)
plt.axis('off')

plt.subplot(1, 2, 2)
w_c = WordCloud(width=600, height=400, collocations=False,
                colormap='nipy_spectral').generate(' '.join(top_10_neg))
plt.title('Top 10 words in Most Negative tweets', fontsize=19, fontweight='bold')
plt.imshow(w_c)
plt.axis('off')

plt.show()