In [2]:
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB

import tensorflow.compat.v2 as tf

import pandas as pd
import numpy as np 
import warnings
warnings.filterwarnings('ignore')
import re
import string


#https://www.kaggle.com/gpreda/all-covid19-vaccines-tweets

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Load the tweet dataset from a CSV in the working directory into a DataFrame.
# NOTE(review): presumably the Kaggle "all-covid19-vaccines-tweets" export linked in the import cell — confirm.
data = pd.read_csv("vaccination_all_tweets.csv")

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.columns

In [None]:
data.info()

In [None]:
data.describe()

# EDA

In [None]:
#MISSING DATA table

def missing_data(data):
    """Summarise null values for every column of ``data``.

    Returns a transposed DataFrame (one column per input column) with rows:
      * 'Total'   - number of null entries
      * 'Percent' - null entries as a percentage of the row count
      * 'Types'   - the column dtype rendered as a string
    """
    null_mask = data.isnull()
    n_missing = null_mask.sum()
    pct_missing = n_missing / null_mask.count() * 100
    summary = pd.concat([n_missing, pct_missing], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return np.transpose(summary)

missing_data(data)

In [None]:
#Most frequent values

def most_frequent_values(data):
    """Report the most common value of every column of ``data``.

    Returns a transposed DataFrame (one column per input column) with rows:
      * 'Total'              - number of non-null entries
      * 'Most frequent item' - the modal value (NaN if the column has no non-null values)
      * 'Frequence'          - how often the modal value occurs (0 for an all-null column)
      * 'Percent from total' - 'Frequence' as a percentage of 'Total', rounded to 3 decimals
    """
    total = data.count()
    tt = pd.DataFrame(total)
    tt.columns = ['Total']
    items = []
    vals = []
    for col in data.columns:
        # Compute the frequency table once per column instead of twice.
        counts = data[col].value_counts()
        if counts.empty:
            # All-null column: value_counts() is empty, so indexing [0] would raise IndexError.
            items.append(np.nan)
            vals.append(0)
        else:
            items.append(counts.index[0])
            vals.append(counts.iloc[0])
    tt['Most frequent item'] = items
    tt['Frequence'] = vals
    tt['Percent from total'] = np.round(pd.Series(vals, index=total.index) / total * 100, 3)
    return np.transpose(tt)

most_frequent_values(data)

In [None]:
#Visualizations

def plot_count(feature, title, df, size=1, ordered=True):
    """Draw a count plot of ``df[feature]`` with each bar annotated by its share of all rows.

    Parameters
    ----------
    feature : column name in ``df`` to count.
    title   : human-readable name used in the plot title.
    df      : DataFrame holding the data.
    size    : width multiplier (figure is ``4*size`` wide); x tick labels are
              rotated when ``size > 2``.
    ordered : if True, show only the 20 most frequent values, most frequent first.
    """
    f, ax = plt.subplots(1, 1, figsize=(4 * size, 4))
    total = float(len(df))
    order = df[feature].value_counts().index[:20] if ordered else None
    # Pass the column by keyword (x=..., data=...) and draw on the axes created
    # above: a positional Series is no longer interpreted as `x` by recent seaborn.
    g = sns.countplot(x=feature, data=df, order=order, palette='Set3', ax=ax)
    g.set_title("Number and percentage of {}".format(title))
    if size > 2:
        plt.setp(ax.get_xticklabels(), rotation=90, size=8)
    for p in ax.patches:
        height = p.get_height()
        # Skip annotations that cannot be computed (empty frame or empty bar).
        if total == 0 or np.isnan(height):
            continue
        ax.text(p.get_x() + p.get_width() / 2.,
                height,
                '{:1.2f}%'.format(100 * height / total),
                ha="center")
    plt.show()

In [3]:
#USER LOCATION

plot_count("user_location", "User location", data,4)

NameError: name 'plot_count' is not defined

In [None]:
#WORDCLOUDS

from wordcloud import WordCloud, STOPWORDS
def show_wordcloud(data, title=""):
    """Render a word cloud built from the non-null strings in ``data``.

    ``data`` is an iterable Series of text; ``title`` is shown as the figure title.
    The default WordCloud stopwords are extended with a few extra tokens
    ("t", "co", "https", "amp", "U") before the cloud is generated.
    """
    ignored_words = set(STOPWORDS) | {"t", "co", "https", "amp", "U"}
    corpus = " ".join(data.dropna())
    cloud_builder = WordCloud(
        stopwords=ignored_words,
        scale=4,
        max_font_size=50,
        max_words=500,
        background_color="black",
    )
    cloud = cloud_builder.generate(corpus)

    canvas = plt.figure(1, figsize=(16, 16))
    plt.axis('off')
    canvas.suptitle(title, fontsize=20)
    canvas.subplots_adjust(top=2.3)
    plt.imshow(cloud, interpolation='bilinear')
    plt.show()

In [None]:
#MOST PREVALENT WORDS IN TWEETS FROM THE USA

# Keep only rows whose user_location is exactly "United States", then plot their tweet text.
is_us = data['user_location'] == "United States"
us_df = data.loc[is_us]
show_wordcloud(us_df['text'], title='Prevalent words in tweets from US')

# SENTIMENT ANALYSIS

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

# The analyzer loads the VADER lexicon on construction; download it here so this
# cell also works on a fresh kernel (the only other download is in a later cell).
nltk.download('vader_lexicon', quiet=True)
sia = SentimentIntensityAnalyzer()


def find_sentiment(post):
    """Classify ``post`` by the sign of its VADER compound score.

    Returns "Positive" for a compound score above 0, "Negative" for a score
    below 0 and "Neutral" otherwise.
    """
    # Score the text once; the original evaluated polarity_scores() twice per tweet.
    compound = sia.polarity_scores(post)["compound"]
    if compound > 0:
        return "Positive"
    if compound < 0:
        return "Negative"
    return "Neutral"

In [None]:
def plot_sentiment(df, feature, title):
    """Plot the class distribution of ``df[feature]`` as two side-by-side bar charts.

    The left chart shows absolute counts per class, the right chart shows each
    class as a percentage of all rows. ``title`` is used in the axis labels and
    the figure title.
    """
    counts = df[feature].value_counts()
    # Scale to 0-100 so the values match the "Percentage" axis label
    # (the original plotted 0-1 fractions under that label).
    percent = counts / counts.sum() * 100

    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

    counts.plot(kind='bar', ax=ax1, color='green')
    percent.plot(kind='bar', ax=ax2, color='blue')
    ax1.set_ylabel(f'Counts : {title} sentiments', size=12)
    ax2.set_ylabel(f'Percentage : {title} sentiments', size=12)
    plt.suptitle(f"Sentiment analysis: {title}")
    plt.tight_layout()
    plt.show()

In [None]:
# Label every tweet with find_sentiment, then chart how the labels are distributed.
data['sentiment'] = data['text'].apply(find_sentiment)
plot_sentiment(data, 'sentiment', 'Text')

# SENTIMENT ANALYSIS VADER

In [None]:
data.head()

In [None]:
data.columns

In [None]:
# Drop every column listed below (user metadata, tweet metadata and the
# earlier 'sentiment' label) before preprocessing the text.
# errors='ignore' plus rebinding makes the cell safe to re-run: the original
# in-place drop raised KeyError on a second execution because the columns
# were already gone.
data = data.drop(
    columns=['id', 'user_name', 'user_location', 'user_description', 'user_created',
             'user_followers', 'user_friends', 'user_favourites', 'user_verified',
             'date', 'hashtags', 'source', 'retweets', 'favorites',
             'is_retweet', 'sentiment'],
    errors='ignore',
)

In [None]:
data.head()

# preprocess text

In [None]:
#importing stopword from nltk
# Fetch the NLTK 'stopwords' corpus and keep the English list as a set;
# process_tweets filters tokens against this `stopword` set.
nltk.download('stopwords')
stopword = set(stopwords.words('english'))
# Print the whole set for inspection.
print(stopword)

In [None]:
'''
Punkt Sentence Tokenizer

This tokenizer divides a text into a list of sentences
by using an unsupervised algorithm to build a model for abbreviation
words, collocations, and words that start sentences.  It must be
trained on a large collection of plaintext in the target language
before it can be used.

WordNet is used for lemmatizing 
'''

# 'punkt' is the tokenizer model described in the note above; 'wordnet' is used for lemmatizing.
nltk.download('punkt')
nltk.download('wordnet')
# NOTE(review): 'omw-1.4' is presumably required by the WordNet lemmatizer in newer NLTK releases — confirm.
nltk.download('omw-1.4')

## Preprocessing Tweets
The Preprocessing steps taken are:

Lower Casing: Each text is converted to lowercase.

Removing URLs: Links starting with "http" or "https" or "www" are replaced by "".

Removing Usernames: @Usernames are replaced by "". (eg: "@XYZ" to "")

Removing Short Words: Words with length less than 2 are removed.

Removing Stopwords: Stopwords are English words that do not add much meaning to a sentence. They can safely be ignored without sacrificing the meaning of the sentence. (eg: "the", "he", "have")

Lemmatizing: Lemmatization is the process of converting a word to its base form. (e.g: “wolves” to “wolf”)

In [None]:
# Matches http:// and https:// links, plus links written as " www." (preceded by a space).
urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
# Matches an @mention up to the next whitespace. Written as a raw string so that
# "\s" reaches the regex engine instead of being an invalid string escape.
userPattern = r'@[^\s]+'

def process_tweets(tweet):
    """Normalise one tweet into a space-separated string of lemmatized tokens.

    Steps: lower-case, strip URLs (``urlPattern``) and @mentions
    (``userPattern``), remove punctuation, tokenize, drop stopwords and
    one-character tokens, then lemmatize what is left.

    Non-string input (e.g. a NaN from a missing tweet) yields an empty string.
    """
    if not isinstance(tweet, str):
        return ''
    # Lower casing
    tweet = tweet.lower()
    # NOTE: the original sliced off the first character here (tweet[1:]), which
    # truncated the first word and stopped a leading "@user" / "http..." from
    # being matched by the patterns below.
    # Removing all URLs
    tweet = re.sub(urlPattern, '', tweet)
    # Removing all @username
    tweet = re.sub(userPattern, '', tweet)
    # Remove punctuation
    tweet = tweet.translate(str.maketrans("", "", string.punctuation))
    # Tokenizing words
    tokens = word_tokenize(tweet)
    # Drop stopwords and single-character tokens, then lemmatize the rest
    wordLemm = WordNetLemmatizer()
    finalwords = [
        wordLemm.lemmatize(w)
        for w in tokens
        if w not in stopword and len(w) > 1
    ]
    return ' '.join(finalwords)

In [None]:
# Run process_tweets over every value of the text column and store the cleaned result.
data['processed_tweets'] = data['text'].apply(process_tweets)

In [None]:
data.head()

In [None]:
data.shape

In [None]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

def run_vader(text):
    """Return the VADER 'compound' polarity score for ``text``."""
    return sid.polarity_scores(text)['compound']

# Score the raw tweet text, not 'processed_tweets': preprocessing lower-cases the
# text, strips punctuation and removes stopwords (which include negations such as
# "not" and "no"), all of which VADER uses as sentiment cues. This also matches
# the earlier sentiment pass, which scored data['text'].
data['vader_compound'] = data['text'].apply(run_vader)

In [None]:
#Descriptive stats on vader_compound score column

data['vader_compound'].describe()

In [None]:
#create a histogram to see distribution of vader_compound intensity scores
data['vader_compound'].hist()

In [None]:
#create a new column called, "polarity classification"(binary either positive or negative polarity)

def sentiment_classification(row, threshold=0.05):
    """Map a VADER compound score to a polarity label.

    Rule (symmetric around zero):
      * score >=  threshold -> 1   (positive)
      * score <= -threshold -> 0   (negative)
      * otherwise           -> 100 (neutral sentinel, removed in a later cell)

    The original compared against 0.05 on the positive side but -0.5 on the
    negative side, so scores between -0.5 and -0.05 were treated as neutral.

    Parameters
    ----------
    row : compound score for one tweet.
    threshold : absolute score at or beyond which a tweet counts as polar.
    """
    if row >= threshold:
        return 1
    if row <= -threshold:
        return 0
    return 100

# Derive the polarity label of every row from its vader_compound score.
data['polarity'] = data['vader_compound'].apply(sentiment_classification)

In [None]:
data.head()

In [None]:
data['polarity'].value_counts()

In [None]:
# Discard, in place, every row whose polarity is the sentinel 100
# (i.e. rows that were classified as neither positive nor negative).
neutral_rows = data.index[data['polarity'] == 100]
data.drop(neutral_rows, inplace=True)

In [None]:
data.shape

In [None]:
data['polarity'].value_counts()

In [None]:
#dataset is not balanced, use random under-sampling to balance dataset
#way more positives than negatives

# Look the counts up by label rather than unpacking value_counts(): its result is
# ordered by frequency, so positional unpacking silently swaps the two names when
# negatives outnumber positives and fails outright if a third label is present.
polarity_counts = data['polarity'].value_counts()
positive_count_1 = polarity_counts.get(1, 0)
negative_count_0 = polarity_counts.get(0, 0)

In [None]:
# Split the frame into its two polarity classes and report the size of each.
positive_1 = data.loc[data['polarity'].eq(1)]
negative_0 = data.loc[data['polarity'].eq(0)]

print('positive:', positive_1.shape)
print('negative:', negative_0.shape)

In [None]:
# Random under-sampling: draw the same number of rows from each class, sized to
# the smaller class. A fixed random_state makes the balanced sample reproducible
# across kernel restarts, and taking the minimum avoids a ValueError from
# .sample() when the positive class is not the larger one.
n_per_class = min(len(positive_1), len(negative_0))
positive_1_under = positive_1.sample(n_per_class, random_state=42)
negative_0_under = negative_0.sample(n_per_class, random_state=42)
test_under = pd.concat([positive_1_under, negative_0_under], axis=0)

print("total class of 1 and0:",test_under['polarity'].value_counts())
# plot the count after under-sampling
test_under['polarity'].value_counts().plot(kind='bar', title='count (target)')

In [None]:
test_under.head()

In [None]:
test_under.shape

In [None]:
# Persist the balanced sample to a CSV in the working directory.
# NOTE(review): no index argument is passed, so the DataFrame index is presumably
# written as an extra unnamed first column — confirm downstream readers expect that.
test_under.to_csv('balanced_twitter.csv')

# TOPIC MODELLING (LDA)