In [1]:
import pandas as pd
# NOTE(review): wildcard import kept in its original position because it can
# shadow or be shadowed by neighbouring names; it presumably supplies
# `sentence_cleaning` -- confirm and consider importing that name explicitly.
from utils.utils import *
import os
import nltk
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
pd.set_option('max_colwidth', None)

## Loading the data

In [2]:
# Read the scraped tweets from the project-relative CSV into the working frame.
df = pd.read_csv(filepath_or_buffer='data/twitter_scrape.csv')

In [3]:
# Preview the first rows of the freshly loaded frame (rich display, no print).
df.head()

Unnamed: 0,TimeStamps,Screen name,Username,Tweets
0,2022-02-25 05:11:24+00:00,BiovoiceNews,BioVoice News,BIO-BUSINESS: Zydus Group announced its new brand identity with the listed entity of the group Cadila Healthcare Limited to be now known as #ZydusLifesciences Limited.\n\nMore: https://t.co/qAqZQbhRqT via #6yearsofBioVoiceNews @ZydusUniverse #zydus #pharma #biopharma #vaccine
1,2022-02-25 05:08:18+00:00,3rdworldnetwork,Third World Network,"#WTO: Dangers of ""take-it-or-leave-it"" compromise outcome on #TRIPSwaiver \nSouth Africa expressed concern that the delay in approving a TRIPS waiver is hampering efforts to diversify proper production of vaccines &amp; address ""#vaccine inequity.\n\n#covid19\n\n➡️https://t.co/eIxVzoo78L https://t.co/Mf8YXC15jF"
2,2022-02-25 05:06:03+00:00,NST_Online,New Straits Times,"#NSTnation State Health director Dr Othman Warijo said the case involved a 10-year-old girl with a history of asthma, who received her vaccine at the Sultanah Bahiyah Hospital.\n\n#Kedah #Children #Vaccine #Covid19 #PICKids \n\nhttps://t.co/Rg2WTa1sIQ"
3,2022-02-25 05:00:00+00:00,ppe_china,ChinaPPE,Non-Medical Face Mask KN95\nProtect you from non-oily airborne pollutants\n\n#immunity #vaccine #TestKit #glove #3plymask #ff2 #KN95 https://t.co/IdVuPqS1li
4,2022-02-25 04:59:49+00:00,Dog8Bone,Cryptogarb,https://t.co/4ySms7FGHF ForSale\n#Vaccine #COVID19 #Covid #Medical #Life #Domains #Technology #Tech #Science #medicine #Doctor #100DaysOfCode #Bot #Memes #BigData #Security #Cloud #javascript #java #datascience #MachineLearning #web3 #NFT #VC #Investor #socialmedia #branding #ai https://t.co/pXaSo88a1o


In [4]:
# Run the project's cleaning helper on every raw tweet and keep the result in
# a new column, leaving the original 'Tweets' text untouched.
# NOTE(review): `sentence_cleaning` is not defined in this notebook; presumably
# it comes from the wildcard `utils.utils` import -- verify what it strips.
df['Tweets_corrected'] = df['Tweets'].apply(sentence_cleaning)

## Lemmatization

In [5]:
# The model name is written to the environment and read straight back when
# loading, so the pipeline used here is always "en_core_web_sm".
os.environ['LANGUAGE_MODEL_SPACY'] = "en_core_web_sm"
lemmatizer = spacy.load(os.environ['LANGUAGE_MODEL_SPACY'])

# Replace each cleaned tweet by the space-joined lemmas of its spaCy tokens.
df['lemmatized_tweets'] = df['Tweets_corrected'].apply(
    lambda text: ' '.join(token.lemma_ for token in lemmatizer(text))
)

## VADER sentiment scoring

In [6]:
# Build the VADER analyzer. The constructor needs NLTK's 'vader_lexicon'
# resource, which is installed separately from the package; on a fresh
# environment it raises LookupError. Fetch the resource once and retry so the
# notebook survives Restart & Run All.
try:
    analyzer = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    analyzer = SentimentIntensityAnalyzer()

# Keep only the 'compound' entry of the score dict returned for each tweet.
# NOTE(review): scoring runs on the cleaned + lemmatized text (which contains
# artefacts such as '-PRON-'); VADER is documented as using cues from raw text
# such as capitalization and punctuation -- confirm this input is intended.
df['VADER_compound'] = df['lemmatized_tweets'].apply(
    lambda text: analyzer.polarity_scores(text)['compound']
)

In [7]:
def assign_sentiment(score: float, threshold: float = 0.1) -> str:
    """Map a sentiment score to a categorical label.

    Parameters
    ----------
    score : float
        Score to classify (the notebook passes the VADER compound score).
    threshold : float, optional
        Non-negative half-width of the neutral band. Scores at or above
        ``threshold`` are 'Positive'; scores at or below ``-threshold`` are
        'Negative'. Defaults to 0.1, the cutoff that used to be hard-coded,
        so existing single-argument callers behave exactly as before.

    Returns
    -------
    str
        'Positive', 'Negative' or 'Neutral'. A NaN score compares false
        against both bounds and therefore falls through to 'Neutral'.

    Raises
    ------
    ValueError
        If ``threshold`` is negative (the two bands would overlap).
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")
    if score >= threshold:
        return 'Positive'
    if score <= -threshold:
        return 'Negative'
    return 'Neutral'

In [8]:
# Turn every compound score into its label, one element at a time.
df['VADER_sentiment'] = df['VADER_compound'].map(assign_sentiment)

In [9]:
# Spot-check the new columns next to the original tweets.
df.head()

Unnamed: 0,TimeStamps,Screen name,Username,Tweets,Tweets_corrected,lemmatized_tweets,VADER_compound,VADER_sentiment
0,2022-02-25 05:11:24+00:00,BiovoiceNews,BioVoice News,BIO-BUSINESS: Zydus Group announced its new brand identity with the listed entity of the group Cadila Healthcare Limited to be now known as #ZydusLifesciences Limited.\n\nMore: https://t.co/qAqZQbhRqT via #6yearsofBioVoiceNews @ZydusUniverse #zydus #pharma #biopharma #vaccine,zydus group announced its new brand identity with the listed entity of the group cadila healthcare limited to be now known as zyduslifesciences limited more,zydus group announce -PRON- new brand identity with the list entity of the group cadila healthcare limit to be now know as zyduslifescience limited more,-0.2263,Negative
1,2022-02-25 05:08:18+00:00,3rdworldnetwork,Third World Network,"#WTO: Dangers of ""take-it-or-leave-it"" compromise outcome on #TRIPSwaiver \nSouth Africa expressed concern that the delay in approving a TRIPS waiver is hampering efforts to diversify proper production of vaccines &amp; address ""#vaccine inequity.\n\n#covid19\n\n➡️https://t.co/eIxVzoo78L https://t.co/Mf8YXC15jF",dangers of take it or leave it compromise outcome on tripswaiver south africa expressed concern that the delay in approving a trips waiver is hampering efforts to diversify proper production of vaccines amp address vaccine inequity covid,danger of take -PRON- or leave -PRON- compromise outcome on tripswaiver south africa express concern that the delay in approve a trip waiver be hamper effort to diversify proper production of vaccine amp address vaccine inequity covid,-0.7096,Negative
2,2022-02-25 05:06:03+00:00,NST_Online,New Straits Times,"#NSTnation State Health director Dr Othman Warijo said the case involved a 10-year-old girl with a history of asthma, who received her vaccine at the Sultanah Bahiyah Hospital.\n\n#Kedah #Children #Vaccine #Covid19 #PICKids \n\nhttps://t.co/Rg2WTa1sIQ",nstnation state health director dr othman warijo said the case involved a year old girl with a history of asthma who received her vaccine at the sultanah bahiyah hospital kedah children vaccine covid pickids,nstnation state health director dr othman warijo say the case involve a year old girl with a history of asthma who receive -PRON- vaccine at the sultanah bahiyah hospital kedah child vaccine covid pickid,0.0,Neutral
3,2022-02-25 05:00:00+00:00,ppe_china,ChinaPPE,Non-Medical Face Mask KN95\nProtect you from non-oily airborne pollutants\n\n#immunity #vaccine #TestKit #glove #3plymask #ff2 #KN95 https://t.co/IdVuPqS1li,non medical face mask kn protect you from non oily airborne pollutants immunity vaccine testkit glove plymask ff kn,non medical face mask kn protect -PRON- from non oily airborne pollutant immunity vaccine testkit glove plymask ff kn,0.6597,Positive
4,2022-02-25 04:59:49+00:00,Dog8Bone,Cryptogarb,https://t.co/4ySms7FGHF ForSale\n#Vaccine #COVID19 #Covid #Medical #Life #Domains #Technology #Tech #Science #medicine #Doctor #100DaysOfCode #Bot #Memes #BigData #Security #Cloud #javascript #java #datascience #MachineLearning #web3 #NFT #VC #Investor #socialmedia #branding #ai https://t.co/pXaSo88a1o,forsale vaccine covid covid medical life domains technology tech science medicine doctor daysofcode bot memes bigdata security cloud javascript java datascience machinelearning web nft vc investor socialmedia branding ai,forsale vaccine covid covid medical life domain technology tech science medicine doctor daysofcode bot memes bigdata security cloud javascript java datascience machinelearning web nft vc investor socialmedia brand ai,0.34,Positive


## Export dataset with VADER sentiment

In [10]:
# Keep only the lemmatized text and its label in a new frame, with the row
# index rebuilt from zero (the old index is discarded, not kept as a column).
export_df = df[['lemmatized_tweets', 'VADER_sentiment']].reset_index(drop=True)

In [11]:
# Preview the two-column frame before it is written to disk.
export_df.head()

Unnamed: 0,lemmatized_tweets,VADER_sentiment
0,zydus group announce -PRON- new brand identity with the list entity of the group cadila healthcare limit to be now know as zyduslifescience limited more,Negative
1,danger of take -PRON- or leave -PRON- compromise outcome on tripswaiver south africa express concern that the delay in approve a trip waiver be hamper effort to diversify proper production of vaccine amp address vaccine inequity covid,Negative
2,nstnation state health director dr othman warijo say the case involve a year old girl with a history of asthma who receive -PRON- vaccine at the sultanah bahiyah hospital kedah child vaccine covid pickid,Neutral
3,non medical face mask kn protect -PRON- from non oily airborne pollutant immunity vaccine testkit glove plymask ff kn,Positive
4,forsale vaccine covid covid medical life domain technology tech science medicine doctor daysofcode bot memes bigdata security cloud javascript java datascience machinelearning web nft vc investor socialmedia brand ai,Positive


In [12]:
# Write the export frame to CSV; index=False keeps the row index out of the file.
export_df.to_csv(path_or_buf='data/tweets_processed.csv', index=False)