### Preprocessing

In [1]:
import pandas as pd 
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag 
import spacy
from emot.emo_unicode import EMOTICONS_EMO
from textblob import Word
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [2]:
#Loading the twitter data
covid_tweets = pd.read_csv('covid_tweets_2021.csv', index_col=[0])

In [3]:
#Removing html code at the end of multiple rows
covid_tweets['Text'] = covid_tweets['Text'].str.split('https',expand=True)[0]

In [4]:
#Removing spaces, punctuations, lowering letters , low frequency words

def preprocessing(df, column):
    df[column] = df[column].apply(lambda x: x.lower())           
    df[column] = df[column].apply(lambda x: re.sub(r"\W"," ", x))
    df[column] = df[column].apply(lambda x: re.sub(r"\d","", x))
    df[column] = df[column].apply(lambda x: re.sub(r"^\s","", x))
    df[column] = df[column].apply(lambda x: re.sub(r"\s$","", x))
    emojis = list(EMOTICONS_EMO.keys())
    df[column] = df[column].apply(lambda x: ''.join([c for c in x if c not in emojis]))
    df[column] = df[column].apply(lambda x: ''.join([c for c in x if c not in string.punctuation]))
    low_frequency_words = pd.Series(' '.join(df[column]).split()).value_counts()[-1000:]
    df[column] = df[column].apply(lambda x: " ".join(x.lower() for x in x.split()if x not in low_frequency_words))

In [5]:
preprocessing(covid_tweets,column='Text')

In [6]:
#Removing stopwords with nltk
english_stopwords = stopwords.words('english')
covid_tweets["Text"]=covid_tweets["Text"].apply(lambda x: " ".join(x for x in x.split() if x not in english_stopwords))

In [7]:
#Removing stopwords with spacy
sp = spacy.load('en_core_web_sm')
sp_stopwords = sp.Defaults.stop_words
covid_tweets["Text"]=covid_tweets["Text"].apply(lambda x: " ".join(x for x in x.split() if x not in sp_stopwords))

### Adjectives with Tokenizer

In [8]:
#Creating a column for adjectives in the tweet

def getAdjectives(tweet):
    tweet = word_tokenize(tweet) 
    #JJ : stands for adjectives
    tweet = [word for (word, tag) in pos_tag(tweet) if tag == "JJ"]  
    return " ".join(tweet)  

covid_tweets['Adjectives']=covid_tweets['Text'].apply(lambda x: getAdjectives(x))

In [9]:
# Lemmatisation 
covid_tweets['Text']=covid_tweets["Text"].apply(lambda x: " ".join([Word(i).lemmatize() for i in x.split()]))

In [10]:
covid_tweets.head()

Unnamed: 0,Datetime,Tweet Id,Text,Username,Adjectives
0,2021-03-31 23:59:58+00:00,1377410374551568387,new study ontario make wonder younger child k ...,freespiritus,new wear
1,2021-03-31 23:59:56+00:00,1377410366032932865,johnson amp johnson covid vaccine dos delayed ...,paldhous,delayed
2,2021-03-31 23:59:55+00:00,1377410361360482304,ontario icu overwhelmed record high occupancy ...,StevenDelDuca,high red grave
3,2021-03-31 23:59:53+00:00,1377410351633940484,american getting high speed internet new biden...,hardknoxfirst,american high new
4,2021-03-31 23:59:50+00:00,1377410338572759041,knew case rising france going lockdown smh par...,SpencerKarter,lockdown smh covid lockdown


In [11]:
covid_tweets.to_csv('preprocessed_tweets.csv')