<a href="https://colab.research.google.com/github/beedrumms/Vax-Tweets-in-Ontario/blob/main/Preprocessing_Vax_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import essential packages for basic processing
import re 
import string
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from wordcloud import WordCloud
from datetime import datetime

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('punkt')

import gensim
from gensim.parsing.preprocessing import remove_stopwords


import sklearn 
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from collections import defaultdict
from pprint import pprint

from google.colab import drive, files
import os

# Mount Google Drive inside the Colab runtime at /content/drive so the
# '/content/drive/MyDrive/...' CSV paths used below resolve.
drive.mount("/content/drive")

In [None]:
# Load the raw export from the mounted Drive.
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/vaccine_tweets.csv')

# Keep positional columns 3 and 10 (renamed to Datetime / Text below —
# presumably the timestamp and tweet body; confirm against the CSV header).
# .copy() gives an independent frame, so renaming and filtering never act on
# a view of `data` (avoids SettingWithCopyWarning).
tweets = data.iloc[:, [3, 10]].copy()
tweets.columns = ["Datetime", "Text"]
print(len(tweets))  # row count before filtering

# Drop rows without text (the string-cleaning functions cannot handle NaN)
# and repeated tweet texts; reassign instead of mutating in place so the cell
# can be re-run safely.
tweets = tweets.dropna(subset=["Text"]).drop_duplicates(subset="Text")
print(len(tweets))  # row count after filtering

tweets.head(3)

# Preprocessing

In [None]:
# Cleaning functions for text data
import re 
import string
def clean(tweets):
    """Aggressively normalise the text of one tweet.

    Lower-cases the text, then removes links, hashtags, @-handles,
    punctuation, curly quotes and digits. Line breaks are replaced by a single
    space so that words on neighbouring lines are not fused together and a
    hashtag or handle at the end of a line cannot swallow the next line.

    Parameters
    ----------
    tweets : str
        Text of a single tweet (the parameter name is kept for compatibility
        with existing callers).

    Returns
    -------
    str
        The cleaned text. Runs of spaces left behind by removed tokens are
        not collapsed or trimmed here.
    """
    tweets = tweets.lower()
    # Links: http or https, with or without the colon (broken links).
    tweets = re.sub(r'https?:?//[A-Za-z0-9./]+', '', tweets)
    # Bare www links. The dots are escaped so that ordinary words containing
    # "www" (e.g. "awww so cute") are no longer matched and deleted.
    tweets = re.sub(r'www\.\w+\.\w+', '', tweets)
    # Carriage returns / new lines become one space.
    tweets = re.sub(r'[\r\n]+', ' ', tweets)
    tweets = re.sub(r'#[a-zA-Z0-9_-]+', '', tweets)  # remove all hashtags
    tweets = re.sub(r'\s&amp;\s', ' and ', tweets)  # HTML-escaped ampersand
    tweets = re.sub(r'@\w{0,18}[a-zA-Z0-9_]', '', tweets)  # remove all twitter handles
    # Remove punctuation. '|' is listed here because the old newline pattern
    # used to strip it as a side effect.
    tweets = re.sub(r'[#\'"%\-_.?!,;:&/*\]|]', '', tweets)
    tweets = re.sub('[’‘“”]', '', tweets)  # curly quotes the ASCII class misses
    tweets = re.sub('[0-9]', '', tweets)  # remove digits
    return tweets


def cleaning(x):
    """Adapter around `clean`, kept under its original name for `.apply(cleaning)`."""
    return clean(x)

# Compiled once at definition time rather than on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    "\U0001F680-\U0001F6FF"  # transport and map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "\uFE0F\u200D"           # variation selector-16 / zero-width joiner left over from emoji sequences
    "]+"
)


def removing_emojis(text):
    """Return `text` with emoji characters removed.

    Besides the four ranges originally handled, this also covers the
    supplemental pictograph blocks, miscellaneous symbols, dingbats and the
    invisible joiner / variation-selector code points that would otherwise
    remain after an emoji sequence is stripped.

    Parameters
    ----------
    text : str
        Text of a single tweet.

    Returns
    -------
    str
        The text without the matched characters; surrounding whitespace is
        left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)


def emoji_removal(x):
    """Adapter around `removing_emojis`, kept under its original name for `.apply(emoji_removal)`."""
    return removing_emojis(x)

In [None]:
# Work on a copy so the de-duplicated `tweets` frame stays untouched.
tweets_df = tweets.copy()

# Keep the original text next to the processed text so every exported row
# carries both versions.
tweets_for_sentiments = tweets_df['Text']
tweets_df['Text_Unprocessed'] = tweets_for_sentiments

# Full clean-up followed by emoji stripping. `.apply` already returns a
# Series aligned on the index, so it is assigned directly with bracket
# notation (no DataFrame wrapper, no attribute-style column assignment).
tweets_df['Text'] = tweets_df['Text'].apply(cleaning).apply(emoji_removal)

In [None]:
# This function leaves caps and some punctuation for better performance in the sentiment analyzer
def basic_clean(tweets):
    """Lightly clean one tweet while keeping cues useful for sentiment analysis.

    Removes links, hashtags, @-handles, a small set of symbols, curly quotes
    and digits. Capitalisation and sentence punctuation (. ? ! , :) are left
    in place. Line breaks are replaced by a single space so that neighbouring
    lines are not fused together.

    Parameters
    ----------
    tweets : str
        Text of a single tweet (the parameter name is kept for compatibility
        with existing callers).

    Returns
    -------
    str
        The lightly cleaned text. Runs of spaces left behind by removed
        tokens are not collapsed or trimmed here.
    """
    # Links: http or https, with or without the colon (broken links).
    tweets = re.sub(r'https?:?//[A-Za-z0-9./]+', '', tweets)
    # Bare www links. The dots are escaped so that ordinary words containing
    # "www" (e.g. "Awww so cute") are no longer matched and deleted.
    tweets = re.sub(r'www\.\w+\.\w+', '', tweets)
    # Only carriage returns / new lines are matched; the previous character
    # class also deleted '?' and '|', which removed question marks this
    # variant is supposed to keep.
    tweets = re.sub(r'[\r\n]+', ' ', tweets)
    tweets = re.sub(r'#[a-zA-Z0-9_-]+', '', tweets)  # remove all hashtags
    tweets = re.sub(r'\s&amp;\s', ' and ', tweets)  # HTML-escaped ampersand
    tweets = re.sub(r'@\w{0,18}[a-zA-Z0-9_]', '', tweets)  # remove remaining usernames
    tweets = re.sub(r'[#\'"%\-_;*\]]', '', tweets)  # remove some punctuation
    tweets = re.sub('[’‘“”]', '', tweets)  # curly quotes, same set as in clean()
    tweets = re.sub('[0-9]', '', tweets)  # remove digits
    return tweets


def basic_cleaning(x):
    """Adapter around `basic_clean`, kept under its original name for `.apply(basic_cleaning)`."""
    return basic_clean(x)

In [None]:
# Light clean of the raw text column via basic_cleaning. `.apply` already
# returns an index-aligned Series, so it is assigned directly with bracket
# notation instead of being wrapped in a DataFrame.
tweets_df['Text_Unprocessed'] = tweets_df['Text_Unprocessed'].apply(basic_cleaning)

In [None]:
# Strip stop words from every cleaned tweet
tweets_no_stopwords = [remove_stopwords(text) for text in tweets_df['Text']]

# Trim leading/trailing white space
removed_spaces = [text.strip() for text in tweets_no_stopwords]

# Make sure every entry is a plain string
tweet_string = [str(text) for text in removed_spaces]

In [None]:
# Installing spacy model
!pip install -U spacy
!python -m spacy validate
!python -m spacy download en_core_web_md

In [None]:
# Lemmatize
import spacy
# Load the pipeline with the 'parser' and 'ner' components disabled.
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])
# nlp.pipe processes the texts as a batched stream instead of one nlp() call
# per tweet; each Doc is expanded into a list of its tokens, as before.
docs_processed = [list(doc) for doc in nlp.pipe(tweet_string)]

In [None]:
# Recreate sentences with lemma forms: one space-joined string per tweet,
# built directly from each token's lemma (no scratch list to reset).
lemma_sentences = [
    ' '.join(token.lemma_ for token in doc_tokens)
    for doc_tokens in docs_processed
]

# Kept under its original name; str() guarantees plain strings.
final_texts = [str(sentence) for sentence in lemma_sentences]

# Overwrite text data in dataframe with lemma versions of tweets
tweets_df['Text'] = final_texts

In [None]:
# Persist the processed frame next to the raw export on Drive.
# NOTE(review): no `index=` argument is passed, so pandas' default index
# handling applies (presumably the index is written as an extra first
# column) — confirm downstream readers expect that layout.
tweets_df.to_csv('/content/drive/MyDrive/Colab Notebooks/processed_vax_tweets.csv')