# COVID-19 Vaccine Twitter Data Wrangling

In [1]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

In [2]:
# Import/Load Data - CSV file: 'covidvaccine.csv'
file = 'covidvaccine.csv'
df = pd.read_csv(file)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Initial look at the data

In [3]:
df.shape

(328619, 13)

In [4]:
df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,MyNewsNE,Assam,MyNewsNE a dedicated multi-lingual media house...,24-05-2020 10:18,64.0,11.0,110.0,False,18-08-2020 12:55,Australia to Manufacture Covid-19 Vaccine and ...,['CovidVaccine'],Twitter Web App,False
1,Shubham Gupta,,I will tell about all experiences of my life f...,14-08-2020 16:42,1.0,17.0,0.0,False,18-08-2020 12:55,#CoronavirusVaccine #CoronaVaccine #CovidVacci...,"['CoronavirusVaccine', 'CoronaVaccine', 'Covid...",Twitter for Android,False
2,Journal of Infectiology,,Journal of Infectiology (ISSN 2689-9981) is ac...,14-12-2017 07:07,143.0,566.0,8.0,False,18-08-2020 12:46,Deaths due to COVID-19 in Affected Countries\n...,,Twitter Web App,False
3,Zane,,Fresher than you.,18-09-2019 11:01,29.0,25.0,620.0,False,18-08-2020 12:45,@Team_Subhashree @subhashreesotwe @iamrajchoco...,,Twitter for Android,False
4,Ann-Maree O’Connor,"Adelaide, South Australia",Retired university administrator. Melburnian b...,24-01-2013 14:53,83.0,497.0,10737.0,False,18-08-2020 12:45,@michellegrattan @ConversationEDU This is what...,,Twitter Web App,False


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328619 entries, 0 to 328618
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   user_name         328613 non-null  object 
 1   user_location     286675 non-null  object 
 2   user_description  317700 non-null  object 
 3   user_created      197198 non-null  object 
 4   user_followers    197197 non-null  float64
 5   user_friends      197197 non-null  object 
 6   user_favourites   197197 non-null  object 
 7   user_verified     197197 non-null  object 
 8   date              197195 non-null  object 
 9   text              197197 non-null  object 
 10  hashtags          135581 non-null  object 
 11  source            194798 non-null  object 
 12  is_retweet        197189 non-null  object 
dtypes: float64(1), object(12)
memory usage: 32.6+ MB


In [6]:
# Set 'date' and 'user_created' columns as Datetime
df['date'] = pd.to_datetime(df.date, errors='coerce').dt.strftime('%Y-%m-%d %H:%M')
df['date'] = pd.to_datetime(df.date)

df['user_created'] = pd.to_datetime(df.user_created, errors='coerce').dt.strftime('%Y-%m-%d %H:%M')
df['user_created'] = pd.to_datetime(df.user_created)

In [7]:
# Set 'user_friends' and 'user_favourites' as float
df['user_friends'] = pd.to_numeric(df.user_friends, errors='coerce')
df['user_favourites'] = pd.to_numeric(df.user_favourites, errors='coerce')

In [8]:
# Set 'user_verified' and 'is_retweet' columns as bool
df['user_verified'] = df['user_verified'].astype(bool)
df['is_retweet'] = df['is_retweet'].astype(bool)

In [9]:
# Check datatypes
df.dtypes

user_name                   object
user_location               object
user_description            object
user_created        datetime64[ns]
user_followers             float64
user_friends               float64
user_favourites            float64
user_verified                 bool
date                datetime64[ns]
text                        object
hashtags                    object
source                      object
is_retweet                    bool
dtype: object

## Check missing values

In [10]:
# Check missing values in columns
def missing_values():
    missing = pd.concat([df.isnull().sum(), 100 * df.isnull().mean()], axis=1)
    missing.columns = ['count', '%']
    missing.sort_values(by=['count','%'], inplace=True)
    return missing

In [11]:
missing_values()

Unnamed: 0,count,%
user_verified,0,0.0
is_retweet,0,0.0
user_name,6,0.001826
user_description,10919,3.322693
user_location,41944,12.763717
user_followers,131422,39.99221
text,131422,39.99221
user_created,131428,39.994036
user_friends,131429,39.99434
user_favourites,131429,39.99434


Wow, there are several fields that have around 131,428 missing values.  
  
Let's check to see if there are duplicate entries.

&nbsp;

In [12]:
df[df.duplicated(keep=False)]

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
64305,Fay Moody,Newcastle,A good head and a good heart are always a form...,NaT,,,,True,NaT,,,,True
185732,Fay Moody,Newcastle,A good head and a good heart are always a form...,NaT,,,,True,NaT,,,,True
185733,Fay Moody,Newcastle,A good head and a good heart are always a form...,NaT,,,,True,NaT,,,,True
185734,Fay Moody,Newcastle,A good head and a good heart are always a form...,NaT,,,,True,NaT,,,,True
185735,Fay Moody,Newcastle,A good head and a good heart are always a form...,NaT,,,,True,NaT,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
327675,Chelsea Baird,"Dundas, Ontario",waiting for the next big thing to happen in my...,NaT,,,,True,NaT,,,,True
327676,Chelsea Baird,"Dundas, Ontario",waiting for the next big thing to happen in my...,NaT,,,,True,NaT,,,,True
327677,Chelsea Baird,"Dundas, Ontario",waiting for the next big thing to happen in my...,NaT,,,,True,NaT,,,,True
327678,Chelsea Baird,"Dundas, Ontario",waiting for the next big thing to happen in my...,NaT,,,,True,NaT,,,,True


In [13]:
df_dups = df[df.duplicated(keep=False)]
dup_count = df_dups.duplicated(keep=False).groupby(df_dups['user_name']).value_counts()

print(dup_count)
print("Total number of duplicates: " + str(dup_count.sum())) 

user_name          
Chelsea Baird  True    62504
Fay Moody      True    10877
Mr. W. L.      True    58037
dtype: int64
Total number of duplicates: 131418


We found 131,418 duplicate records spread across 3 user_names.  
  
We will go ahead and drop them from our main dataframe.

&nbsp;

In [14]:
df.drop_duplicates(keep=False, inplace=True)

In [15]:
# Recheck missing values
missing_values()

Unnamed: 0,count,%
user_verified,0,0.0
is_retweet,0,0.0
user_followers,4,0.002028
text,4,0.002028
user_name,6,0.003043
user_created,10,0.005071
user_friends,11,0.005578
user_favourites,11,0.005578
date,11,0.005578
source,2403,1.218554


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 197201 entries, 0 to 328618
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   user_name         197195 non-null  object        
 1   user_location     155257 non-null  object        
 2   user_description  186282 non-null  object        
 3   user_created      197191 non-null  datetime64[ns]
 4   user_followers    197197 non-null  float64       
 5   user_friends      197190 non-null  float64       
 6   user_favourites   197190 non-null  float64       
 7   user_verified     197201 non-null  bool          
 8   date              197190 non-null  datetime64[ns]
 9   text              197197 non-null  object        
 10  hashtags          135581 non-null  object        
 11  source            194798 non-null  object        
 12  is_retweet        197201 non-null  bool          
dtypes: bool(2), datetime64[ns](2), float64(3), object(6)
memory

In [17]:
# Let's filter and check the records where there are missing values in the 'user_name' and 'date' column
df.loc[df[['user_name','date']].isnull().any(axis=1)]

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
23986,#edutwitter #CovidVaccine,"['edutwitter', 'CovidVaccine']",Twitter for iPhone,NaT,,,,True,NaT,,,,True
27430,Samuel,"SA,Mpumalanga secunda",Life is a Gift and every day it a Celebration.,NaT,,,,True,NaT,,,,True
27431,265208E2 #BeyHive #SameLove,2014-05-03 07:38:07,129,NaT,444.0,,,True,NaT,Twitter for Android,False,,True
45326,JTKohlrieser,O-H-I-O,Don’t go around saying the world owes you a li...,NaT,,,,True,NaT,,,,True
45327,#GoBucks #GoReds #GoBrowns #CBJ,2009-06-01 17:41:19,152,NaT,13950.0,,,True,NaT,Twitter for Android,False,,True
51425,,,@PelosiLovesDJT's account is temporarily unava...,2021-01-10 04:52:00,90.0,36.0,37.0,True,2021-01-12 04:17:00,@PelosiLovesDJT's account is temporarily unava...,,Twitter for Android,False
64306,A person of little confidence but many convic...,2010-09-19 20:03:23,92,NaT,7533.0,,,True,NaT,Twitter Web App,False,,True
110775,Sickle Cell & Thal,City & Hackney,Sickle Cell & Thalassaemia Services Homerton h...,NaT,,,,True,NaT,,,,True
110776,see our website for details about the service...,2013-04-09 13:58:37,935,NaT,27.0,,,True,NaT,Twitter for iPhone,False,,True
121242,,,@farrahraja's account has been withheld in Ind...,2015-08-22 22:43:00,4989.0,1482.0,145584.0,True,2021-02-06 11:41:00,@farrahraja's account has been withheld in Ind...,,Twitter for Android,False


Many of these records contain NaN/NaT values in numerous fields including the 'text' and 'hashtags' columns which are going to be key features for our model. We'll go ahead and drop these rows since they do not contain any useful information that we could use.

&nbsp;

In [18]:
df = df.loc[~df[['user_name','date']].isnull().any(axis=1)]

In [19]:
missing_values()

Unnamed: 0,count,%
user_name,0,0.0
user_created,0,0.0
user_followers,0,0.0
user_friends,0,0.0
user_favourites,0,0.0
user_verified,0,0.0
date,0,0.0
text,0,0.0
is_retweet,0,0.0
source,2392,1.21308


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 197184 entries, 0 to 328618
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   user_name         197184 non-null  object        
 1   user_location     155246 non-null  object        
 2   user_description  186265 non-null  object        
 3   user_created      197184 non-null  datetime64[ns]
 4   user_followers    197184 non-null  float64       
 5   user_friends      197184 non-null  float64       
 6   user_favourites   197184 non-null  float64       
 7   user_verified     197184 non-null  bool          
 8   date              197184 non-null  datetime64[ns]
 9   text              197184 non-null  object        
 10  hashtags          135574 non-null  object        
 11  source            194792 non-null  object        
 12  is_retweet        197184 non-null  bool          
dtypes: bool(2), datetime64[ns](2), float64(3), object(6)
memory

## Preprocess/clean hashtags column

In [21]:
# Drop NaN values in hashtags column
df = df.dropna(subset=['hashtags'], axis=0)

In [22]:
# Clean hashtags column
df['clean_hashtags'] = df['hashtags'].astype('str')
df['clean_hashtags'] = df['clean_hashtags'].apply(lambda x: x[1:-2]).str.replace(r"[\"\'\-\ー\_]", '').str.lower()

In [23]:
df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet,clean_hashtags
0,MyNewsNE,Assam,MyNewsNE a dedicated multi-lingual media house...,2020-05-24 10:18:00,64.0,11.0,110.0,True,2020-08-18 12:55:00,Australia to Manufacture Covid-19 Vaccine and ...,['CovidVaccine'],Twitter Web App,False,covidvaccine
1,Shubham Gupta,,I will tell about all experiences of my life f...,2020-08-14 16:42:00,1.0,17.0,0.0,True,2020-08-18 12:55:00,#CoronavirusVaccine #CoronaVaccine #CovidVacci...,"['CoronavirusVaccine', 'CoronaVaccine', 'Covid...",Twitter for Android,False,"coronavirusvaccine, coronavaccine, covidvaccine"
5,Raunak Scherbatsky DankWorth,,Neuro surgeon + Diagnostician.👨‍⚕️\na good phy...,2020-03-08 13:39:00,3.0,27.0,918.0,True,2020-08-18 12:44:00,The Multi-system Inflammatory Syndrome-Childre...,"['COVID19', 'COVID19India']",Twitter for Android,False,"covid19, covid19india"
7,AKisASocialisolationist wash yer damn hands,The Great Pacific Northwest,"Iconoclast, cat person, soccer fan, textile & ...",2015-07-02 07:24:00,2321.0,3236.0,264351.0,True,2020-08-18 12:30:00,"@MSNBC Well, let’s qualify that: would anyone ...",['CovidVaccine'],Twitter for iPhone,False,covidvaccine
8,Dr. Joseph Santoro,"Washington, DC 20009","Neuro PhD, #Innovator, #Technologist, #Startup...",2009-01-17 21:10:00,19091.0,20986.0,128119.0,True,2020-08-18 12:15:00,"Most countries, without the ability to make #V...",['Vaccines'],Hootsuite Inc.,False,vaccines


In [24]:
# Check/Sort unique hashtags
def clean_hashtags(data):
    tokens = [re.sub("'","",token) for token in data.values]
    clean_hashtag = ", ".join(tokens)
    clean_hashtag = clean_hashtag.split()

    while ',' in clean_hashtag:
        clean_hashtag.remove(',')
    
    clean_hashtag = ''.join(clean_hashtag)

    x = clean_hashtag.lower().split(',')
    
    print("There are {} unique hashtags: ".format(len(set(x))))
    print('')
    print(sorted(set(x)))

In [25]:
clean_hashtags(df['clean_hashtags'])

There are 27244 unique hashtags: 



We found 27,244 unique hashtags. However, there appears to be non-english words in the clean_hashtags column.   
   
Let's go ahead and remove the non-english hashtags.

&nbsp;

In [26]:
# Filter hashtags that contain ASCII characters
df = df[df['clean_hashtags'].map(lambda x: x.isascii())]

In [27]:
# Recheck: Sort unique hashtags
clean_hashtags(df['clean_hashtags'])

There are 26930 unique hashtags: 



After cleaning and filtering hashtags, we found 26,930 unique English hashtags.

&nbsp;

## Preprocess text column

In [28]:
# Text Munging
def clean_tweets(text):
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    text = re.sub('[^A-Za-z0-9 ]+', '', text)
    text = re.sub("(?<=[a-z])'(?=[a-z])", "", text)
    text = re.sub("RT @[\w]*:","",text)
    text = re.sub("@[\w]*","",text)
    text = re.sub("\n","",text)
    text = re.sub(" +"," ",text)
    text = re.sub(r"(?<!\d)[.,;:](?!\d)"," ",text)
    text = re.sub("^\s+|\s+$", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.lower()

In [29]:
df['clean_text'] = df['text'].apply(lambda x: clean_tweets(x))

## Sample of 200 to manually label data

In [30]:
df.sample(n=200, random_state=1).to_csv('covid200sample.csv', index=False)