# Data Cleaning

## Import Libraries

In [1]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
from datetime import datetime as dt

import warnings

warnings.filterwarnings('ignore')

## Load Dataset 'mid_terms22.csv'

In [2]:
# load the original dataset
tweets = pd.read_csv("mid_terms22.csv")

## Data Overview

In [3]:
tweets.reset_index()

Unnamed: 0.1,index,Unnamed: 0,created_at,edit_history_tweet_ids,id,text
0,0,0,2022-10-21 19:45:25+00:00,['1583545010469113856'],1.583545e+18,"Biden touts “record” deficit reduction, critic..."
1,1,1,2022-10-21 19:45:25+00:00,['1583545006584778754'],1.583545e+18,"Sorry, I can't work on my midterms rn. I've go..."
2,2,2,2022-10-21 19:45:23+00:00,['1583545000222027776'],1.583545e+18,@AnadooFerreira @bernardokuster2 @nytimes Midt...
3,3,3,2022-10-21 19:45:20+00:00,['1583544988939431936'],1.583545e+18,@Cirincione @djrothkopf You lost the midterms ...
4,4,4,2022-10-21 19:45:19+00:00,['1583544984405692416'],1.583545e+18,@JosephH32325700 @CNNPolitics I guess everyone...
...,...,...,...,...,...,...
17332,17332,17328,2022-10-14 20:02:54+00:00,['1581012693313429504'],1.581013e+18,@shannonrwatts Republicans don't care. They ne...
17333,17333,17329,2022-10-14 20:00:17+00:00,['1581012034703216642'],1.581012e+18,My #Midterms2022 winner picks part III\nAdam F...
17334,17334,17330,2022-10-14 20:00:02+00:00,['1581011971691773953'],1.581012e+18,MT: Share your story: Why should Big Tech be h...
17335,17335,17331,2022-10-14 20:00:01+00:00,['1581011968000987136'],1.581012e+18,Umfrage zur U.S. Senatswahl in #Pennsylvania v...


In [4]:
tweets.head()

Unnamed: 0.1,Unnamed: 0,created_at,edit_history_tweet_ids,id,text
0,0,2022-10-21 19:45:25+00:00,['1583545010469113856'],1.583545e+18,"Biden touts “record” deficit reduction, critic..."
1,1,2022-10-21 19:45:25+00:00,['1583545006584778754'],1.583545e+18,"Sorry, I can't work on my midterms rn. I've go..."
2,2,2022-10-21 19:45:23+00:00,['1583545000222027776'],1.583545e+18,@AnadooFerreira @bernardokuster2 @nytimes Midt...
3,3,2022-10-21 19:45:20+00:00,['1583544988939431936'],1.583545e+18,@Cirincione @djrothkopf You lost the midterms ...
4,4,2022-10-21 19:45:19+00:00,['1583544984405692416'],1.583545e+18,@JosephH32325700 @CNNPolitics I guess everyone...


In [5]:
tweets.tail()

Unnamed: 0.1,Unnamed: 0,created_at,edit_history_tweet_ids,id,text
17332,17328,2022-10-14 20:02:54+00:00,['1581012693313429504'],1.581013e+18,@shannonrwatts Republicans don't care. They ne...
17333,17329,2022-10-14 20:00:17+00:00,['1581012034703216642'],1.581012e+18,My #Midterms2022 winner picks part III\nAdam F...
17334,17330,2022-10-14 20:00:02+00:00,['1581011971691773953'],1.581012e+18,MT: Share your story: Why should Big Tech be h...
17335,17331,2022-10-14 20:00:01+00:00,['1581011968000987136'],1.581012e+18,Umfrage zur U.S. Senatswahl in #Pennsylvania v...
17336,17332,2022-10-14 19:57:45+00:00,['1581011395780304896'],1.581011e+18,My #Midterms2022 winner picks part II\nSarah H...


In [6]:
tweets.shape

(17337, 5)

Observation:
- There are 17337 observatioins/rows in the dataset.
- There are 5 columns in the dataset.

In [7]:
tweets[tweets.duplicated()].count()

Unnamed: 0                2
created_at                0
edit_history_tweet_ids    0
id                        0
text                      0
dtype: int64

Observation:
- There are two duplicate entries.
- We will drop the duplicate entries to maintain date integrity.

### Drop Duplicates

In [8]:
tweets = tweets.drop_duplicates()

In [9]:
tweets.shape

(17335, 5)

Observation:
- Now we have 17,335 observations/rows.

In [10]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17335 entries, 0 to 17336
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              17335 non-null  object 
 1   created_at              17333 non-null  object 
 2   edit_history_tweet_ids  17333 non-null  object 
 3   id                      17333 non-null  float64
 4   text                    17333 non-null  object 
dtypes: float64(1), object(4)
memory usage: 812.6+ KB


Observation:
- We don't need the unnamed column, we will drop it.
- We dont need the edit_history_tweets_ids.
- We need to change the created_at coumn to a date type column.

### Drop Columns

In [11]:
tweets.drop("Unnamed: 0", axis=1, inplace=True)

In [12]:
tweets.drop("edit_history_tweet_ids", axis=1, inplace=True)

In [13]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17335 entries, 0 to 17336
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   created_at  17333 non-null  object 
 1   id          17333 non-null  float64
 2   text        17333 non-null  object 
dtypes: float64(1), object(2)
memory usage: 541.7+ KB


### Change Data Type

In [14]:
tweets['created_at']

0        2022-10-21 19:45:25+00:00
1        2022-10-21 19:45:25+00:00
2        2022-10-21 19:45:23+00:00
3        2022-10-21 19:45:20+00:00
4        2022-10-21 19:45:19+00:00
                   ...            
17332    2022-10-14 20:02:54+00:00
17333    2022-10-14 20:00:17+00:00
17334    2022-10-14 20:00:02+00:00
17335    2022-10-14 20:00:01+00:00
17336    2022-10-14 19:57:45+00:00
Name: created_at, Length: 17335, dtype: object

In [15]:
tweets['created_at'] = pd.to_datetime(tweets['created_at'])

In [16]:
tweets['created_at'] = pd.to_datetime(tweets['created_at']).dt.date

In [17]:
tweets['created_at']

0        2022-10-21
1        2022-10-21
2        2022-10-21
3        2022-10-21
4        2022-10-21
            ...    
17332    2022-10-14
17333    2022-10-14
17334    2022-10-14
17335    2022-10-14
17336    2022-10-14
Name: created_at, Length: 17335, dtype: object

In [18]:
tweets['created_at'].unique()

array([datetime.date(2022, 10, 21), datetime.date(2022, 10, 20),
       datetime.date(2022, 10, 19), datetime.date(2022, 10, 18),
       datetime.date(2022, 10, 17), datetime.date(2022, 10, 16),
       datetime.date(2022, 10, 15), NaT, datetime.date(2022, 10, 14)],
      dtype=object)

Observation:
- The data type for the variable created_at has been changed to datetime.
- The time in the column created_at has also been deleted, we only now have the date.

### Drop Missing Values

In [19]:
tweets.isnull().sum()

created_at    2
id            2
text          2
dtype: int64

Observation:
- There are only two missing dates in the data.
- We will drop these two rows.

In [20]:
tweets[tweets['created_at'].isnull()]

Unnamed: 0,created_at,id,text
8264,NaT,,
8265,NaT,,


In [21]:
tweets = tweets.dropna()

In [22]:
tweets.isnull().sum()

created_at    0
id            0
text          0
dtype: int64

Observation:
- We have no missing values.
- We have only the most important columns.
- We also changed the data type of created_at to date and removed the time.

## Text Pre-Processing 

In [23]:
#remove all non-aphabet characters
tweets['text'] = tweets['text'].str.replace("[^a-zA-Z#]", ' ', regex=True)

In [24]:
#covert to lower-case
tweets['text'] = tweets['text'].str.casefold()

In [25]:
tweets['text'].tail(10)

17327    nothing would make me happier than to see #eli...
17328    #voteredtosaveamerica #midterms     https   t ...
17329    the hs class of      scores lowest on act exam...
17330    my opponent s nickname  moscow on the sacramen...
17331    most republicans in congress  some  secretserv...
17332     shannonrwatts republicans don t care  they ne...
17333    my #midterms     winner picks part iii adam fr...
17334    mt  share your story  why should big tech be h...
17335    umfrage zur u s  senatswahl in #pennsylvania v...
17336    my #midterms     winner picks part ii sarah hu...
Name: text, dtype: object

In [26]:
tweets['text'] = tweets['text'].str.replace(r'#', '', regex=True)

In [27]:
tweets['text'].head(10)

0    biden touts  record  deficit reduction  critic...
1    sorry  i can t work on my midterms rn  i ve go...
2     anadooferreira  bernardokuster   nytimes midt...
3     cirincione  djrothkopf you lost the midterms ...
4     josephh          cnnpolitics i guess everyone...
5    dr  biden  wants women to know how high the st...
6    you can bet that since dems sense impending lo...
7     repadamschiff lying schiff pops his head up  ...
8    many voters are stupid and mean assholes  davi...
9     perpetualmaniac the midterms are coming and p...
Name: text, dtype: object

In [28]:
tweets.reset_index(drop=True)

Unnamed: 0,created_at,id,text
0,2022-10-21,1.583545e+18,biden touts record deficit reduction critic...
1,2022-10-21,1.583545e+18,sorry i can t work on my midterms rn i ve go...
2,2022-10-21,1.583545e+18,anadooferreira bernardokuster nytimes midt...
3,2022-10-21,1.583545e+18,cirincione djrothkopf you lost the midterms ...
4,2022-10-21,1.583545e+18,josephh cnnpolitics i guess everyone...
...,...,...,...
17328,2022-10-14,1.581013e+18,shannonrwatts republicans don t care they ne...
17329,2022-10-14,1.581012e+18,my midterms winner picks part iii adam fri...
17330,2022-10-14,1.581012e+18,mt share your story why should big tech be h...
17331,2022-10-14,1.581012e+18,umfrage zur u s senatswahl in pennsylvania vo...


## Create Clean Dataset CSV

In [30]:
tweets.to_csv("clean_mid_terms22.csv")