# This notebook preprocesses the data and explores it

## 1. Importing the necessary libraries

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## 2. Loading the data

In [2]:
# Read the raw train and test splits; both paths are relative to the working directory.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

## 3. Exploring the data

In [3]:
# Preview the first five rows to see the columns and typical values.
df_train.head(n=5)

Unnamed: 0,ID,title,text,subject,date,class
0,0.0,#AfterTrumpImplodes Hashtag Hilariously Imagi...,What will the world be like post-Donald Trump?...,News,5-Aug-16,0.0
1,1.0,#BlackLivesMatter Leader To Run For Mayor Of ...,The police shooting of black teen Michael Brow...,News,4-Feb-16,0.0
2,2.0,#BringBackObama Hashtag Blows Up On Twitter A...,The six months since President Donald Trump wa...,News,13-Jul-17,0.0
3,3.0,#FreeChrisChristie: Twitter Reacts To The ‘Ho...,"Last Friday, New Jersey Governor Chris Christi...",News,2-Mar-16,0.0
4,4.0,#MakeAmericaBrannigan: Futurama Voice Actor R...,"The incredibly talented voice actor, Billy Wes...",News,13-Aug-16,0.0


In [4]:
# Summary statistics; the recorded output covers only the numeric ID and class columns.
df_train.describe()

Unnamed: 0,ID,class
count,40406.0,40414.0
mean,20202.5,0.473672
std,11664.351825,0.499313
min,0.0,0.0
25%,10101.25,0.0
50%,20202.5,0.0
75%,30303.75,1.0
max,40405.0,1.0


In [5]:
# Column dtypes and non-null counts; compare each count with the total
# number of entries to see which columns have missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44919 entries, 0 to 44918
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ID       40406 non-null  float64
 1   title    40427 non-null  object 
 2   text     40427 non-null  object 
 3   subject  40414 non-null  object 
 4   date     40414 non-null  object 
 5   class    40414 non-null  float64
dtypes: float64(2), object(4)
memory usage: 2.1+ MB


## 4. Preprocessing the data

### 4.1. Handling duplicates

In [6]:
# Collect every row that exactly repeats an earlier row.
# With keep='first' the first occurrence of each repeated row is not flagged.
duplicated_rows = df_train.loc[df_train.duplicated(keep='first')]
print(duplicated_rows)

       ID title text subject date  class
40428 NaN   NaN  NaN     NaN  NaN    NaN
40429 NaN   NaN  NaN     NaN  NaN    NaN
40430 NaN   NaN  NaN     NaN  NaN    NaN
40431 NaN   NaN  NaN     NaN  NaN    NaN
40432 NaN   NaN  NaN     NaN  NaN    NaN
...    ..   ...  ...     ...  ...    ...
44914 NaN   NaN  NaN     NaN  NaN    NaN
44915 NaN   NaN  NaN     NaN  NaN    NaN
44916 NaN   NaN  NaN     NaN  NaN    NaN
44917 NaN   NaN  NaN     NaN  NaN    NaN
44918 NaN   NaN  NaN     NaN  NaN    NaN

[4491 rows x 6 columns]


In [7]:
# Remove the repeated rows, keeping the first occurrence of each.
# Because the first copy is kept, one of the all-NaN rows survives this step.
df_train = df_train.drop_duplicates(keep='first')

### 4.2. Handling missing values

In [8]:
# Count the missing values remaining in each column.
df_train.isna().sum()

ID         22
title       1
text        1
subject    14
date       14
class      14
dtype: int64

#### 4.2.1. Rows where every value is missing

In [9]:
# A row with no value in any column carries no information, so remove it,
# then re-check the per-column missing counts.
df_train = df_train.dropna(axis='index', how='all')
print(df_train.isna().sum())

ID         21
title       0
text        0
subject    13
date       13
class      13
dtype: int64


#### 4.2.2. class

In [10]:
# Discard rows whose target column ('class') is missing, then re-check the
# per-column missing counts.
df_train = df_train.dropna(axis='index', subset=['class'], how='any')
print(df_train.isna().sum())

ID         21
title       0
text        0
subject     0
date        0
class       0
dtype: int64


#### 4.2.3. ID

In [11]:
# Replace the whole ID column with the row index labels. This overwrites every
# ID (not only the missing ones), which is why no NaN remains afterwards.
# NOTE(review): rows were dropped above without resetting the index, so these
# IDs are the original row labels and may not be consecutive — confirm intended.
df_train = df_train.assign(ID=df_train.index)
print(df_train.isna().sum())

ID         0
title      0
text       0
subject    0
date       0
class      0
dtype: int64


### 4.3. Handling date data

In [16]:
# The publication date is removed from both splits on the assumption that it
# does not indicate whether a news article is fake.
# NOTE(review): df_test receives no other cleaning in the cells visible here —
# confirm that is intended.
df_train = df_train.drop('date', axis='columns')
df_test = df_test.drop('date', axis='columns')

### 4.4. Saving the preprocessed data

In [17]:
# Save the cleaned splits as CSV files.
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists first; otherwise this cell fails on a fresh checkout.
output_dir = Path('preprocessed_data')
output_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the pandas row index out of the written files.
df_train.to_csv(output_dir / 'train_cleaned.csv', index=False)
df_test.to_csv(output_dir / 'test_cleaned.csv', index=False)