# **Data Preprocessing and EDA**

## **Data Preprocessing**

In [1]:
# importing all required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load the Reddit comments dataset straight from the project's GitHub repository
DATA_URL = 'https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv'

df = pd.read_csv(DATA_URL)
# preview the first rows to confirm the columns loaded as expected
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [None]:
# shape of the dataset as (rows, columns)
df.shape

(37249, 2)

The dataset has 37,249 rows and two columns: `clean_comment` and `category`.

In [None]:
# summary of the dataset: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37249 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  37149 non-null  object
 1   category       37249 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 582.1+ KB


In [6]:
# count the null values in each column
df.isna().sum()

clean_comment    100
category           0
dtype: int64

In [8]:
# percentage of null values per column (mean of the boolean mask is the null fraction)
df.isna().mean() * 100

clean_comment    0.268464
category         0.000000
dtype: float64

Only 0.27% of the values in the `clean_comment` column are null, so we can safely drop those rows.

In [9]:
# inspect the rows whose comment text is missing
missing_comment = df['clean_comment'].isna()
df[missing_comment]

Unnamed: 0,clean_comment,category
413,,0
605,,0
2422,,0
2877,,0
3307,,0
...,...,...
35975,,0
36036,,0
37043,,0
37111,,0


In [None]:
# drop every row that contains a null value, rebinding df rather than mutating it in place
df = df.dropna()

In [None]:
# verify that no null values remain after the drop
df.isna().sum()

clean_comment    0
category         0
dtype: int64

In [None]:
# count the duplicated rows in the dataset
df.duplicated().sum()

np.int64(350)

There are 350 duplicate rows in our dataset, and we will have to remove them.

In [14]:
# remove the duplicate rows, rebinding df rather than mutating it in place
df = df.drop_duplicates()

In [15]:
# verify that no duplicate rows remain after the drop
df.duplicated().sum()

np.int64(0)

In [16]:
# find comments that are empty once surrounding whitespace is stripped
is_blank = df['clean_comment'].str.strip().eq('')
df[is_blank]

Unnamed: 0,clean_comment,category
181,,0
4432,\n,0
10592,,0
16173,,0
32149,\n,0
34959,,0


In [18]:
# these blank comments carry no text, so remove them as well;
# .copy() makes df an independent frame, so the column assignments that follow
# do not write into a slice of the previous frame (avoids SettingWithCopyWarning)
df = df[df['clean_comment'].str.strip().ne('')].copy()

In [None]:
# normalise the text: strip leading/trailing whitespace (inner spaces are kept)
# and lowercase everything so that later comparisons are not case sensitive;
# assign() returns a new frame instead of writing into a previously filtered one
df = df.assign(clean_comment=df['clean_comment'].str.strip().str.lower())

In [26]:
# sanity check: a comment that differs from its stripped form still has leading or
# trailing whitespace (spaces, tabs or newlines, not only ' ') - expect no rows
df[df['clean_comment'] != df['clean_comment'].str.strip()]

Unnamed: 0,clean_comment,category


So no comment has leading or trailing spaces.

In [None]:
# checking for comments containing urls
# the hyphen is escaped so `$-_` is matched as three literal characters rather than
# being read as the character range 0x24-0x5F; one character after the scheme is
# enough for this containment check
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
df['clean_comment'].str.contains(url_pattern, regex=True).sum()

np.int64(0)

No comment contains a URL.

In [31]:
# count the comments that contain a newline character
df['clean_comment'].str.contains('\n').sum()

np.int64(204)

There are 204 comments containing a newline character; we will replace each newline with a space.

In [32]:
# swap every newline for a single space (literal match, so no regex is needed)
df['clean_comment'] = df['clean_comment'].str.replace('\n', ' ', regex=False)

In [33]:
# verify that no comment contains a newline character after the replacement
df['clean_comment'].str.contains('\n').sum()

np.int64(0)