In [1]:
import numpy as np
import pandas as pd

In [2]:
# Candidate encodings for the CSV, tried in order until one decodes.
# 'latin-1' maps every possible byte to a character, so it can never raise
# UnicodeDecodeError; it therefore has to stay last as the catch-all fallback.
# 'ISO-8859-1' was removed from the list: it is an alias of the same codec, so
# that third attempt could never be reached.
# NOTE(review): the sample output further down contains sequences such as
# 'åÕ', so latin-1 may not be the file's true encoding — confirm.
encodings_to_try = ['utf-8', 'latin-1']

In [3]:
# Load spam.csv using the first candidate encoding that decodes cleanly.
# No placeholder value is assigned to `df` up front: either one attempt
# succeeds and binds a DataFrame, or the cell raises.
for encoding in encodings_to_try:
    try:
        df = pd.read_csv("spam.csv", encoding=encoding)
    except UnicodeDecodeError:
        print(f"Failed to read with encoding: {encoding}")
    else:
        print(f"Successfully read the file with encoding: {encoding}")
        break  # Stop trying encodings once successful
else:
    # The loop ran out of candidates without hitting `break`. Fail here with a
    # clear message rather than letting later cells trip over a non-DataFrame.
    raise ValueError(
        f"Could not decode spam.csv with any of: {encodings_to_try}"
    )


Failed to read with encoding: utf-8
Successfully read the file with encoding: latin-1


In [4]:
# Peek at five randomly chosen rows of the loaded frame.
# No random_state is passed, so the rows shown differ from run to run.
df.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
2213,ham,"Goodmorning, today i am late for 2hrs. Because...",,,
3498,ham,I hope you arnt pissed off but id would really...,,,
904,ham,"We're all getting worried over here, derek and...",,,
3513,ham,I always chat with you. In fact i need money c...,,,
180,ham,You lifted my hopes with the offer of money. I...,,,


In [5]:
# (rows, columns) of the frame as loaded.
df.shape

(5572, 5)

## 1. Data Cleaning

In [6]:
# Summarise column names, dtypes and non-null counts before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [7]:
# Drop the three trailing "Unnamed" columns; df.info() above shows they are
# almost entirely empty (50, 12 and 6 non-null values out of 5572 rows).
# Reassign instead of using inplace=True so the cell builds a new frame rather
# than mutating the object that earlier cells already displayed.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])

In [8]:
# Spot-check five random rows after the column drop (unseeded, so the rows vary per run).
df.sample(5)

Unnamed: 0,v1,v2
1230,ham,I want to send something that can sell fast. ...
4915,ham,"Just dropped em off, omw back now"
2169,spam,"Shop till u Drop, IS IT YOU, either 10K, 5K, å..."
5074,spam,"This is the 2nd attempt to contract U, you hav..."
18,ham,Fine if thatåÕs the way u feel. ThatåÕs the wa...


In [9]:
# Give the columns descriptive names: v1 holds the ham/spam label and v2 the
# message text. Reassign instead of using inplace=True so the frame shown by
# earlier cells is not mutated underneath them.
df = df.rename(columns={"v1": "target", "v2": "text"})

In [10]:
# Spot-check five random rows to confirm the new column names (unseeded sample).
df.sample(5)

Unnamed: 0,target,text
4645,spam,We tried to contact you re your reply to our o...
4475,ham,I'll pick you up at about 5.15pm to go to taun...
5188,ham,Okie
4454,ham,"Storming msg: Wen u lift d phne, u say \HELLO\..."
4220,ham,Plz note: if anyone calling from a mobile Co. ...


In [11]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell at the top so all dependencies are visible in one place.
from sklearn.preprocessing import LabelEncoder
# Encoder instance; the next cell fits it on the `target` column.
encoder = LabelEncoder()

In [12]:
# Replace the string labels in `target` with the integer codes produced by the encoder.
# The df.head() output below shows 0/1 codes; presumably ham -> 0 and spam -> 1
# (check encoder.classes_ to confirm the mapping).
df["target"] = encoder.fit_transform(df["target"])

In [13]:
# First five rows, to confirm `target` now holds integer codes.
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Count missing values in each column.
df.isnull().sum()

target    0
text      0
dtype: int64

In [15]:
# Count rows flagged as duplicates of another row.
df.duplicated().sum()

403

In [16]:
# Remove duplicate rows, keeping the first occurrence of each.
# NOTE(review): the index is not reset here, so row labels presumably keep gaps
# where duplicates were removed — confirm later cells do not assume a contiguous index.
df = df.drop_duplicates(keep="first")

In [17]:
# (rows, columns) after de-duplication.
df.shape

(5169, 2)

In [18]:
# Re-check: the duplicate count should now be zero.
df.duplicated().sum()

0

## 2. EDA (Exploratory Data Analysis)