Notebook Setup

In [1]:
import pandas as pd
import numpy as np
import os
import re
import spacy
from IPython.display import clear_output

1. Load dataset

In [60]:
df = pd.read_csv('/content/spam.csv',encoding = 'cp1252')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [61]:
print(df.isnull().sum()/len(df)*100)
print(df.info)

v1             0.000000
v2             0.000000
Unnamed: 2    99.102656
Unnamed: 3    99.784637
Unnamed: 4    99.892319
dtype: float64
<bound method DataFrame.info of         v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will Ì_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN 

In [62]:
# Drop columns with missing values
df.drop(df.iloc[:, 2:5], axis=1, inplace=True)

In [63]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [64]:
# Let's create a column to label rows "ham": 0, "spam": 1
df = df[['v1','v2']].rename(columns={'v1':'label', 'v2':'text'})
df['label'] = df['label'].map({'ham':0, 'spam':1})

df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [75]:
df['label'].value_counts()/len(df['label'])*100

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,86.593683
1,13.406317


We have at least 13.41% of spam messages in our dataset.

We will also keep text in cased shape as it might be
an indicator for SPAM, especially promotion scams or other types of Spams.

#Preprocessing

In [77]:
nlp = spacy.load('en_core_web_sm')
from spacy.lang.en.stop_words import STOP_WORDS

In [83]:
def preprocess_text(text):
  text = re.sub(r'[^a-zA-Z0-9 ]','',text.strip())
  return ' '.join([token.lemma_ for token in nlp(text) if (token.lemma_ not in STOP_WORDS) & (token.text not in STOP_WORDS)])

df["text"] = df["text"].apply(preprocess_text)

In [85]:
df.head()

Unnamed: 0,label,text
0,0,jurong point crazy available bugis n great wor...
1,0,ok lar Joking wif u oni
2,1,free entry 2 wkly comp win FA Cup final tkts 2...
3,0,u dun early hor u c
4,0,nah I think usf live


In [84]:
df.to_csv('/content/data_clean.csv', index=False)