# 1. Import the dataset into a pandas DataFrame

In [129]:
import pandas as pd

In [130]:
df = pd.read_csv('/content/spam.csv',encoding='latin-1')

# 2. Print the shape of the dataset

In [131]:
df.shape

(5572, 5)

# 3. Observe the 1st few rows of the dataset

In [132]:
df.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


# 4. Remove any null values

In [133]:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [134]:
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)
df.isnull().sum()

v1    0
v2    0
dtype: int64

# 5. Rename the columns v1 and v2 appropriately

In [135]:
df.rename(columns={'v1':'label','v2':'text'},inplace=True)
df.sample(10)

Unnamed: 0,label,text
1698,spam,"Free msg. Sorry, a service you ordered from 81..."
5425,ham,Otherwise had part time job na-tuition..
4405,spam,As one of our registered subscribers u can ent...
2325,ham,Apps class varaya elaya.
498,ham,"Kate jackson rec center before 7ish, right?"
151,ham,Yup i thk cine is better cos no need 2 go down...
4430,ham,2mro i am not coming to gym machan. Goodnight.
5399,ham,And he's apparently bffs with carly quick now
3057,spam,You are now unsubscribed all services. Get ton...
1309,ham,"Ok, be careful ! Don't text and drive !"


#Applying label ecoder to the labels

In [136]:
from sklearn.preprocessing import LabelEncoder

In [137]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])

# 6. Convert text of all sms to lowercase, remove urls, remove punctuation marks and special characters, remove stopwords

In [139]:
X= df['text']

In [140]:
# Removing URLs
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = url_pattern.sub(r'', text)
    return text

In [141]:
# Removing  Punctuation Marks/Special Characters
import re

def remove_punctuation_and_special_chars(text):

    # Remove special characters
    text = re.sub('[^A-Za-z0-9 ]+', '', text)

    return text

In [142]:
import nltk

In [143]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [144]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [145]:
# Imports the nltk's prebuilt tokenizers
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [146]:
def text_cleaning(text):
  text = text.lower()
  text = remove_punctuation_and_special_chars(text)
  text = remove_urls(text)
  words= word_tokenize(text)
  filtered_text= [word for word in words if word.lower() not in stopwords.words('english')]
  text = ' '.join(filtered_text)
  return text


X= X.apply(text_cleaning)
X[:5]

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry 2 wkly comp win fa cup final tkts 2...
3                  u dun say early hor u c already say
4          nah dont think goes usf lives around though
Name: text, dtype: object

# 7. Reduce text to their lemma form

In [147]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer() ## Create object for lemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [148]:
# Function to lemmatize a single text message
def lemmatize_text(text):
    # Tokenize the text into words
    words = word_tokenize(text)
    # Lemmatize each word
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    # Join the lemmatized words back into a single string
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text


In [149]:
df['text_lemmatized'] = df['text'].apply(lemmatize_text)

In [150]:
df.sample(5)

Unnamed: 0,label,text,text_lemmatized
2564,ham,"Under the sea, there lays a rock. In the rock,...","Under the sea , there lay a rock . In the rock..."
4024,ham,&lt;#&gt; in mca. But not conform.,& lt ; # & gt ; in mca . But not conform .
4098,ham,If u dun drive then how i go 2 sch.,If u dun drive then how i go 2 sch .
4146,ham,Pls help me tell sura that i'm expecting a bat...,Pls help me tell sura that i 'm expecting a ba...
1394,ham,R we still meeting 4 dinner tonight?,R we still meeting 4 dinner tonight ?


# 8. Transform the messages into Vectors using TF-IDF

In [158]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [159]:
tf_idf= TfidfVectorizer()
transformed_tf_idf= tf_idf.fit_transform(X)

In [160]:
transformed_tf_idf.toarray()[2,100:200]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [161]:
transformed_tf_idf = transformed_tf_idf.toarray()

# 9. Split the data into train and test sets for each of the vector representation.
# Test set should be 20% of the entire dataset.

In [162]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(transformed_tf_idf,y,test_size=0.2, random_state=51)

# 10. Use any two sklearn models for classification and generate their classification reports

In [166]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report
NB_clf = MultinomialNB()
NB_clf.fit(X_train, Y_train)

# Predicting on the test set
y_pred = NB_clf.predict(X_test)

# Evaluating the model
print("Accuracy:", accuracy_score(Y_test, y_pred))
print("Classification Report:")
print(classification_report(Y_test, y_pred, target_names=label_encoder.classes_))

Accuracy: 0.9632286995515695
Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       962
        spam       1.00      0.73      0.85       153

    accuracy                           0.96      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [167]:
from sklearn.svm import SVC
svc_clf = SVC()
svc_clf.fit(X_train,Y_train)
svc_clf.score(X_test,Y_test)

0.968609865470852