<a href="https://colab.research.google.com/github/ahmadsakir/Kaggle-Notebooks/blob/main/Spam_SMS_Classification_97%25_.%20ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [10]:
df = pd.read_csv('/content/drive/MyDrive/upload/Spam_SMS.csv')
df.head()

Unnamed: 0,Class,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5574 entries, 0 to 5573
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Class    5574 non-null   object
 1   Message  5574 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [12]:
df['label'] = df['Class']

In [13]:
df.drop('Class', axis=1, inplace=True)

In [14]:
df['label'] = df['label'].map({'ham':0, 'spam':1})

In [15]:
df.head()

Unnamed: 0,Message,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [16]:
! pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [19]:
import re
import spacy
import emoji
import string

def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)   # Remove URLs
    text = re.sub(r'@\w+', '', text)  # remove mentions
    text = re.sub(r'#', '', text)   # remove hashtags
    text = re.sub(r'\S+@\S+', '', text)   # remove emails
    text = re.sub(r'&\w+;', '', text)    # remove html's
    text = emoji.replace_emoji(text, replace='')    # remove emojis
    text = text.translate(str.maketrans('', '', string.punctuation))   # remove punctuations
    text = re.sub(r'\s+', ' ', text).strip()   # remove extra spaces
    text = re.sub(r'\d+', '', text)   # remove digits
    return text

df['CleanMsg'] = df['Message'].apply(clean_text)

In [20]:
nlp = spacy.load('en_core_web_sm')

def lemmatize_txt(text):

  doc = nlp(text)
  lemmas = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

  return " ".join(lemmas)

df['Msg'] = df['CleanMsg'].apply(lemmatize_txt)

In [21]:
df.drop(['Message', 'CleanMsg'], axis=1, inplace=True)

In [22]:
df.head(10)

Unnamed: 0,label,Msg
0,0,jurong point crazy available bugis n great wor...
1,0,ok lar Joking wif u oni
2,1,free entry wkly comp win FA Cup final tkts st ...
3,0,u dun early hor u c
4,0,Nah not think go usf live
5,1,FreeMsg hey darle week word d like fun Tb ok X...
6,0,brother like speak treat like aids patent
7,0,request Melle Melle Oru Minnaminunginte Nurung...
8,1,WINNER value network customer select receivea ...
9,1,mobile month u r entitle update late colour mo...


In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Msg'], df['label'], test_size=0.2, random_state=42, stratify=df['label'])

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)

X_test_tfidf = tfidf.transform(X_test)

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train)

In [26]:
y_pred = lr.predict(X_test_tfidf)

In [27]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.9730941704035875
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       0.99      0.81      0.89       149

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115

[[965   1]
 [ 29 120]]


In [28]:
# Example prediction
def predict_spam(text):
    cleaned = clean_text(text)
    lemmatized = lemmatize_txt(cleaned)
    X = tfidf.transform([lemmatized])
    pred = lr.predict(X)[0]
    return 'spam' if pred == 1 else 'ham'

print(predict_spam("Congratulations! You've won £1000!"))

ham
