## Imports
Import the libraries used throughout this notebook.

In [256]:
import pandas as pd
import nltk
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import re

In [257]:
# Load the tab-separated SMS corpus, supplying the column names explicitly.
# NOTE: the label column is spelled 'labes'; later cells look it up by that exact name.
data = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['labes', 'message'],
)
data.head()

Unnamed: 0,labes,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [258]:
# Summarise column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   labes    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


There are no null values in the data.

### Cleaning the data

First we need to clean the data:
- Lowercase the text
- Remove punctuation and other non-letter characters
- Remove stop words
- Apply stemming

In [259]:
# Porter stemmer instance shared by the text-cleaning step.
ps = PorterStemmer()

# English stop-word list from the NLTK stopwords corpus
# (the corpus must already be available locally, e.g. via nltk.download('stopwords')).
stop_words = stopwords.words('english')

In [260]:
def clean_data(text):
    """Normalise one message into a space-separated string of stemmed tokens.

    Processing steps, in order:
      1. Replace every character that is not an ASCII letter with a space
         (this removes punctuation and digits).
      2. Lower-case the text.
      3. Split on whitespace and drop English stop words.
      4. Stem each remaining word with the module-level Porter stemmer ``ps``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    # Use the stop-word list that was loaded once at module level instead of
    # re-reading the NLTK corpus for every single token. A set gives O(1)
    # membership checks.
    stop_word_set = set(stop_words)

    # Removing punctuation and other unnecessary (non-letter) characters
    letters_only = re.sub('[^a-zA-Z]', ' ', text)

    # Lower-casing before the stop-word check, since the stop-word list is lower-case
    tokens = letters_only.lower().split()

    # Removing stop words and applying stemming
    return ' '.join(ps.stem(token) for token in tokens if token not in stop_word_set)
    

In [261]:
# Clean every message with clean_data.
# NOTE: this overwrites the raw 'message' column in place, so re-running this cell
# without reloading the data applies clean_data to already-cleaned text.
data['message'] = data['message'].apply(clean_data)

### Splitting data
Now we need to separate the features and the target from the data, and then split them into training and test sets.

In [262]:
# Features are the cleaned message texts; the target is the label column ('labes').
X, y = data['message'], data['labes']

In [293]:
# Hold out 33% of the messages for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [294]:
# Report the size of each split to confirm features and labels line up.
for caption, part in (
    ('shape of X_train is:', X_train),
    ('shape of X_test is:', X_test),
    ('\nShape of y_train is:', y_train),
    ('Shape of y_test is:', y_test),
):
    print(caption, part.shape)

shape of X_train is: (3733,)
shape of X_test is: (1839,)

Shape of y_train is: (3733,)
Shape of y_test is: (1839,)


### Extracting features 

Now we need to extract numeric features from the text (bag-of-words token counts via `CountVectorizer`) and feed them to the model for training.

In [295]:
# Encoder that turns the string labels into integer class ids.
le = LabelEncoder()

# Bag-of-words vectorizer that turns each message into token counts.
count_vectorizer = CountVectorizer()

In [297]:
# Labels: learn the class -> integer mapping on the training labels only,
# then reuse that mapping for the test labels.
# NOTE: y_train / y_test are overwritten with their encoded versions.
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Text: learn the vocabulary on the training messages only, then project the
# test messages onto that same vocabulary.
X_train_features = count_vectorizer.fit_transform(X_train)
X_test_features = count_vectorizer.transform(X_test)

### Training model
We will use logistic regression, which is a strong baseline for binary classification. If the results are not good, we will try other models.

In [283]:
# Logistic regression with scikit-learn's default hyper-parameters,
# trained on the bag-of-words counts and the encoded labels.
model = LogisticRegression()
model.fit(X_train_features,y_train)

LogisticRegression()

### Evaluating the model

In [284]:
# Predict on both splits so training and held-out accuracy can be compared.
train_pred, test_pred = (
    model.predict(features)
    for features in (X_train_features, X_test_features)
)

In [285]:
# Fraction of training messages whose predicted label matches the true label.
accuracy_score(y_train,train_pred)

0.996249665148674

In [286]:
# Fraction of held-out test messages whose predicted label matches the true label.
accuracy_score(y_test,test_pred)

0.9831430125067971

### Test your own message

In [310]:
# Message to classify. This sample looks like already-stemmed text, but any raw
# message can be used because clean_data is applied before vectorizing.
message = 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli'

# Apply the same cleaning and the already-fitted vocabulary used for training.
# The result is kept under its own name so `message` still holds the input text.
message_features = count_vectorizer.transform([clean_data(message)])

# predict returns one entry per input row; exactly one message was passed in.
pred = model.predict(message_features)

# NOTE(review): assumes the label encoder mapped 'ham' -> 0 and 'spam' -> 1
# (LabelEncoder orders classes by sorted value) - confirm via le.classes_.
if pred[0] == 0:
    print('This is not a spam message')
else:
    print('This is a spam message')

This is a spam message
