## Importing the dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Loading the data

In [2]:
# Read the raw mail corpus from a CSV path that is relative to the notebook's
# working directory, then preview the first rows as the cell output.
MAIL_DATA_PATH = 'Datasets/mail_data.csv'
raw_mail_data = pd.read_csv(MAIL_DATA_PATH)
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Replace every missing cell with an empty string. `mask` substitutes the value
# wherever the condition is True and returns a new frame, so `raw_mail_data`
# itself is left unchanged.
missing_cells = raw_mail_data.isna()
mail_data = raw_mail_data.mask(missing_cells, '')
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Dimensions of the cleaned frame as a (rows, columns) tuple.
mail_data.shape

(5572, 2)

In [5]:
# Print a per-column summary: non-null counts, dtypes and memory usage.
mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
# Count the missing values left in each column after the fill step.
mail_data.isna().sum()

Category    0
Message     0
dtype: int64

In [7]:
# Number of distinct message texts; anything below the row count means duplicates.
mail_data['Message'].nunique()

5157

In [8]:
# Class balance: how many rows carry each Category label.
mail_data['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

## Label Encoding

`Spam - 0` `Ham - 1`

In [9]:
# Overwrite the text labels in place with integer codes: spam -> 0, ham -> 1.
# Rows whose Category matches neither label are left as they are.
for label, code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label, 'Category'] = code
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Model inputs are the message texts; targets are the encoded Category labels.
X, y = mail_data['Message'], mail_data['Category']

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

In [12]:
# Shapes of the full input Series, the training portion and the held-out portion.
tuple(part.shape for part in (X, X_train, X_test))

((5572,), (4457,), (1115,))

## Feature Extraction

In [13]:
# Transform text data to TF-IDF feature vectors.
#   min_df=1              keep every term that appears in at least one document
#   stop_words='english'  drop scikit-learn's built-in English stop-word list
#   lowercase=True        convert text to lower case before tokenizing
# `lowercase` must be a real boolean. The string 'True' only worked because any
# non-empty string is truthy, and scikit-learn releases that validate estimator
# parameters reject it when the vectorizer is fitted.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [14]:
# Fit the vectorizer on the training messages only and vectorize them in the
# same call, then apply that already-fitted vectorizer to the held-out
# messages. The test set is never passed to a fitting method here.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [15]:
# Cast both label Series to an integer dtype before model fitting.
y_train, y_test = (labels.astype('int') for labels in (y_train, y_test))

## Model Training

In [16]:
# Logistic regression with scikit-learn's default hyper-parameters, trained on
# the TF-IDF matrix of the training messages and their integer labels.
classifier = LogisticRegression()
classifier.fit(X_train_features, y_train)

## Model Evaluation

In [17]:
# Score the fitted classifier on the same rows it was trained on.
train_pred = classifier.predict(X_train_features)
train_accuracy = accuracy_score(y_true=y_train, y_pred=train_pred)
print('Accuracy of training data: ', train_accuracy)

Accuracy of training data:  0.9670181736594121


In [18]:
# Score the fitted classifier on the held-out rows.
test_pred = classifier.predict(X_test_features)
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_pred)
print('Accuracy of test data: ', test_accuracy)

Accuracy of test data:  0.9659192825112107


## Building a predictive system

In [19]:
# Classify one hand-written message: vectorize it with the already-fitted
# vectorizer, then ask the trained classifier for its label.
input_data = ['Hello I am Aditi. Where are you these days. Come home soon.']
input_data_features = feature_extraction.transform(input_data)
prediction = classifier.predict(input_data_features)
print(prediction)
# Label 0 is reported as spam; any other predicted label is reported as ham.
print('Spam' if prediction[0] == 0 else 'Ham')

[1]
Ham
