## Spam Mail Detection

### Installing the important libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Data collection and Preprocessing

In [3]:
data=pd.read_csv('mail_data.csv')

In [4]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.dtypes

Category    object
Message     object
dtype: object

In [6]:
data.isnull().sum()

Category    0
Message     0
dtype: int64

In [7]:
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [8]:
data.count()

Category    5572
Message     5572
dtype: int64

In [9]:
data.shape

(5572, 2)

In [10]:
data.describe

<bound method NDFrame.describe of      Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>

### Label Encoding

### Label spam mail as 0; and non-spam mail as 1

In [11]:
data.loc[data['Category']=='spam','Category',]=0

In [12]:
data.loc[data['Category']=='ham','Category',]=1

In [13]:
data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
x=data['Message']
y=data['Category']

### Splitting the Data

In [20]:
x_train,x_test,y_train,y_test= train_test_split(x,y,random_state=3,test_size=0.2)

In [21]:
x_test.shape

(1115,)

In [22]:
x_train.shape

(4457,)

In [23]:
y_train.shape

(4457,)

In [24]:
y_test.shape

(1115,)

### Feature Extraction

### Transforming the data into feature vectors that can be used as input to logistics regression model

In [25]:
feature_extraction=TfidfVectorizer(min_df=1,lowercase=True,stop_words='english')

In [26]:
feature_extraction

TfidfVectorizer(stop_words='english')

In [30]:
x_train_features=feature_extraction.fit_transform(x_train)
x_test_features=feature_extraction.transform(x_test)

In [31]:
x_train_features

<4457x7458 sparse matrix of type '<class 'numpy.float64'>'
	with 34768 stored elements in Compressed Sparse Row format>

In [32]:
x_test_features

<1115x7458 sparse matrix of type '<class 'numpy.float64'>'
	with 7728 stored elements in Compressed Sparse Row format>

In [33]:
y_test=y_test.astype('int')
y_train=y_train.astype('int')

### Training the model

In [34]:
model=LogisticRegression()

In [35]:
model.fit(x_train_features,y_train)

LogisticRegression()

### Evaluating the trained model

In [39]:
Predict_training_data=model.predict(x_train_features)

In [40]:
Predict_training_data

array([1, 1, 1, ..., 0, 1, 1])

In [41]:
accuracy_tarining=accuracy_score(y_train,Predict_training_data)

In [42]:
accuracy_tarining

0.9683643706529056

In [43]:
Predict_test_data=model.predict(x_test_features)

In [44]:
accuracy_test=accuracy_score(y_test,Predict_test_data)

In [45]:
accuracy_test

0.9524663677130045