# Spam Mail Prediction

### Importing Dependencies

In [49]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Data Collection & Preprocessing

In [50]:
# Read the raw mail dataset from the CSV file into a DataFrame
raw_mail_data = pd.read_csv('mail_data.csv')

# Keep every non-null cell as-is and substitute an empty string for each null cell;
# the raw frame is left untouched and the cleaned copy is bound to a new name
has_value = raw_mail_data.notna()
mail_data = raw_mail_data.where(has_value, '')

### Label Encoding

In [51]:
# Encode the target column in place: spam -> 0, ham -> 1
is_spam = mail_data['Category'] == 'spam'
is_ham = mail_data['Category'] == 'ham'
mail_data.loc[is_spam, 'Category'] = 0
mail_data.loc[is_ham, 'Category'] = 1

# Split the frame into the message texts (X) and their encoded labels (y)
X, y = mail_data['Message'], mail_data['Category']

### Splitting the data into training data and test data

In [52]:
# Hold out 20% of the messages for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

### Feature Extraction

In [56]:
# Turn the message texts into TF-IDF feature vectors for the Logistic Regression model.
# The vectorizer is fitted on the training messages only; the test messages are merely
# transformed with that already-fitted vectorizer.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast the training and test labels to integer dtype
y_train, y_test = y_train.astype('int'), y_test.astype('int')

### Training the model

In [57]:
# Create a logistic regression classifier with no constructor arguments overridden
model = LogisticRegression()

# Fit the classifier on the training feature vectors and their labels
model.fit(X=X_train_features, y=y_train)

LogisticRegression()

### Evaluating the trained model

In [58]:
# Predict labels for both splits with the fitted model
prediction_on_training_data = model.predict(X_train_features)
prediction_on_testing_data = model.predict(X_test_features)

# Score each set of predictions against the true labels of its split
accuracy_on_training_data = accuracy_score(y_true=y_train, y_pred=prediction_on_training_data)
accuracy_on_testing_data = accuracy_score(y_true=y_test, y_pred=prediction_on_testing_data)

# Report training accuracy first, then testing accuracy
print('Accuracy on training data: ', accuracy_on_training_data)
print('Accuracy on testing data: ', accuracy_on_testing_data)

Accuracy on training data:  0.9670181736594121
Accuracy on testing data:  0.9659192825112107


### Building a predictive system

In [82]:
# Ask the user for a single message to classify
input_msg = input('Enter the message you want to predict:')
# The vectorizer's transform takes a collection of documents, so wrap the one message in a list
input_msg = [input_msg]

# Convert text to feature vectors with the vectorizer fitted on the training messages
input_msg_features = feature_extraction.transform(input_msg)

# Making prediction; the result holds one label per input message (here exactly one)
ans = model.predict(input_msg_features)

# Index the single prediction explicitly instead of relying on the truth value of an
# array comparison, which only works while the array has exactly one element.
# Label 1 means ham; anything else is reported as spam.
if ans[0] == 1:
    print('Ham mail')
else:
    print('Spam mail')

Enter the message you want to predict:PLS STOP bootydelious (32/F) is inviting you to be her friend. Reply YES-434 or NO-434 See her: www.SMS.ac/u/bootydelious STOP? Send STOP FRND to 62468
Spam mail
