In [1]:
# dependencies
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import TfidfVectorizer  #for text reading and extraction
# to convert mail data(text data) into numerical values
from sklearn.metrics import accuracy_score  #used to evaluate our model.. how well our model is performing
from sklearn.preprocessing import LabelEncoder

In [2]:
df=pd.read_csv("mail_data.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df=df.where((pd.notnull(df)),'')  #to fill empty string in place of null values in message column.

In [4]:
df.shape #total no of rows and column..

(5572, 2)

In [5]:
# label encoding
le=LabelEncoder()
df['Category']=le.fit_transform(df['Category'])
# spam=1
# ham=0

In [6]:
df.head()
# feature extraction
# transform the text data to feature vector that can be used as input in logestic regression model
feature_extraction=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

In [7]:
x=feature_extraction.fit_transform(df['Message'])
y=df['Category']
y=y.astype('int') #converting y to int

In [8]:
x_train,x_test,y_train,y_test=tts(x,y,test_size=0.2,random_state=104,shuffle=True)

In [9]:
# training logistic regression model
lr=LogisticRegression()
lr.fit(x_train,y_train)

LogisticRegression()

In [10]:
# predicting from model
array=lr.predict(x_train)

In [11]:
accuracy=accuracy_score(y_train,array)
accuracy

0.9694862014808167

In [12]:
lr.score(x_test,y_test)

0.9497757847533632

In [13]:
# building a predictive system
mails=["Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030"]
find=feature_extraction.transform(mails) #here we are using transform insted of fit_traanform because we dont want it to fit.

In [152]:
print(lr.predict(find))

[1]
