In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [36]:
raw_mail_data = pd.read_csv('./mail_data.csv')

In [37]:
df = raw_mail_data.where((pd.notnull(raw_mail_data)),'') #replacing null values with null string

In [38]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [39]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [40]:
df.shape

(5572, 2)

In [41]:
#label encoding
df.loc[df['Category'] == 'spam', 'Category',] = 0
df.loc[df['Category'] == 'ham', 'Category',] = 1

In [42]:
df.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [44]:
df['Category']

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [45]:
X = df['Message']

Y = df['Category']

In [46]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [47]:
Y_train.info()

<class 'pandas.core.series.Series'>
Index: 4457 entries, 1978 to 860
Series name: Category
Non-Null Count  Dtype 
--------------  ----- 
4457 non-null   object
dtypes: object(1)
memory usage: 69.6+ KB


In [48]:
Y_train.value_counts()

Category
1    3859
0     598
Name: count, dtype: int64

In [49]:
#vectorizing
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)


X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)


# converting Y_train and Y_test values as integers

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [50]:
model = LogisticRegression()

In [51]:
model.fit(X_train_features, Y_train)

In [52]:
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [53]:
print(accuracy_on_training_data)

0.9670181736594121


In [54]:
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [55]:
print(accuracy_on_test_data)

0.967713004484305
