In [130]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer


In [131]:
# Load the raw mail data (columns: Category, Message) from the working directory.
dataset = pd.read_csv("mail_data.csv")

In [132]:
# Preview the first five rows of the raw frame.
dataset.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [133]:
# (rows, columns) of the loaded frame.
dataset.shape

(5572, 2)

In [134]:
# Preview the last five rows to check the file was read through to the end.
dataset.tail(n=5)

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [135]:
# Column dtypes, non-null counts and memory footprint.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [136]:
# Class balance of the target. The classes are heavily skewed towards "ham",
# so plain accuracy later on should be read against the majority-class rate.
dataset["Category"].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [137]:
# Count missing values per column before any cleaning.
dataset.isna().sum()

Category    0
Message     0
dtype: int64

In [138]:
# Keep every non-null cell as it is and put an empty string wherever a value is missing.
dataset = dataset.where(dataset.notna(), other='')

In [139]:
# Re-check missing values per column after the null replacement.
dataset.isna().sum()

Category    0
Message     0
dtype: int64

In [140]:
# Encode the target as integers: ham -> 1, spam -> 0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on dataset['Category']: the chained in-place form
# acts on an intermediate object, emits a FutureWarning on recent pandas and
# leaves `dataset` unchanged once Copy-on-Write is enabled.
# The explicit astype pins an integer dtype for the labels and raises
# immediately if any value other than "ham"/"spam" (e.g. '') is still present.
dataset['Category'] = dataset['Category'].replace({"ham": 1, "spam": 0}).astype("int64")


In [141]:
# Preview the first five rows again to confirm Category now holds 1/0 codes.
dataset.head(n=5)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [142]:
# Split the frame into the raw message text (X) and the encoded label (Y).
X, Y = dataset["Message"], dataset["Category"]

In [143]:
# Preview the first five messages.
X.head(n=5)

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [144]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the ham/spam
# ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=3,
)

In [145]:
# TF-IDF text vectoriser: lower-cases the text, drops English stop words and
# keeps every term that appears in at least one document.
tdf = TfidfVectorizer(
    min_df=1,
    stop_words="english",
    lowercase=True,
)

In [146]:
# Learn the TF-IDF vocabulary from the training messages only and vectorise them.
X_train_features = tdf.fit_transform(X_train)

# Series.astype returns a new Series, so the result has to be assigned back;
# calling it as a bare expression leaves Y_train / Y_test with their old dtype.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

# Display the converted test labels as the cell output.
Y_test

793     1
4553    1
2692    1
5231    1
4698    1
       ..
1632    1
2836    1
173     1
4784    1
3619    1
Name: Category, Length: 1115, dtype: int32

In [147]:
# Vectorise the test messages with the vocabulary fitted on the training set
# (transform only, no refit).
X_test_features = tdf.transform(X_test)

In [148]:
# Logistic-regression classifier with the library's default hyper-parameters.
classo = LogisticRegression()

In [149]:
# Train the classifier on the TF-IDF features of the training messages.
classo.fit(X=X_train_features, y=Y_train)

In [150]:
# Predicted labels for the training set, used for the training-accuracy check.
X_train_features_pred = classo.predict(X_train_features)

In [151]:
# Training accuracy. accuracy_score is documented as (y_true, y_pred), so the
# ground-truth labels go first. Accuracy itself is symmetric, but keeping the
# documented order avoids silently wrong results if this call is later swapped
# for an asymmetric metric such as precision or recall.
accuracy_score(Y_train, X_train_features_pred)

0.9667938074938299

In [152]:
# Predicted labels for the held-out test set.
X_test_features_pred = classo.predict(X_test_features)

In [153]:
# Held-out accuracy. accuracy_score is documented as (y_true, y_pred), so the
# ground-truth labels go first. Accuracy itself is symmetric, but keeping the
# documented order avoids silently wrong results if this call is later swapped
# for an asymmetric metric such as precision or recall.
accuracy_score(Y_test, X_test_features_pred)

0.9713004484304932

In [154]:
# One hand-written message to classify, wrapped in a list because the
# vectoriser's transform() takes an iterable of documents.
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# Turn the text into TF-IDF features using the already-fitted vocabulary.
input_data_features = tdf.transform(input_mail)

# Run the trained classifier and show the raw predicted label array.
prediction = classo.predict(input_data_features)
print(prediction)

# Labels were encoded as ham -> 1, spam -> 0; anything other than 1 is reported as spam.
verdict = 'Ham mail' if prediction[0] == 1 else 'Spam mail'
print(verdict)

[1]
Ham mail
