In [76]:
import pandas as pd
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the dataset

In [45]:
df = pd.read_csv("mail_data.csv")

In [46]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Preprocess text

In [47]:
def text_preprocess(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [word for word in text.split() if word.lower() not in  stopwords.words('english')]
    return " ".join(text)

In [48]:
x = df.loc[:, 'Message'].values
y = df.loc[:, 'Category'].values

In [49]:
x[0:5]

array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       'U dun say so early hor... U c already then say...',
       "Nah I don't think he goes to usf, he lives around here though"],
      dtype=object)

In [50]:
for i, text in enumerate(x):
    new_text = text_preprocess(text)
    x[i] = new_text

In [51]:
x[0:5]

array(['Go jurong point crazy Available bugis n great world la e buffet Cine got amore wat',
       'Ok lar Joking wif u oni',
       'Free entry 2 wkly comp win FA Cup final tkts 21st May 2005 Text FA 87121 receive entry questionstd txt rateTCs apply 08452810075over18s',
       'U dun say early hor U c already say',
       'Nah dont think goes usf lives around though'], dtype=object)

In [55]:
vectorizer = TfidfVectorizer(stop_words="english")

In [56]:
x = vectorizer.fit_transform(x)

In [58]:
le = LabelEncoder()
y = le.fit_transform(y)

In [60]:
y[0:5]

array([0, 0, 1, 0, 0])

In [63]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.8)

# Train the models

### 1- Logistic Regression

In [64]:
lr = LogisticRegression(solver='liblinear', penalty='l1')

In [65]:
lr.fit(x_train, y_train)

In [69]:
y_pred = lr.predict(x_test)
print("Accuracy = {}".format(accuracy_score(y_test, y_pred)))

Accuracy = 0.911395244504262


### 2- Decision Tree

In [71]:
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)

In [72]:
y_pred = dt.predict(x_test)
print("Accuracy = {}".format(accuracy_score(y_test, y_pred)))

Accuracy = 0.9466128308658591


### 3- Random Forest

In [77]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

In [78]:
y_pred = rf.predict(x_test)
print("Accuracy = {}".format(accuracy_score(y_test, y_pred)))

Accuracy = 0.9434724091520862
