In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
X_train_df = pd.read_csv('train.csv')
X_test_df = pd.read_csv('test.csv')
X_val_df = pd.read_csv('validation.csv')

In [16]:
tf_idf = TfidfVectorizer()

In [17]:
X_train_df

Unnamed: 0,Email,Spam
0,gas model chaim received number phone messages...,0
1,backtesting different percentiles vlady enclos...,0
2,year end performance feedback note receive mes...,0
3,neugierig content type text plain content tran...,1
4,garp frank reviewed materials garp find inform...,0
...,...,...
3412,friend mine shirley please arrange phone inter...,0
3413,var cob nd aug hi vince waiting comment email ...,0
3414,willow pathstar evaluations please respond mik...,0
3415,cool medz hello welcome medzonli decapitation ...,1


In [18]:
X_train = tf_idf.fit_transform(X_train_df['Email']).toarray()
y_train = X_train_df['Spam']

In [19]:
X_test_df

Unnamed: 0,Email,Spam
0,forwarded vince j kaminski hou ect gould aaron...,0
1,new love tabs shop visit llcensed online drags...,1
2,additional e mail addresses vince three new st...,0
3,synfuel option valuation lenny believe must do...,0
4,missing prc information vince following inform...,0
...,...,...
1134,congratulations dear vince soooo gland see get...,0
1135,additional attachments vince forgot attach fin...,0
1136,visit enron professor nalin kulatilaka boston ...,0
1137,subscription renewal barbara yes would like re...,0


In [20]:
X_test = tf_idf.transform(X_test_df['Email']).toarray()
y_test = X_test_df['Spam']

In [21]:
X_val_df

Unnamed: 0,Email,Spam
0,fyi enron best hi vince spoke molly mcgee hr g...,0
1,fw gmm mar jeff newsletter addressed wide audi...,0
2,russian investment climate multimedia playback...,0
3,professional advertising dear projecthoneypot ...,1
4,supply rebound beginning update cera outlook u...,0
...,...,...
1134,new eprm speakers vince thanks much help helen...,0
1135,technical writer position note confirm cease r...,0
1136,calling pm pm hi vince thank allowing call spe...,0
1137,fw london wish list oops sent previous email a...,0


In [22]:
X_val = tf_idf.transform(X_val_df['Email']).toarray()
y_val = X_val_df['Spam']

# Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score

result_df = pd.DataFrame(columns = ['lambda', 'precision', 'recall'])

for lam in np.arange(0.1, 1, 0.1):
    model = LogisticRegression(penalty = 'elasticnet', l1_ratio = lam, solver = 'saga', max_iter = 1000)
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    
    result_df = pd.concat([result_df, pd.DataFrame({'lambda': [lam], 'precision': [precision], 'recall': [recall]})], ignore_index=True)

print("DataFrame containing lambda, precision, and recall:")
result_df

DataFrame containing lambda, precision, and recall:


Unnamed: 0,lambda,precision,recall
0,0.1,0.988235,0.903226
1,0.2,0.980392,0.896057
2,0.3,0.980315,0.892473
3,0.4,0.980392,0.896057
4,0.5,0.972973,0.903226
5,0.6,0.965779,0.910394
6,0.7,0.954887,0.910394
7,0.8,0.951128,0.90681
8,0.9,0.940741,0.910394


# Decision Tree

In [23]:
from sklearn.tree import DecisionTreeClassifier

result_df = pd.DataFrame(columns = ['max_depth', 'precision', 'recall'])

for max_depth in range(5,90,10):
    clf = DecisionTreeClassifier(max_depth = max_depth, random_state = 52)
    
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_val)
    
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    
    result_df = pd.concat([result_df, pd.DataFrame({'max_depth': [max_depth], 'precision': [precision], 'recall': [recall]})], ignore_index=True)


print("DataFrame containing max_depth, precision, and recall:")
result_df

DataFrame containing max_depth, precision, and recall:


Unnamed: 0,max_depth,precision,recall
0,5,0.728767,0.953405
1,15,0.82243,0.946237
2,25,0.875839,0.935484
3,35,0.870748,0.917563
4,45,0.887719,0.90681
5,55,0.895105,0.917563
6,65,0.895105,0.917563
7,75,0.895105,0.917563
8,85,0.895105,0.917563


# SVM

In [24]:
from sklearn.svm import SVC

C_values = [0.1, 1, 10, 100, 1000]

result_df = pd.DataFrame(columns = ['C', 'precision', 'recall'])

for C in C_values:
    clf = SVC(C = C, kernel = 'rbf', random_state = 44)
    
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_val)
    
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    
    result_df = pd.concat([result_df, pd.DataFrame({'C_value': [C], 'precision': [precision], 'recall': [recall]})], ignore_index=True)

print("DataFrame containing C, precision, and recall:")
result_df


DataFrame containing C, precision, and recall:


Unnamed: 0,C,precision,recall,C_value
0,,1.0,0.222222,0.1
1,,0.988848,0.953405,1.0
2,,0.988889,0.956989,10.0
3,,0.988889,0.956989,100.0
4,,0.988889,0.956989,1000.0


# Model Evaluation

In [26]:
decision_tree_model = DecisionTreeClassifier(max_depth = 55, random_state = 52)
decision_tree_model.fit(X_train,y_train)

svm_model = SVC(C = 1.0, kernel = 'rbf', random_state = 44)
svm_model.fit(X_train, y_train)

logistic_regression_model = LogisticRegression(penalty = 'elasticnet', l1_ratio = 0.1, solver = 'saga', max_iter = 1000)
logistic_regression_model.fit(X_train, y_train)

LogisticRegression(l1_ratio=0.1, max_iter=1000, penalty='elasticnet',
                   solver='saga')

In [28]:
decision_tree_pred = decision_tree_pred = decision_tree_model.predict(X_test)
svm_pred = svm_model.predict(X_test)
logistic_regression_pred = logistic_regression_model.predict(X_test)

# Calculate precision and recall for each model
decision_tree_precision = precision_score(y_test, decision_tree_pred)
decision_tree_recall = recall_score(y_test, decision_tree_pred)

svm_precision = precision_score(y_test, svm_pred)
svm_recall = recall_score(y_test, svm_pred)

logistic_regression_precision = precision_score(y_test, logistic_regression_pred)
logistic_regression_recall = recall_score(y_test, logistic_regression_pred)

# Create a DataFrame to store model names, precision, and recall
result_df = pd.DataFrame({
    'Model': ['Decision Tree', 'SVM', 'Logistic Regression'],
    'Precision': [decision_tree_precision, svm_precision, logistic_regression_precision],
    'Recall': [decision_tree_recall, svm_recall, logistic_regression_recall]
})

# Print the DataFrame
print("DataFrame containing model names, precision, and recall:")

result_df

DataFrame containing model names, precision, and recall:


Unnamed: 0,Model,Precision,Recall
0,Decision Tree,0.888889,0.924188
1,SVM,0.988889,0.963899
2,Logistic Regression,0.984252,0.902527


### By observing above data, we can say that SVM is the optimal model.