In [1]:
import nltk
import spacy


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [3]:
# Step 1: Read the tab-separated SMS corpus into a two-column DataFrame.
# The file has no header row, so the column names are supplied explicitly.
# NOTE(review): pandas applies its default double-quote handling here; if any
# message contains an unbalanced '"' character, neighbouring rows could be
# merged into one field — confirm the row count against the source file.
file_path = './spam_sms_collection.tsv'
data = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['label', 'message'],
)
print(data)

     label                                            message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [4]:
# Step 2: Data preprocessing
# Encode the text labels as integers (1 for spam, 0 for ham).
#
# The conversion is guarded so that re-running this cell is harmless: calling
# .map() again on the already-encoded column would match none of the
# dictionary keys and silently turn every label into NaN.
label_to_id = {'ham': 0, 'spam': 1}
if not pd.api.types.is_numeric_dtype(data['label']):
    encoded_labels = data['label'].map(label_to_id)
    # Values missing from the mapping come back as NaN; fail loudly here
    # instead of handing NaN targets to the models further down.
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unknown = sorted(data.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f"Unexpected label values: {unknown}")
    data['label'] = encoded_labels
print(data.head(300))

     label                                            message
0        0  Go until jurong point, crazy.. Available only ...
1        0                      Ok lar... Joking wif u oni...
2        1  Free entry in 2 a wkly comp to win FA Cup fina...
3        0  U dun say so early hor... U c already then say...
4        0  Nah I don't think he goes to usf, he lives aro...
..     ...                                                ...
295      0  I accidentally deleted the message. Resend ple...
296      1  T-Mobile customer you may now claim your FREE ...
297      0  Unless it's a situation where YOU GO GURL woul...
298      0  Hurt me... Tease me... Make me cry... But in t...
299      0  I cant pick the phone right now. Pls send a me...

[300 rows x 2 columns]


In [5]:
# Step 3: Hold out 20% of the rows as a test set (80% remain for training).
# The fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=42,
)
print(f"X_train =  {X_train}")
print(f"X_test =  {X_test}")

X_train =  1978    Reply to win £100 weekly! Where will the 2006 ...
3989    Hello. Sort of out in town already. That . So ...
3935     How come guoyang go n tell her? Then u told her?
4078    Hey sathya till now we dint meet not even a si...
4086    Orange brings you ringtones from all time Char...
                              ...                        
3772    Hi, wlcome back, did wonder if you got eaten b...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860               Did he just say somebody is named tampa
Name: message, Length: 4457, dtype: object
X_test =  3245    Squeeeeeze!! This is christmas hug.. If u lik ...
944     And also I've sorta blown him off a couple tim...
1044    Mmm thats better now i got a roast down me! i...
2484        Mm have some kanji dont eat anything heavy ok
812     So there's a ring that comes with the guys cos...
        

In [6]:
# Step 4: Turn the raw messages into TF-IDF features.
# English stop words are dropped, max_df=0.9 filters out very common terms,
# and max_features=1000 caps the vocabulary size.
# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.9,
    max_features=1000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [7]:
# Step 5: Train and evaluate models
# Multinomial Naive Bayes: fit on the training features, predict the held-out
# messages, then score those predictions against the true labels.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_f1 = f1_score(y_test, y_pred_nb)

In [8]:
# Decision tree: same fit / predict / score sequence as the Naive Bayes model.
# random_state is pinned so the fitted tree is reproducible.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train_tfidf, y_train)
y_pred_dt = dt_model.predict(X_test_tfidf)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_f1 = f1_score(y_test, y_pred_dt)

In [10]:
# Build the text classification report for each model's test-set predictions
# (Naive Bayes first, decision tree second).
nb_report, dt_report = (
    classification_report(y_test, predictions)
    for predictions in (y_pred_nb, y_pred_dt)
)

In [11]:
# Step 6: Save results to a file
# One (heading, accuracy, F1, report) entry per model; every entry is rendered
# with the same template so the two sections cannot drift apart.
results_to_save = [
    ("Naive Bayes Model:", nb_accuracy, nb_f1, nb_report),
    ("Decision Tree Model:", dt_accuracy, dt_f1, dt_report),
]
sections = [
    f"{heading}\nAccuracy: {accuracy}\nF1 Score: {f1_value}\nClassification Report:\n{report}"
    for heading, accuracy, f1_value, report in results_to_save
]

# The encoding is stated explicitly so the written file does not depend on the
# platform's default text encoding.
with open('./model_results.txt', 'w', encoding='utf-8') as file:
    # Sections are separated by one blank line.
    file.write("\n\n".join(sections))

print("Results have been saved to 'model_results.txt'")

Results have been saved to 'model_results.txt'


In [9]:
# Step 7: Display results
# For each model print its heading, accuracy, F1 score and the classification
# report. The second heading starts with a newline to leave a blank line
# between the two models.
for heading, accuracy, f1_value, predictions in (
    ("Naive Bayes Model:", nb_accuracy, nb_f1, y_pred_nb),
    ("\nDecision Tree Model:", dt_accuracy, dt_f1, y_pred_dt),
):
    print(heading)
    print("Accuracy:", accuracy)
    print("F1 Score:", f1_value)
    print(classification_report(y_test, predictions))


Naive Bayes Model:
Accuracy: 0.97847533632287
F1 Score: 0.9124087591240876
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115


Decision Tree Model:
Accuracy: 0.968609865470852
F1 Score: 0.8788927335640139
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       966
           1       0.91      0.85      0.88       149

    accuracy                           0.97      1115
   macro avg       0.94      0.92      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Whole pipeline in a single cell: load, encode labels, split, vectorize,
# train two classifiers and print their metrics.

# Step 1: Load the dataset (tab-separated, no header row)
file_path = './spam_sms_collection.tsv'
data = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['label', 'message'],
)

# Step 2: Data preprocessing
# Encode labels as integers (1 for spam, 0 for ham)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Step 3: Train-test split (80% train, 20% test), seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# Step 4: Text feature extraction using TF-IDF
# The vectorizer is fitted on the training messages only.
# NOTE(review): unlike the earlier TF-IDF cell in this notebook, no
# max_features cap is set here — confirm which configuration is intended.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 5: Train and evaluate models
# Naive Bayes Model
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)
nb_f1 = f1_score(y_test, y_pred_nb)
nb_accuracy = accuracy_score(y_test, y_pred_nb)

# Decision Tree Model (seeded so the fitted tree is reproducible)
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train_tfidf, y_train)
y_pred_dt = dt_model.predict(X_test_tfidf)
dt_f1 = f1_score(y_test, y_pred_dt)
dt_accuracy = accuracy_score(y_test, y_pred_dt)

# Step 6: Display results
# Same print sequence for each model; the second heading starts with a
# newline to leave a blank line between the two models.
for heading, accuracy, f1_value, predictions in (
    ("Naive Bayes Model:", nb_accuracy, nb_f1, y_pred_nb),
    ("\nDecision Tree Model:", dt_accuracy, dt_f1, y_pred_dt),
):
    print(heading)
    print("Accuracy:", accuracy)
    print("F1 Score:", f1_value)
    print(classification_report(y_test, predictions))
