In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


# Fetch the NLTK data used below: the Punkt tokenizer data behind
# `word_tokenize` and the English stop-word list. Recent NLTK releases
# (3.8.2 and later) load the tokenizer from the 'punkt_tab' package instead of
# the older 'punkt' one, so both are requested; whichever the installed
# version does not need is simply left unused.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)


# Load the SMS dataset. NOTE(review): 'latin-1' is presumably needed because
# the file is not UTF-8 encoded -- confirm against the data source.
df = pd.read_csv('/content/spam.csv',encoding='latin-1')

# Characters deleted before tokenising: punctuation (anything that is neither a
# word character nor whitespace), digits, and the underscore, which `\w` would
# otherwise let through.
_NON_LETTER_RE = re.compile(r'[^\w\s]|[\d_]')

# Filled on first use so the stop-word set and the stemmer are built once
# instead of once per message.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the ``(stop_words, stemmer)`` pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        # Strip the stop words exactly like the messages are stripped, so that
        # contractions such as "don't" are still recognised after they have
        # become "dont". Trade-off: a word that happens to equal a stripped
        # contraction is then filtered out as well.
        stripped = (_NON_LETTER_RE.sub('', word) for word in stopwords.words('english'))
        _PREPROCESS_CACHE['stop_words'] = frozenset(word for word in stripped if word)
        _PREPROCESS_CACHE['stemmer'] = PorterStemmer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['stemmer']


def preprocess_text(text):
    """Turn one raw message into a space-separated string of stemmed tokens.

    Steps: lowercase, delete punctuation/digits/underscores, tokenise with
    ``word_tokenize``, drop English stop words, Porter-stem what remains.

    Args:
        text: The raw message. ``None`` and float NaN are treated as an empty
            message; any other non-string value is converted with ``str``.

    Returns:
        str: The surviving stems joined by single spaces, or ``''`` when no
        token survives.
    """
    # Missing values would otherwise fail on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    elif not isinstance(text, str):
        text = str(text)

    stop_words, stemmer = _preprocess_resources()

    # Lowercase, then remove every non-letter character in a single pass.
    cleaned = _NON_LETTER_RE.sub('', text.lower())

    # Tokenise, filter stop words and stem the remaining tokens.
    tokens = word_tokenize(cleaned)
    stems = [stemmer.stem(word) for word in tokens if word not in stop_words]

    return ' '.join(stems)


# Clean every raw message (column 'v2') and store the result next to it.
processed_messages = df['v2'].apply(preprocess_text)
df['processed_message'] = processed_messages

# Preview the first rows of raw and cleaned text side by side.
preview_columns = ['v2', 'processed_message']
print(df[preview_columns].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                  v2  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                   processed_message  
0  go jurong point crazi avail bugi n great world...  
1                              ok lar joke wif u oni  
2  free entri wkli comp win fa cup final tkt st m...  
3                u dun say earli hor u c alreadi say  
4          nah dont think goe usf live around though  


In [None]:
# Display the frame to check the new 'processed_message' column alongside the
# columns that were read from the CSV.
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,processed_message
0,ham,"Go until jurong point, crazy.. Available only ...",,,,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,,,,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,free entri wkli comp win fa cup final tkt st m...
3,ham,U dun say so early hor... U c already then say...,,,,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,nah dont think goe usf live around though
...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,,nd time tri contact u u å pound prize claim ea...
5568,ham,Will Ì_ b going to esplanade fr home?,,,,ì_ b go esplanad fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",,,,piti mood soani suggest
5570,ham,The guy did some bitching but I acted like i'd...,,,,guy bitch act like id interest buy someth els ...


In [None]:
# Remove the unnamed columns read from the CSV (they appear empty in the rows
# displayed above). `df` is rebound and missing labels are ignored so the cell
# can be re-run safely: an in-place drop raises KeyError on a second run once
# the columns are gone. `columns=` already selects the axis, so no `axis=`
# argument is needed.
df = df.drop(columns=["Unnamed: 2","Unnamed: 3","Unnamed: 4"], errors='ignore')

In [None]:
# Show the first rows to confirm which columns remain after the drop.
df.head()

Unnamed: 0,v1,v2,processed_message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


# Features are the cleaned messages; the target is the label column 'v1'.
X = df['processed_message']
y = df['v1']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vocabulary (capped at 5000 terms) on the training split only,
# then reuse the fitted vectorizer to transform the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Terms kept by the vectorizer, in feature-column order.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Report the vocabulary size and the dimensions of both matrices.
print(f"Number of features: {len(feature_names)}")
print(f"Shape of training data: {X_train_tfidf.shape}")
print(f"Shape of testing data: {X_test_tfidf.shape}")

Number of features: 5000
Shape of training data: (4457, 5000)
Shape of testing data: (1115, 5000)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


def _fit_and_score(classifier):
    """Fit `classifier` on the TF-IDF training data and score it on the test split.

    Reads the notebook-level `X_train_tfidf`, `y_train`, `X_test_tfidf` and
    `y_test`. Returns a `(predictions, accuracy)` tuple for the test split.
    """
    classifier.fit(X_train_tfidf, y_train)
    predictions = classifier.predict(X_test_tfidf)
    return predictions, accuracy_score(y_test, predictions)


# --- Multinomial Naive Bayes ---
nb_classifier = MultinomialNB()
nb_predictions, nb_accuracy = _fit_and_score(nb_classifier)
print("Naive Bayes Classifier")
print(f"Accuracy: {nb_accuracy}")
print(classification_report(y_test, nb_predictions))

# --- Logistic regression (iteration cap raised to 1000) ---
lr_classifier = LogisticRegression(max_iter=1000)
lr_predictions, lr_accuracy = _fit_and_score(lr_classifier)
print("\nLogistic Regression Classifier")
print(f"Accuracy: {lr_accuracy}")
print(classification_report(y_test, lr_predictions))

# --- Support vector machine with default settings ---
svm_classifier = SVC()
svm_predictions, svm_accuracy = _fit_and_score(svm_classifier)
print("\nSupport Vector Machine Classifier")
print(f"Accuracy: {svm_accuracy}")
print(classification_report(y_test, svm_predictions))


Naive Bayes Classifier
Accuracy: 0.968609865470852
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115


Logistic Regression Classifier
Accuracy: 0.9515695067264573
              precision    recall  f1-score   support

         ham       0.95      1.00      0.97       965
        spam       0.96      0.67      0.79       150

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115


Support Vector Machine Classifier
Accuracy: 0.9757847533632287
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       965
        spam       0.99      0.83      0.90       150

    ac

In [None]:
from sklearn.model_selection import GridSearchCV


# Hyper-parameter grid for the SVM: four values of C crossed with four kernels.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

# 5-fold cross-validated search over the grid, scored by accuracy; n_jobs=-1
# runs the candidate fits in parallel.
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train_tfidf, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best cross-validation score: {best_score}")

# With the default refit=True, GridSearchCV has already refitted the best
# parameter combination on the whole training set. Reuse that estimator rather
# than constructing and training an identical SVC a second time.
final_svm_classifier = grid_search.best_estimator_
final_svm_predictions = final_svm_classifier.predict(X_test_tfidf)
final_svm_accuracy = accuracy_score(y_test, final_svm_predictions)

print(f"Final SVM Classifier Accuracy: {final_svm_accuracy}")
print(classification_report(y_test, final_svm_predictions))


Best parameters: {'C': 10, 'kernel': 'linear'}
Best cross-validation score: 0.9800299960240169
Final SVM Classifier Accuracy: 0.9775784753363229
              precision    recall  f1-score   support

         ham       0.98      0.99      0.99       965
        spam       0.94      0.89      0.91       150

    accuracy                           0.98      1115
   macro avg       0.96      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115

