# Initialisation: Importing Libraries and Data

In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Fetch the NLTK resources used by the preprocessing cells below:
#   punkt, punkt_tab  -> tokenizer models loaded by word_tokenize
#                        (recent NLTK releases, 3.9 and later, look up
#                        'punkt_tab' instead of the pickled 'punkt' models)
#   stopwords         -> word lists behind stopwords.words('english')
#   wordnet, omw-1.4  -> WordNet data used by WordNetLemmatizer
# nltk.download() skips packages that are already up to date and, by default,
# reports an unknown package by returning False instead of raising, so asking
# for both tokenizer packages is safe on old and new NLTK versions.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Read the labelled training emails into a DataFrame
train_csv_path = 'train.csv'
train_data = pd.read_csv(train_csv_path)

In [7]:
# Preview the first rows of the training frame to check the loaded columns
train_data.head()

Unnamed: 0,label,text,EmailID
0,Ham,eric called me last night and i am happy to co...,0.0
1,Spam,tr what is funnygadget com funnygadget com ca...,1.0
2,Ham,"ricky sent me the nom early - 87 , 000 for the...",2.0
3,Ham,these are the numbers that are being exported ...,3.0
4,Spam,authentic replica rolex and other watches for ...,4.0


# Preprocessing and Feature Engineering

In [8]:
# Normalise case so tokens that differ only in capitalisation are treated alike
lowercased_train_text = train_data['text'].str.lower()
train_data['text'] = lowercased_train_text

In [9]:
# Strip punctuation: delete every character that is neither a word character
# (letter, digit or underscore) nor whitespace
train_punctuation_re = re.compile(r'[^\w\s]')
train_data['text'] = train_data['text'].apply(lambda doc: train_punctuation_re.sub('', doc))

In [10]:
# Split each document into a list of word tokens
train_data['text'] = train_data['text'].map(word_tokenize)

In [11]:
# Drop English stopwords from every token list
stop_words = set(stopwords.words('english'))


def keep_content_tokens(tokens):
    """Return the tokens that are not in ``stop_words``, in their original order."""
    return [token for token in tokens if token not in stop_words]


train_data['text'] = train_data['text'].apply(keep_content_tokens)

In [12]:
# Replace every token with its WordNet lemma (lemmatizer called with its default arguments)
lemmatizer = WordNetLemmatizer()
train_data['text'] = train_data['text'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

In [13]:
# Rebuild one space-separated string per document from its token list
train_data['text'] = train_data['text'].apply(' '.join)

In [14]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
documents = train_data['text']
labels = train_data['label']
X_train, X_val, y_train, y_val = train_test_split(
    documents,
    labels,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Vectorize: learn the TF-IDF vocabulary (capped at 6500 terms) on the training
# split only, then reuse the fitted vectorizer to encode the validation split
max_tfidf_terms = 6500
tfidf_vectorizer = TfidfVectorizer(max_features=max_tfidf_terms)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Model Training

## Using Support Vector Classifier

In [16]:
# Train a linear-kernel support vector classifier on the TF-IDF training matrix.
# fit() returns the estimator itself, so the result is bound to `model` and not
# to a prediction variable; predictions are produced separately with predict().
model = SVC(kernel='linear').fit(X_train_tfidf, y_train)

In [17]:
# Score the fitted SVC on the held-out validation split
val_predictions = model.predict(X_val_tfidf)
accuracy = accuracy_score(y_val, val_predictions)
# Interpolate the value in the f-string and keep plain literals as plain strings;
# the printed text is unchanged.
print(f"Validation Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_val, val_predictions))

Validation Accuracy: 0.966848095002474
Classification Report:
              precision    recall  f1-score   support

         Ham       0.98      0.96      0.97      2110
        Spam       0.96      0.97      0.97      1932

    accuracy                           0.97      4042
   macro avg       0.97      0.97      0.97      4042
weighted avg       0.97      0.97      0.97      4042



## Using MultinomialNB

In [18]:
# Fit a multinomial Naive Bayes classifier on the same TF-IDF features, then
# predict labels for the validation split
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb_predictions = nb_model.predict(X_val_tfidf)

In [19]:
# Score the Naive Bayes predictions on the validation split
nb_accuracy = accuracy_score(y_val, nb_predictions)
print(f"MultinomialNB Validation Accuracy: {nb_accuracy}")
# No placeholder here, so a plain string literal is used; the printed text is unchanged.
print("MultinomialNB Classification Report:")
print(classification_report(y_val, nb_predictions))

MultinomialNB Validation Accuracy: 0.946561108362197
MultinomialNB Classification Report:
              precision    recall  f1-score   support

         Ham       0.94      0.96      0.95      2110
        Spam       0.95      0.93      0.94      1932

    accuracy                           0.95      4042
   macro avg       0.95      0.95      0.95      4042
weighted avg       0.95      0.95      0.95      4042



# Prediction on Test Data

In [20]:
# Read the unlabelled test emails into a DataFrame
test_csv_path = 'test.csv'
test_data = pd.read_csv(test_csv_path)

In [21]:
# Preview the first rows of the test frame to check the loaded columns
test_data.head()

Unnamed: 0,text,EmailID
0,monika try calling ron heller escapenumber esc...,46524
1,"dear all ,\nattached is a paper describing the...",46525
2,arm inc e yo xual des spe ume reas ur se ire r...,46526
3,you are the man http ourmix hk,46527
4,kay mann enron com bmm b b b b b b b escapenum...,46528


## Preprocessing test data

In [22]:
# Normalise case the same way as for the training text
lowercased_test_text = test_data['text'].str.lower()
test_data['text'] = lowercased_test_text

In [23]:
# Strip punctuation: delete every character that is neither a word character
# (letter, digit or underscore) nor whitespace
test_punctuation_re = re.compile(r'[^\w\s]')
test_data['text'] = test_data['text'].apply(lambda doc: test_punctuation_re.sub('', doc))

In [24]:
# Split each test document into a list of word tokens
test_data['text'] = test_data['text'].map(word_tokenize)

In [25]:
# Drop English stopwords from every test token list
stop_words = set(stopwords.words('english'))
test_data['text'] = test_data['text'].apply(
    lambda tokens: list(filter(lambda token: token not in stop_words, tokens))
)

In [26]:
# Replace every test token with its WordNet lemma (lemmatizer called with its default arguments)
lemmatizer = WordNetLemmatizer()
lemmatize_token = lemmatizer.lemmatize
test_data['text'] = test_data['text'].apply(
    lambda tokens: [lemmatize_token(token) for token in tokens]
)

In [27]:
#Join tokens back into text
# Only the test set is joined here. train_data['text'] already holds joined
# strings from the training preprocessing, and calling ' '.join on a string
# would insert a space between every character and corrupt the training text.
test_data['text'] = test_data['text'].apply(' '.join)

In [28]:
# Vectorise: encode the test documents with the already-fitted TF-IDF vectorizer
# (transform only, so the vectorizer is not refitted on the test text)
test_documents = test_data['text']
X_test_tfidf = tfidf_vectorizer.transform(test_documents)

## Making and Saving Predictions

In [29]:
# Label the test emails with `model` and write EmailID/label pairs to submission.csv
test_predictions = model.predict(X_test_tfidf)
submission_columns = {
    'EmailID': test_data['EmailID'],
    'label': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('submission.csv', index=False)