In [2]:
# In[1]
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import pickle

# In[2]
# Load the SMS spam dataset and drop the empty "Unnamed" filler columns.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# copy of spam.csv before running on another machine.
file_path = r"C:\Users\bssha\Downloads\spam detectopn\sms_spam_classifier\data\spam.csv"

# latin1 maps every possible byte value to a character, so decoding with it can
# never raise UnicodeDecodeError. The previous nested fallbacks (iso-8859-1,
# which is just an alias of latin1, and cp1252) were therefore unreachable and
# have been removed; the data that gets loaded is exactly the same.
df = pd.read_csv(file_path, encoding='latin1')

# Remove unwanted columns (pandas names header-less columns "Unnamed: <n>")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# In[3]
# Show the column labels so the expected schema can be confirmed
print("Columns in the dataset:", df.columns, sep="\n")

# Preview the leading rows to get a feel for how the data is laid out
print("First few rows of the dataset:", df.head(), sep="\n")

# In[4]
# Define a function to clean text
def clean_text(text):
    """Normalise a raw message for vectorisation.

    The text is lower-cased and every run of non-word characters (anything
    other than letters, digits and underscore) is collapsed into a single
    space. Leading/trailing runs therefore become a single space as well;
    the result is not stripped.

    Args:
        text: The raw message. Missing values (``None`` or float NaN, which
            is what ``pandas.read_csv`` produces for empty cells) are treated
            as an empty message; any other non-string value is converted
            with ``str()`` first.

    Returns:
        The normalised message as a ``str``.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Collapse punctuation/whitespace runs into one space
    return text

# Column 'v2' holds the raw message text ('v1' holds the labels); keep a
# normalised copy of every message next to the original
df['cleaned_message'] = df.v2.apply(clean_text)

# Preview the leading rows to confirm the new column looks as expected
print(df.head(5))

# In[5]
# Define features (cleaned message text) and labels ('ham' / 'spam')
X = df['cleaned_message']
y = df['v1']

# In[6]
# Split the raw text BEFORE fitting the vectorizer. Fitting TF-IDF on the full
# corpus would let the test messages influence the vocabulary and IDF weights
# (data leakage), which makes the evaluation below look better than it is.
# The split depends only on the number of rows and random_state, so the same
# rows land in train/test as before.
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature extraction using TF-IDF Vectorizer: learn vocabulary/IDF from the
# training messages only, then apply that fitted transform to the test messages.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# In[7]
# Baseline classifier: multinomial Naive Bayes with its default settings
model = MultinomialNB().fit(X_train, y_train)

# Score the held-out split and report per-class metrics plus the confusion matrix
y_pred = model.predict(X_test)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# In[8]
# Hyper-parameter search: 5-fold cross-validated grid over MultinomialNB's
# smoothing parameter `alpha`, run on the training split only
param_grid = dict(alpha=[0.5, 1.0, 1.5, 2.0])
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_, sep="\n")
print("Best Score:", grid_search.best_score_, sep="\n")

# Evaluate the refitted winner of the search on the held-out test split
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
print("Classification Report for Best Model:", classification_report(y_test, y_pred_best), sep="\n")
print("Confusion Matrix for Best Model:", confusion_matrix(y_test, y_pred_best), sep="\n")

# In[9]
# Persist the tuned classifier and the fitted vectorizer so both can be
# reloaded together at prediction time (written to the working directory)
for target_name, fitted_object in (
    ('spam_classifier_model.pkl', best_model),
    ('vectorizer.pkl', vectorizer),
):
    with open(target_name, 'wb') as sink:
        pickle.dump(fitted_object, sink)

print("Model and vectorizer saved successfully.")


Columns in the dataset:
Index(['v1', 'v2'], dtype='object')
First few rows of the dataset:
     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
     v1                                                 v2  \
0   ham  Go until jurong point, crazy.. Available only ...   
1   ham                      Ok lar... Joking wif u oni...   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3   ham  U dun say so early hor... U c already then say...   
4   ham  Nah I don't think he goes to usf, he lives aro...   

                                     cleaned_message  
0  go until jurong point crazy available only in ...  
1                           ok lar joking wif u oni   
2  free entry in 