In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv('/content/archive (14).zip', encoding='latin-1')
data = data.rename(columns={'v1': 'label', 'v2': 'message'})  # Rename columns
data['label'] = data['label'].map({'ham': 0, 'spam': 1})  # Map labels to 0 and 1

# Preprocessing (if needed - you might have already done this)
# ... (Refer to previous responses for preprocessing steps) ...

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['message'], data['label'], test_size=0.2, random_state=42)

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 1. Naive Bayes Classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train_tfidf.toarray(), y_train)  # Convert to dense array
nb_predictions = nb_classifier.predict(X_test_tfidf.toarray())
nb_accuracy = accuracy_score(y_test, nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Naive Bayes Classification Report:\n", classification_report(y_test, nb_predictions))

# 2. Logistic Regression Classifier
lr_classifier = LogisticRegression()
lr_classifier.fit(X_train_tfidf, y_train)
lr_predictions = lr_classifier.predict(X_test_tfidf)
lr_accuracy = accuracy_score(y_test, lr_predictions)
print("Logistic Regression Accuracy:", lr_accuracy)
print("Logistic Regression Classification Report:\n", classification_report(y_test, lr_predictions))

# 3. Support Vector Machine Classifier
svm_classifier = SVC()
svm_classifier.fit(X_train_tfidf, y_train)
svm_predictions = svm_classifier.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_test, svm_predictions)
print("SVM Accuracy:", svm_accuracy)
print("SVM Classification Report:\n", classification_report(y_test, svm_predictions))

# Choose the best performing model based on accuracy and other metrics
# ...

Naive Bayes Accuracy: 0.8968609865470852
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.90      0.94       965
           1       0.57      0.90      0.70       150

    accuracy                           0.90      1115
   macro avg       0.78      0.90      0.82      1115
weighted avg       0.93      0.90      0.91      1115

Logistic Regression Accuracy: 0.967713004484305
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       0.99      0.77      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

SVM Accuracy: 0.9820627802690582
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
       

In [3]:
!pip install scikit-learn



In [3]:
data.info(),data.describe

In [4]:
data.dropna().isnull()

Unnamed: 0,label,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
281,False,False,False,False,False
1038,False,False,False,False,False
2255,False,False,False,False,False
3525,False,False,False,False,False
4668,False,False,False,False,False
5048,False,False,False,False,False


In [6]:
data.fillna

In [7]:
data.head()

Unnamed: 0,label,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,"Go until jurong point, crazy.. Available only ...",,,
1,0,Ok lar... Joking wif u oni...,,,
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,0,U dun say so early hor... U c already then say...,,,
4,0,"Nah I don't think he goes to usf, he lives aro...",,,


In [8]:
data.describe()

Unnamed: 0,label
count,5572.0
mean,0.134063
std,0.340751
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [9]:
data.tail()

Unnamed: 0,label,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,1,This is the 2nd time we have tried 2 contact u...,,,
5568,0,Will Ì_ b going to esplanade fr home?,,,
5569,0,"Pity, * was in mood for that. So...any other s...",,,
5570,0,The guy did some bitching but I acted like i'd...,,,
5571,0,Rofl. Its true to its name,,,


In [12]:
data.columns

Index(['label', 'message', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [14]:
data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,4825
1,747


In [15]:
data['message'].value_counts()

Unnamed: 0_level_0,count
message,Unnamed: 1_level_1
"Sorry, I'll call later",30
I cant pick the phone right now. Pls send a message,12
Ok...,10
"7 wonders in My WORLD 7th You 6th Ur style 5th Ur smile 4th Ur Personality 3rd Ur Nature 2nd Ur SMS and 1st \Ur Lovely Friendship\""... good morning dear""",4
"Say this slowly.? GOD,I LOVE YOU &amp; I NEED YOU,CLEAN MY HEART WITH YOUR BLOOD.Send this to Ten special people &amp; u c miracle tomorrow, do it,pls,pls do it...",4
...,...
I gotta collect da car at 6 lei.,1
No. On the way home. So if not for the long dry spell the season would have been over,1
Urgent! Please call 09061743811 from landline. Your ABTA complimentary 4* Tenerife Holiday or å£5000 cash await collection SAE T&Cs Box 326 CW25WX 150ppm,1
Dear 0776xxxxxxx U've been invited to XCHAT. This is our final attempt to contact u! Txt CHAT to 86688 150p/MsgrcvdHG/Suite342/2Lands/Row/W1J6HL LDN 18yrs,1


In [16]:
data['message'] = data['message'].str.lower().str.replace('[^a-zA-Z0-9\s]', '', regex=True)