In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [13]:
# Location of the SMS spam CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after pointing this at a local copy of spam.csv.
SPAM_CSV_PATH = r"C:\Users\amitp\OneDrive\Documents\Projects_3rd Year\CodSoft\Spam Sms Detection\archive (6)\spam.csv"

# Encodings are tried in order. latin-1 maps every possible byte to a
# character, so it can never raise UnicodeDecodeError; it acts as the
# catch-all and any encoding listed after it would be unreachable.
encodings_to_try = ['utf-8', 'latin-1']

for encoding in encodings_to_try:
    try:
        spam = pd.read_csv(SPAM_CSV_PATH, encoding=encoding)
        break
    except UnicodeDecodeError:
        print(f"Failed to decode using {encoding} encoding. Trying the next one.")
else:
    # No encoding worked: fail loudly here instead of leaving `spam`
    # undefined (or stale from a previous run) for the cells below.
    raise ValueError(
        f"Could not decode {SPAM_CSV_PATH!r} with any of: {encodings_to_try}"
    )

Failed to decode using utf-8 encoding. Trying the next one.


In [14]:
# Preview the first five rows to check the column layout after loading.
spam.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [15]:
# Lower-case the message text (column v2) so matching is case-insensitive.
# NOTE(review): TfidfVectorizer lower-cases by default as well, so this step
# looks redundant with the vectoriser used later -- confirm before removing.
lowercase_messages = spam['v2'].str.lower()
spam['v2'] = lowercase_messages

In [16]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    spam['v2'],  # features: message text
    spam['v1'],  # target: 'ham' / 'spam' label
    test_size=0.2,
    random_state=42,
)

In [17]:
# Cap on the number of terms kept in the TF-IDF vocabulary.
MAX_TFIDF_FEATURES = 5000

tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)

# Learn the vocabulary and IDF weights from the training messages only, then
# reuse the fitted vectoriser on the test messages so that nothing from the
# held-out set influences the features.
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [18]:
# Logistic regression with scikit-learn's default settings, trained on the
# TF-IDF features of the training messages.
classifier = LogisticRegression()
classifier.fit(X=x_train_tfidf, y=y_train)

In [19]:
# Score the held-out messages and compute the evaluation metrics in one go.
# NOTE(review): the recorded confusion matrix shows 35 of 150 spam messages
# predicted as ham (spam recall 0.77); accuracy alone hides this.
y_pred = classifier.predict(x_test_tfidf)
accuracy, confusion, classification_rep = (
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
)

In [20]:
# Fraction of held-out messages whose label was predicted correctly.
print('Accuracy: {}'.format(accuracy))

Accuracy: 0.967713004484305


In [21]:
# Rows are the true labels and columns the predicted labels, ordered
# ham then spam.
print('Confusion Matrix:\n{}'.format(confusion))

Confusion Matrix:
[[964   1]
 [ 35 115]]


In [22]:
# Per-class precision, recall and F1 on the held-out messages.
print('Classification Report:\n{}'.format(classification_rep))

Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       0.99      0.77      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [23]:
# Sanity-check the trained model on a couple of hand-written messages.
new_messages = [
    "need your help now",
    "Meeting at 3 pm today.",
]

# Reuse the vectoriser fitted on the training data so the features line up
# with what the classifier was trained on.
new_messages_tfidf = tfidf_vectorizer.transform(new_messages)
predictions = classifier.predict(new_messages_tfidf)

for message, prediction in zip(new_messages, predictions):
    # Anything that is not exactly 'spam' is reported as 'ham'.
    if prediction == 'spam':
        label = 'spam'
    else:
        label = 'ham'
    print(f"Message: {message}\nPrediction: {label}\n")

Message: need your help now
Prediction: ham

Message: Meeting at 3 pm today.
Prediction: ham

