<a href="https://colab.research.google.com/github/akashshukla7458/spamsms/blob/main/spamcall2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import numpy as np
import pandas as pd
import io
import seaborn as sns
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, classification_report



In [None]:
from google.colab import files
uploaded = files.upload()

Saving fraudcall.txt to fraudcall.txt


In [None]:
# Load data
data = pd.read_csv('fraudcall.txt', sep='\t', names=['label','content'])
data.drop_duplicates(inplace=True) # Remove duplicates


In [None]:
# Perform text preprocessing
nltk.download('stopwords')
stop_words = stopwords.words('english')
stemmer = PorterStemmer()
def preprocess_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    words = ' '.join(words)
    return words



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
data['content'] = data['content'].apply(preprocess_text)

# Create train and test sets
X = data['content']
y = data['label']
tfidf = TfidfVectorizer(ngram_range=(1,2))
X = tfidf.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train and evaluate the model
params = {'alpha': [0.01, 0.1, 1, 10]}
nb = MultinomialNB()
grid_search = GridSearchCV(estimator=nb, param_grid=params, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best parameters:", best_params)
print("Best cross-validation score:", best_score)


Best parameters: {'alpha': 0.1}
Best cross-validation score: 0.9726057906458798


In [None]:
nb = MultinomialNB(alpha=best_params['alpha'])
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
report = classification_report(y_test, y_pred)
print("Confusion matrix:", conf_mat)
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Classification report:\n", report)

Confusion matrix: [[989  25]
 [  7 102]]
Accuracy: 0.9715048975957258
Recall: 0.9555624920833108
Classification report:
               precision    recall  f1-score   support

        hamm       0.99      0.98      0.98      1014
        spam       0.80      0.94      0.86       109

    accuracy                           0.97      1123
   macro avg       0.90      0.96      0.92      1123
weighted avg       0.97      0.97      0.97      1123



In [14]:
# Test the model on new text
new_text = "I received a call from someone claiming to be from my bank, asking for my personal information. Is this a scam?"
preprocessed_text = preprocess_text(new_text)
vectorized_text = tfidf.transform([preprocessed_text])
prediction = nb.predict(vectorized_text)[0]
print("Prediction:", prediction)


Prediction: spam
