In [1]:
import csv
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve


In [2]:
# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rubas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rubas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [43]:
def load_dataset(file_path):
    dataset = []
    encodings_to_try = ['utf-8', 'latin-1', 'windows-1256']  # Add more encodings to try if needed
    
    for encoding in encodings_to_try:
        try:
            with open(file_path, 'r', encoding=encoding) as csv_file:
                csv_reader = csv.reader(csv_file)
                next(csv_reader)  # Skip the header row
                for row in csv_reader:
                    text = row[1]  # Assuming text is in the second column
                    label = int(row[2])  # Assuming label is in the third column
                    dataset.append((text, label))
            return dataset
        except UnicodeDecodeError:
            continue
    
    # If none of the encodings work
    raise Exception("Unable to decode the dataset using any of the specified encodings")

# Rest of your code remains unchanged


In [44]:
# Preprocessing
stop_words = set(stopwords.words('arabic'))

def preprocess(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return dict([(word, True) for word in filtered_words])

In [45]:
# Load and preprocess the dataset
file_path = r'C:\Users\rubas\Downloads\combined_cleaned_data.csv'
dataset = load_dataset(file_path)
preprocessed_dataset = [(preprocess(text), label) for text, label in dataset]


In [46]:

# Split dataset into training and testing sets
train_size = int(0.8 * len(preprocessed_dataset))
train_set = preprocessed_dataset[:train_size]
test_set = preprocessed_dataset[train_size:]


In [47]:
# Train a Naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_set)


In [48]:

# Evaluate the classifier
#true_labels = [label for (_, label) in test_set]
#predicted_labels = [classifier.classify(preprocess(text)) for (text, _) in test_set]
# Evaluate the classifier
accuracy = nltk.classify.accuracy(classifier, test_set)
print("Accuracy:", accuracy)


Accuracy: 0.8630590833793484


In [39]:
true_labels = [label for (_, label) in test_set]
predicted_labels = [classifier.classify(preprocess(text)) for (text, _) in test_set]


TypeError: expected string or bytes-like object

In [21]:
# Confusion Matrix
cm = confusion_matrix(true_labels, predicted_labels)
plt.figure()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.colorbar()
classes = ["Ham", "Spam"]
plt.xticks(np.arange(len(classes)), classes, rotation=45)
plt.yticks(np.arange(len(classes)), classes)
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.tight_layout()

NameError: name 'predicted_labels' is not defined

In [None]:
# ROC Curve
fpr, tpr, _ = roc_curve(true_labels, predicted_labels)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")


In [None]:
# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(true_labels, predicted_labels)
plt.figure()
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')

# Show all plots
plt.show()