In [None]:
import re
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.datasets import load_breast_cancer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler


In [None]:

# Fetch the NLTK English stop-word corpus (reported as up-to-date when already cached).
nltk.download('stopwords')
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one raw document into a space-joined string of stemmed tokens.

    NOTE(review): this notebook calls ``preprocess_text`` when building the
    ``clean_review`` and ``clean_text`` columns, but no definition is present
    in the notebook, so a fresh kernel fails with ``NameError``. This is a
    reconstruction built from the ``re``, ``stop_words`` and ``ps`` objects
    set up above -- confirm it matches the pipeline that produced the saved
    metrics.

    Steps: strip HTML tags, keep ASCII letters only, lower-case, drop English
    stop words, Porter-stem the remaining tokens.

    Args:
        text: Raw document; coerced with ``str`` so NaN / non-string cells do
            not raise.

    Returns:
        str: Cleaned tokens joined by single spaces (may be empty).
    """
    text = re.sub(r'<[^>]+>', ' ', str(text))
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    return ' '.join(ps.stem(token) for token in text.split() if token not in stop_words)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Colab-only: mount Google Drive so the dataset paths under /content/drive resolve.
from google.colab import drive

# Mount point used by the read_csv paths in this notebook.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the IMDB reviews CSV from the mounted Drive; later cells read its
# 'review' and 'sentiment' columns.
df = pd.read_csv("/content/drive/MyDrive/IMDB/IMDB Dataset.csv")

In [None]:
# Cleaned text goes into a new column so the raw 'review' text stays available.
df['clean_review'] = df['review'].apply(preprocess_text)

# Encode labels as 1 (positive) / 0 (negative). The identity entries for 1 and 0
# make the mapping idempotent: re-running this cell on already-encoded labels
# leaves them unchanged instead of turning every value into NaN.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0, 1: 1, 0: 0})

In [None]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts: fit on the training split only, then apply the same
# fitted vectorizer to the test split.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)




In [None]:
# Fit multinomial Naive Bayes on the training counts, then label the held-out reviews.
model = MultinomialNB().fit(X_train_vectorized, y_train)
y_pred = model.predict(X_test_vectorized)



In [None]:
# Overall accuracy and the per-class precision / recall / F1 table for the
# held-out predictions in y_pred.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8542
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.87      0.86      4961
           1       0.87      0.84      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Confusion Matrix:
 [[4326  635]
 [ 823 4216]]
ROC-AUC Score: 0.854337777910408


In [None]:
# The confusion matrix is defined on hard class predictions.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ROC-AUC measures how well continuous scores rank positives above negatives.
# Feeding it hard 0/1 predictions collapses the curve to a single operating
# point, so score with the predicted probability of the positive class (1).
y_score = model.predict_proba(X_test_vectorized)[:, 1]
print("ROC-AUC Score:", roc_auc_score(y_test, y_score))

In [None]:
# Breast-cancer dataset: features as a DataFrame with named columns, target as a Series.
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# 80/20 split with a fixed seed. This rebinds X_train / X_test / y_train /
# y_test, replacing the earlier text-classification split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# X_train / X_test hold the numeric breast-cancer features at this point, so no
# text vectorisation step belongs here: passing a DataFrame to CountVectorizer
# iterates over its column labels rather than its rows, which yields a matrix
# whose row count no longer matches y_train.

# Scale data before feature selection (statistics come from the training split only).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep the 5 features with the highest ANOVA F-score against the target.
selector = SelectKBest(score_func=f_classif, k=5)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)



In [None]:

# Logistic regression is the estimator RFE uses to rank features; the same
# `model` object is fitted on the SelectKBest features in a later cell.
model = LogisticRegression(max_iter=1000, solver='saga')

# Recursive feature elimination down to 5 features, then project both splits.
rfe = RFE(estimator=model, n_features_to_select=5).fit(X_train, y_train)
X_train_rfe, X_test_rfe = rfe.transform(X_train), rfe.transform(X_test)


In [None]:
# Train on the SelectKBest features and predict the held-out rows in one chain.
y_pred_selected = model.fit(X_train_selected, y_train).predict(X_test_selected)



In [None]:
print("\nFeature Selection with SelectKBest")
print("Accuracy:", accuracy_score(y_test, y_pred_selected))
print("Classification Report:\n", classification_report(y_test, y_pred_selected))

# The RFE selection was fitted but never evaluated, although the saved output
# includes an RFE report. `rfe.predict` reduces X_test to the selected features
# and predicts with the estimator RFE fitted on those same features.
y_pred_rfe = rfe.predict(X_test)
print("\nFeature Selection with RFE")
print("Accuracy:", accuracy_score(y_test, y_pred_rfe))
print("Classification Report:\n", classification_report(y_test, y_pred_rfe))



Feature Selection with SelectKBest
Accuracy: 0.9736842105263158
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.93      0.96        43
           1       0.96      1.00      0.98        71

    accuracy                           0.97       114
   macro avg       0.98      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


Feature Selection with RFE
Accuracy: 0.9736842105263158
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.93      0.96        43
           1       0.96      1.00      0.98        71

    accuracy                           0.97       114
   macro avg       0.98      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [None]:
# Task 3
def train_naive_bayes(X, y):
    """Fit a two-class multinomial Naive Bayes text model with add-one smoothing.

    Args:
        X: Re-iterable collection of documents (strings). Each document is
            tokenised with ``str.split()``. ``X`` is traversed twice (once for
            the vocabulary, once for the counts), so a one-shot generator will
            not work.
        y: Sized collection of labels aligned with ``X``. Every label must be
            ``0`` or ``1``; any other value raises ``KeyError``.

    Returns:
        tuple: ``(priors, likelihoods, vocab)`` where

        * ``priors[c]`` is ``log(n_docs_in_class_c / len(y))``,
        * ``likelihoods[c][word]`` is
          ``log((count(word, c) + 1) / (total_tokens_in_c + len(vocab)))``
          for every word in the vocabulary,
        * ``vocab`` is the set of all tokens seen in ``X``.
    """
    vocab = {word for text in X for word in text.split()}
    vocab_size = len(vocab)
    class_word_counts = {0: defaultdict(int), 1: defaultdict(int)}
    class_counts = {0: 0, 1: 0}

    for text, label in zip(X, y):
        class_counts[label] += 1
        for word in text.split():
            class_word_counts[label][word] += 1

    n_docs = len(y)
    priors = {c: np.log(class_counts[c] / n_docs) for c in class_counts}

    likelihoods = {}
    for c, word_counts in class_word_counts.items():
        # The smoothing denominator is constant per class; compute it once
        # instead of re-summing all counts for every vocabulary word.
        denominator = sum(word_counts.values()) + vocab_size
        likelihoods[c] = {
            # .get avoids inserting zero-count entries into the defaultdict.
            word: np.log((word_counts.get(word, 0) + 1) / denominator)
            for word in vocab
        }

    return priors, likelihoods, vocab


In [None]:
def predict_naive_bayes(X, priors, likelihoods, vocab):
    """Label each document 0 or 1 using the supplied log-priors and log-likelihoods.

    Args:
        X: Iterable of documents (strings), tokenised with ``str.split()``.
        priors: Mapping of class label to log prior; must contain 0 and 1.
        likelihoods: Mapping of class label to ``{word: log likelihood}``.
        vocab: Container of known words; tokens outside it are ignored.

    Returns:
        list: One prediction per document. Class 1 is chosen only when its
        score is strictly greater than class 0's, so ties resolve to 0.
    """
    def _classify(document):
        # Start from the priors and add the log-likelihood of every known token.
        log_scores = dict(priors)
        for token in document.split():
            if token not in vocab:
                continue
            for cls in (0, 1):
                log_scores[cls] += likelihoods[cls].get(token, 0)
        return 1 if log_scores[1] > log_scores[0] else 0

    return [_classify(document) for document in X]

In [None]:
# Load the spam/ham dataset. This rebinds `df`, replacing the earlier frame.
df = pd.read_csv("/content/drive/MyDrive/IMDB/spam_ham_dataset.csv")

# Keep only rows labelled 'ham' or 'spam'. The explicit .copy() makes the
# filtered frame independent, so the column assignments below do not trigger
# pandas' SettingWithCopyWarning.
df = df[df['label'].isin(['ham', 'spam'])].copy()
df['clean_text'] = df['text'].apply(preprocess_text)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [None]:
# 80/20 split of the cleaned messages with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Fit the hand-written Naive Bayes on the training split ...
priors, likelihoods, vocab = train_naive_bayes(X_train, y_train)

# ... and label the held-out messages with it.
y_pred = predict_naive_bayes(X_test, priors, likelihoods, vocab)



In [None]:
# Accuracy and per-class precision / recall / F1 for the predictions from
# predict_naive_bayes on the held-out split.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9729468599033816
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       742
           1       0.95      0.95      0.95       293

    accuracy                           0.97      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.97      0.97      0.97      1035

