## Part I: Naive Bayes Classifier

### Task 1: Theory Questions

In [None]:

from IPython.display import Markdown as md

# Written answers to the Naive Bayes theory questions, keyed by question title.
theory_nb = {
    "1. Core Assumption": (
        "Naive Bayes assumes that all features are conditionally independent given the class label. This means the presence of one feature does not affect the presence of another."
    ),
    "2. Types of Naive Bayes": (
        "- GaussianNB: Used for continuous features assuming normal distribution.\n"
        "- MultinomialNB: Best for discrete counts (e.g., word frequencies in text).\n"
        "- BernoulliNB: For binary/boolean features (e.g., word present or not)."
    ),
    "3. High-dimensional suitability": (
        "Naive Bayes works well with high-dimensional data like text because it simplifies calculations using independence assumption, making it computationally efficient even with many features."
    ),
}

# Render every entry as Markdown: the title in bold, then the answer.
# The two spaces before "\n" force a Markdown line break after the title.
for question, answer in theory_nb.items():
    rendered = md(f"**{question}**  \n{answer}")
    display(rendered)


### Task 2: Spam Detection using MultinomialNB

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# --- Load the SMS data (tab-separated, no header row) and encode the labels ---
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_table(url, header=None, names=['label', 'message'])
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# --- Hold out 20% of the messages for evaluation ---
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label_num'],
    test_size=0.2,
    random_state=42,
)

# --- Bag-of-words features: vocabulary is learned from the training split only ---
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# --- Fit the multinomial Naive Bayes classifier and predict the held-out labels ---
model = MultinomialNB()
model.fit(X_train_vec, y_train)
y_pred = model.predict(X_test_vec)

# --- Report the scalar metrics, then the confusion matrix ---
for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
):
    print(f"{metric_name}:", metric_fn(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


### Task 3: GaussianNB with Iris Dataset

In [None]:

from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Compare three classifiers on the Iris dataset using one shared 70/30 split.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=42
)

# Gaussian Naive Bayes.
gnb = GaussianNB()
y_pred_gnb = gnb.fit(X_train, y_train).predict(X_test)
print("GaussianNB Accuracy:", accuracy_score(y_test, y_pred_gnb))

# Logistic regression; max_iter is raised to 200 as in the original cell.
logreg = LogisticRegression(max_iter=200)
y_pred_lr = logreg.fit(X_train, y_train).predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))

# Decision tree. A fixed random_state makes the fitted tree (and therefore the
# printed accuracy) reproducible across runs, matching the seeded split above.
dtree = DecisionTreeClassifier(random_state=42)
y_pred_dt = dtree.fit(X_train, y_train).predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))


## Part II: Decision Trees


### Task 4: Conceptual Questions

In [None]:
# Written answers to the decision-tree conceptual questions, keyed by title.
theory_dt = {
    "1. Entropy & Information Gain": (
        "Entropy is a measure of impurity. Information gain is the reduction in entropy after a dataset is split on a feature."
    ),
    "2. Gini vs Entropy": (
        "Gini is faster to compute and tends to isolate the most frequent class, while entropy is more theoretical and favors pure splits."
    ),
    "3. Overfitting in Decision Trees": (
        "Overfitting occurs when a tree is too deep. Avoid it by pruning, setting max depth, or using ensemble methods."
    ),
}

# Render every entry as Markdown: the title in bold, then the answer.
# The two spaces before "\n" force a Markdown line break after the title.
for question, answer in theory_dt.items():
    rendered = md(f"**{question}**  \n{answer}")
    display(rendered)


### Task 5: Decision Tree on Titanic Dataset


In [None]:
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, plot_tree

import matplotlib.pyplot as plt

# Load dataset
titanic = sns.load_dataset("titanic")

# Preprocess: keep the modelling columns and drop rows with a missing age.
# The frame is built as a fresh copy rather than via dropna(inplace=True) and a
# column assignment on a slice of `titanic`; this avoids SettingWithCopyWarning
# and makes the cell give the same result when it is re-run.
df = (
    titanic[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']]
    .dropna(subset=['age'])
    .copy()
)
# Encode the 'sex' strings as integer codes so the tree can consume them.
df['sex'] = LabelEncoder().fit_transform(df['sex'])

X = df.drop('survived', axis=1)
y = df['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a shallow tree. random_state is fixed so the fitted tree, its plot and
# the metrics below are reproducible (same seed as the split above).
dt = DecisionTreeClassifier(max_depth=4, random_state=42)
dt.fit(X_train, y_train)

# Plot. feature_names is passed as a plain list because some scikit-learn
# releases reject a pandas Index for this parameter.
fig, ax = plt.subplots(figsize=(15, 8))
plot_tree(
    dt,
    feature_names=list(X.columns),
    class_names=['Died', 'Survived'],
    filled=True,
    ax=ax,
)
plt.show()

# Evaluate: predict once and reuse the predictions for both metrics.
y_dt = dt.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_dt))

### Task 6: Model Tuning


In [None]:
# Sweep max_depth from 1 to 10 and record train/test accuracy at each depth to
# visualise where the tree starts to overfit.
train_acc = []
test_acc = []
depths = range(1, 11)

for d in depths:
    # A fixed random_state keeps every tree in the sweep reproducible, so the
    # curve does not change between runs.
    # NOTE: this rebinds `model`, which the spam-detection cell also assigns.
    model = DecisionTreeClassifier(max_depth=d, random_state=42)
    model.fit(X_train, y_train)
    train_acc.append(model.score(X_train, y_train))
    test_acc.append(model.score(X_test, y_test))

# Explicit figure/axes interface instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.plot(depths, train_acc, label="Train Accuracy")
ax.plot(depths, test_acc, label="Test Accuracy")
ax.set_xlabel("Max Depth")
ax.set_ylabel("Accuracy")
ax.set_title("Overfitting Visualization")
ax.legend()
plt.show()

## Part III: Ensemble Learning

### Task 7: Conceptual Questions


# Written answers to the ensemble-learning conceptual questions, keyed by title.
theory_ensemble = {
    "1. Bagging vs Boosting": (
        "Bagging trains multiple models independently on random subsets, while boosting trains sequentially to correct errors of previous models."
    ),
    "2. Random Forest Variance": (
        "Random Forest reduces variance by averaging multiple de-correlated decision trees trained on different data subsets."
    ),
    "3. Boosting Weakness": (
        "Boosting is sensitive to noisy data and can overfit if not properly regularized."
    ),
}

# Render every entry as Markdown: the title in bold, then the answer.
# The two spaces before "\n" force a Markdown line break after the title.
for question, answer in theory_ensemble.items():
    rendered = md(f"**{question}**  \n{answer}")
    display(rendered)

### Task 8: Random Forest vs Decision Tree


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded 100-tree random forest on the current training split and
# predict the held-out rows.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_rf = rf.predict(X_test)

# Side-by-side comparison against the previously fitted decision tree `dt`.
dt_accuracy = accuracy_score(y_test, dt.predict(X_test))
rf_accuracy = accuracy_score(y_test, y_rf)
rf_precision = precision_score(y_test, y_rf)
rf_recall = recall_score(y_test, y_rf)

print("Decision Tree Accuracy:", dt_accuracy)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Precision:", rf_precision)
print("Random Forest Recall:", rf_recall)

# Horizontal bar chart of the forest's feature importances, sorted ascending
# so the most important feature ends up at the top of the chart.
feat_imp = pd.Series(rf.feature_importances_, index=X.columns)
feat_imp_sorted = feat_imp.sort_values()
feat_imp_sorted.plot(kind='barh', title='Feature Importances')
plt.show()

### Task 9: Gradient Boosting


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score

# Gradient boosting on the same train/test split as the previous models.
# random_state is fixed so the scores below are reproducible, consistent with
# the seeded RandomForestClassifier used for comparison.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
y_gb = gb.predict(X_test)

print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_gb))
print("F1 Score:", f1_score(y_test, y_gb))