<a href="https://colab.research.google.com/github/ahmerayaz2000/CS351-AI-lab-2022070/blob/main/2022070_lab04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

# Read the Titanic passenger CSV from its public GitHub location
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
titanic_data = pd.read_csv(filepath_or_buffer=url)

# Preview the top of the table
print(titanic_data.head())

# Exploratory plots: one figure per feature, titled via the Axes each seaborn call returns
sns.countplot(data=titanic_data, x='Pclass').set_title('Passenger Class Distribution')
plt.show()

# Age contains missing entries, so they are dropped before building the histogram
known_ages = titanic_data['Age'].dropna()
sns.histplot(known_ages, bins=30, kde=True).set_title('Age Distribution')
plt.show()

sns.countplot(data=titanic_data, x='Sex').set_title('Gender Distribution')
plt.show()

# Per-column count of missing values
print(titanic_data.isna().sum())

# Handle missing values
# The filled columns are assigned back instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form acts on an
# intermediate object, triggers a FutureWarning on pandas 2.x and has no effect
# once copy-on-write is enabled, which would leave the NaNs in place.
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0])

# 'Cabin' is removed entirely rather than imputed
titanic_data = titanic_data.drop(columns=['Cabin'])

# Encode categorical variables
titanic_data['Sex'] = titanic_data['Sex'].map({'male': 0, 'female': 1})
titanic_data = pd.get_dummies(titanic_data, columns=['Embarked'], drop_first=True)

# Split the dataset into features and target variable
X = titanic_data.drop(['Survived', 'Name', 'Ticket', 'PassengerId'], axis=1)
y = titanic_data['Survived']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Normalize numerical features
# The scaler is fitted on the training rows only and then applied to the test
# rows, so the mean/std of the held-out data never influence training
# (fitting on the full table before splitting leaks test-set statistics).
numeric_cols = ['Age', 'Fare']
scaler = StandardScaler()
# Work on explicit copies so the column assignments below do not write into
# views of X.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Implementing k-NN (5 nearest neighbours)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Implementing Decision Tree
# random_state is fixed (same seed as the train/test split) so the fitted tree,
# and therefore the reported metrics, are reproducible from run to run.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

# k-NN Predictions on the held-out set
knn_predictions = knn.predict(X_test)

# Decision Tree Predictions on the held-out set
dtree_predictions = dtree.predict(X_test)

# Scorers applied to each model, in the order the results are unpacked below
_metric_fns = (accuracy_score, precision_score, recall_score, f1_score)

# Score the k-NN predictions against the held-out labels
knn_accuracy, knn_precision, knn_recall, knn_f1 = (
    score(y_test, knn_predictions) for score in _metric_fns
)

# Score the Decision Tree predictions against the held-out labels
dtree_accuracy, dtree_precision, dtree_recall, dtree_f1 = (
    score(y_test, dtree_predictions) for score in _metric_fns
)

# Report both models, each metric rounded to two decimals
print("k-NN Performance:")
print(f"Accuracy: {knn_accuracy:.2f}, Precision: {knn_precision:.2f}, Recall: {knn_recall:.2f}, F1 Score: {knn_f1:.2f}")

print("\nDecision Tree Performance:")
print(f"Accuracy: {dtree_accuracy:.2f}, Precision: {dtree_precision:.2f}, Recall: {dtree_recall:.2f}, F1 Score: {dtree_f1:.2f}")

# For visualization, we will use 'Pclass' and 'Sex'
X_viz = titanic_data[['Pclass', 'Sex']]
y_viz = titanic_data['Survived']

# The classifiers fitted earlier (`knn`, `dtree`) were trained on the full
# feature matrix, so calling predict() on a two-column grid fails with a
# feature-count mismatch. Dedicated two-feature models are fitted here purely
# for plotting. They are fitted on plain arrays so that predicting on the
# NumPy grid does not trigger scikit-learn's feature-name warning.
knn_viz = KNeighborsClassifier(n_neighbors=5)
knn_viz.fit(X_viz.to_numpy(), y_viz)
dtree_viz = DecisionTreeClassifier(random_state=42)
dtree_viz.fit(X_viz.to_numpy(), y_viz)

# Create a meshgrid for visualization (one unit of padding around the data)
x_min, x_max = X_viz['Pclass'].min() - 1, X_viz['Pclass'].max() + 1
y_min, y_max = X_viz['Sex'].min() - 1, X_viz['Sex'].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))


def _plot_decision_boundary(model, subplot_index, title):
    """Draw `model`'s predicted regions over the Pclass/Sex grid in one subplot.

    Uses the module-level grid (`xx`, `yy`) and data (`X_viz`, `y_viz`).
    Returns the grid of predictions, shaped like `xx`.
    """
    plt.subplot(1, 2, subplot_index)
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = model.predict(grid_points).reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.3)
    # Overlay the actual passengers, coloured by survival
    plt.scatter(X_viz['Pclass'], X_viz['Sex'], c=y_viz, edgecolors='k', marker='o')
    plt.title(title)
    plt.xlabel('Pclass')
    plt.ylabel('Sex')
    return Z


plt.figure(figsize=(12, 5))

# Plot decision boundaries for k-NN
Z_knn = _plot_decision_boundary(knn_viz, 1, 'k-NN Decision Boundary')

# Plot decision boundaries for Decision Tree
Z_dtree = _plot_decision_boundary(dtree_viz, 2, 'Decision Tree Decision Boundary')

plt.show()

# Performance metrics visualization: grouped bars, one group per metric
labels = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
knn_scores = [knn_accuracy, knn_precision, knn_recall, knn_f1]
dtree_scores = [dtree_accuracy, dtree_precision, dtree_recall, dtree_f1]

x = np.arange(len(labels))  # the label locations

fig, ax = plt.subplots()
bar_width = 0.35
# Shift each model's bars half a bar-width either side of the tick so the two
# bars for a metric sit next to each other.
rects1 = ax.bar(x - bar_width/2, knn_scores, bar_width, label='k-NN')
rects2 = ax.bar(x + bar_width/2, dtree_scores, bar_width, label='Decision Tree')

# Add labels and title
ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
# All four metrics lie in [0, 1]; a fixed axis keeps the bars comparable
# instead of letting autoscaling exaggerate small differences.
ax.set_ylim(0, 1)
ax.set_title('Model Performance Comparison')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

plt.show()
