Salary Dataset

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Load the salary table from a CSV file and preview it.
salary_csv_path = r"salaries.csv"
salary_data = pd.read_csv(salary_csv_path)
# Last expression of the cell, so the notebook renders the first rows.
salary_data.head()

In [None]:
# Replace the values of each categorical column with integer codes.
# One fitted encoder is kept per column (keyed by column name) so the codes
# can be mapped back to the original values later.
categorical_columns = ['company', 'job', 'degree']
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, le in label_encoders.items():
    salary_data[col] = le.fit_transform(salary_data[col])

In [None]:
# Split the frame into the feature matrix (the three encoded columns)
# and the target column.
salary_features = ['company', 'job', 'degree']
salary_target = 'salary_more_than_100k'
X_salary = salary_data[salary_features]
y_salary = salary_data[salary_target]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE: X_train / X_test / y_train / y_test are rebound by the Titanic split
# further down, so the salary cells that use them must run before that one.
salary_split = train_test_split(
    X_salary,
    y_salary,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = salary_split

In [None]:
# Decision tree that chooses splits by entropy; seeded so refits give the same tree.
salary_tree_params = {'criterion': 'entropy', 'random_state': 42}
salary_model = DecisionTreeClassifier(**salary_tree_params)
# fit() is the cell's last expression, so the notebook displays the fitted estimator.
salary_model.fit(X_train, y_train)

In [None]:
# Score the salary model on the held-out rows.
salary_predictions = salary_model.predict(X_test)

salary_accuracy = accuracy_score(y_test, salary_predictions)
print("Salary Prediction Accuracy:", salary_accuracy)

# Per-class precision / recall / F1 summary.
salary_report = classification_report(y_test, salary_predictions)
print(salary_report)

In [None]:
# Draw the fitted salary decision tree.
#
# Feature names are read from the feature frame instead of being retyped, so
# they cannot drift from the columns the model was trained on.
# Class labels use the same wording as the salary confusion matrix
# ('<=100k' / '>100k'): the target column is "salary_more_than_100k", so the
# complementary class is "at most 100k", not "strictly less than 100k".
# NOTE(review): class_names are applied in the order of salary_model.classes_;
# this assumes the "not more than 100k" class sorts first (e.g. 0/1) - confirm.
plt.figure(figsize=(12, 8))
plot_tree(
    salary_model,
    feature_names=list(X_salary.columns),
    class_names=['<=100k', '>100k'],
    filled=True,
    rounded=True,
    fontsize=10,
)
plt.show()

In [None]:
# Confusion matrix for the salary model on the held-out rows.
# Passing the model's own class list fixes the row/column order of the matrix.
salary_class_order = salary_model.classes_
sal_cm = confusion_matrix(y_test, salary_predictions, labels=salary_class_order)

sal_cm_display = ConfusionMatrixDisplay(
    confusion_matrix=sal_cm,
    display_labels=['<=100k', '>100k'],
)
sal_cm_display.plot()
plt.title('Initial Salary Model Confusion Matrix')
plt.show()

Titanic Dataset

In [None]:
# Load the Titanic table from a CSV file.
titanic_data = pd.read_csv(r"titanic.csv")

# Remove the columns that are not used as model features.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
titanic_data = titanic_data.drop(columns=unused_columns)

In [None]:
# Handle missing values: 'Age' gets the column median, 'Embarked' gets the
# column's most frequent value (first entry of mode()).
age_median = titanic_data['Age'].median()
embarked_mode = titanic_data['Embarked'].mode()[0]

titanic_data['Age'] = titanic_data['Age'].fillna(age_median)
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(embarked_mode)

In [None]:
# Replace the values of 'Sex' and 'Embarked' with integer codes.
# The fitted encoders are added to the existing label_encoders dict (created
# in the salary section), keyed by column name.
for col in ('Sex', 'Embarked'):
    le = LabelEncoder().fit(titanic_data[col])
    titanic_data[col] = le.transform(titanic_data[col])
    label_encoders[col] = le

In [None]:
# Every column except the outcome is a feature; 'Survived' is the target.
titanic_target = 'Survived'
X_titanic = titanic_data.drop(columns=[titanic_target])
y_titanic = titanic_data[titanic_target]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the salary split.
titanic_split = train_test_split(
    X_titanic,
    y_titanic,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = titanic_split

In [None]:
# Decision tree that chooses splits by entropy; seeded so refits give the same tree.
titanic_tree_params = {'criterion': 'entropy', 'random_state': 42}
titanic_model = DecisionTreeClassifier(**titanic_tree_params)
# fit() is the cell's last expression, so the notebook displays the fitted estimator.
titanic_model.fit(X_train, y_train)

In [None]:
# Score the Titanic model on the held-out rows.
titanic_predictions = titanic_model.predict(X_test)

titanic_accuracy = accuracy_score(y_test, titanic_predictions)
print("Titanic Survival Prediction Accuracy:", titanic_accuracy)

# Per-class precision / recall / F1 summary.
titanic_report = classification_report(y_test, titanic_predictions)
print(titanic_report)

In [None]:
# Draw the top of the fitted Titanic decision tree.
#
# feature_names is passed as a plain list: plot_tree documents it as a list of
# strings, and some scikit-learn releases (1.3.x) validate the argument type
# and reject a pandas Index with InvalidParameterError.
# max_depth=3 only limits how many levels are drawn; the fitted model itself
# is not pruned.
# NOTE(review): class_names are applied in the order of titanic_model.classes_;
# this assumes 'Survived' is coded 0 = died, 1 = survived - confirm.
plt.figure(figsize=(20, 15))
plot_tree(
    titanic_model,
    feature_names=list(X_titanic.columns),
    class_names=['Died', 'Survived'],
    filled=True,
    rounded=True,
    fontsize=12,
    max_depth=3,
)
plt.show()

In [None]:
# Confusion matrix for the Titanic model on the held-out rows.
# Passing the model's own class list fixes the row/column order of the matrix.
titanic_class_order = titanic_model.classes_
titanic_cm = confusion_matrix(y_test, titanic_predictions, labels=titanic_class_order)

titanic_cm_display = ConfusionMatrixDisplay(
    confusion_matrix=titanic_cm,
    display_labels=['Died', 'Survived'],
)
titanic_cm_display.plot()
plt.title('Initial Titanic Model Confusion Matrix')
plt.show()