In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the passenger records from disk
TITANIC_CSV = 'titanic.csv'
titanic_df = pd.read_csv(TITANIC_CSV)

# Preview the leading rows, then summarise columns, dtypes and non-null counts
print(titanic_df.head())
titanic_df.info()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Data cleaning
# Show the per-column missing-value counts. A bare expression that is not the
# last line of a cell is silently discarded, so print it explicitly.
print(titanic_df.isnull().sum())

# Fill missing embarkation ports with the most frequent one. Assigning the
# result back avoids fillna(inplace=True) on a column selection, which may act
# on a temporary copy and leave titanic_df unchanged.
most_common_port = titanic_df['Embarked'].mode()[0]
titanic_df['Embarked'] = titanic_df['Embarked'].fillna(most_common_port)
titanic_df = titanic_df.drop_duplicates()

# Outlier detection
# Skip the first column (PassengerId in this dataset) and box-plot only the
# numeric columns; text columns cannot be drawn on a numeric axis.
numeric_cols = titanic_df.iloc[:, 1:].select_dtypes(include='number')
sns.boxplot(data=numeric_cols)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Select relevant features
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Take an explicit copy so the assignments below modify df itself instead of
# a view/slice of titanic_df (which triggers SettingWithCopyWarning).
df = titanic_df[features].copy()

# Convert categorical variables
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Handle missing values
# Assign the filled column back rather than calling fillna(inplace=True) on a
# column selection, which may fill a temporary object and leave df untouched.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())
# Drop any row that still has a gap (e.g. a category the maps above did not cover)
df = df.dropna()

### One-Hot Encoding
One-hot encoding is a technique for representing categorical variables in a numeric form that a machine-learning model can work with. Let's take a look at the "Sex" column.

In [None]:
# Encoding categorical data
# Only encode columns that are still present, so re-running this cell after
# 'Embarked'/'Sex' have already been expanded does not raise a KeyError.
categorical_cols = [col for col in ['Embarked', 'Sex'] if col in titanic_df.columns]
if categorical_cols:
    titanic_df = pd.get_dummies(titanic_df, columns=categorical_cols)
# Re-index the class labels as consecutive integers starting at 0
titanic_df['Pclass'] = LabelEncoder().fit_transform(titanic_df['Pclass'])

# Check the resulting dataframe structure
# (inspect titanic_df, the frame that was just encoded and that is modelled later)
print(titanic_df.head())

array(['male', 'female'], dtype=object)

Machine learning classifiers don't know how to handle strings. As a result, you need to convert string-valued (categorical) columns into a numeric representation. There are two main ways to go about this:

Label Encoding: Assigning, for example, 0 for "male" and 1 for "female". The problem here is it intrinsically makes one category "larger than" the other category.

One-hot encoding: Assigning, for example, [1, 0] for "male" and [0, 1] for "female". In this case, each value becomes an array of size (n_categories,) with a 1 at the index of its category and 0 elsewhere. In pandas, this shows up as extra columns. For example, rather than having a single "Sex" column, there would be a "Sex_male" and a "Sex_female" column. Then, if the person is male, the row has a 1 in the "Sex_male" column and a 0 in the "Sex_female" column.

There is a nice and easy function that does this in pandas: `pd.get_dummies()`

Now, we do the same to the "Embarked" column.

In [None]:
# Split the data
# Identifier and free-text columns are not usable features: the string columns
# would make DecisionTreeClassifier.fit fail when converting X to floats, and
# PassengerId is only a row identifier.
non_feature_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
X = titanic_df.drop(columns=['Survived', *non_feature_cols], errors='ignore')
# Safety net: keep only numeric/boolean columns in case another text column remains
X = X.select_dtypes(include=['number', 'bool'])
y = titanic_df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fill remaining gaps (e.g. Age) with medians learned from the training split
# only, so no information from the test rows leaks into the features. Not
# every scikit-learn version accepts NaN in a decision tree's input.
numeric_medians = X_train.select_dtypes(include='number').median()
X_train = X_train.fillna(numeric_medians)
X_test = X_test.fillna(numeric_medians)

In [None]:
# Train decision tree model
# A fixed random_state makes the fitted tree, and every number derived from
# it, repeatable across runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Plot decision tree
plt.figure(figsize=(12, 8))
# Pass the feature names as a plain list; some scikit-learn versions reject a
# pandas Index for this parameter.
tree.plot_tree(model, filled=True, feature_names=list(X_train.columns))
plt.show()

In [None]:
# Score the fitted tree on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Test Set Accuracy: {accuracy:.2f}')

In [None]:
# Experiment with different max depths
# Hyperparameter tuning (max depth)
# NOTE(review): depth is compared on the test split here, so a depth chosen
# from these numbers will look optimistic on that same split. A separate
# validation split or cross-validation would give an unbiased estimate.
max_depths = range(2, 11)
train_accuracies = []
test_accuracies = []

for depth in max_depths:
    # Use a dedicated name so the baseline `model` fitted earlier is not
    # overwritten, and fix the seed so the sweep is repeatable.
    depth_model = DecisionTreeClassifier(max_depth=depth, random_state=42)
    depth_model.fit(X_train, y_train)
    train_pred = depth_model.predict(X_train)
    test_pred = depth_model.predict(X_test)
    train_accuracies.append(accuracy_score(y_train, train_pred))
    test_accuracies.append(accuracy_score(y_test, test_pred))

In [None]:
# Draw both accuracy curves against tree depth on a single axes
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(max_depths, train_accuracies, marker='o', label='Training Accuracy')
ax.plot(max_depths, test_accuracies, marker='o', label='Test Accuracy')
ax.set_xlabel('Max Depth')
ax.set_ylabel('Accuracy')
ax.set_title('Training and Test Accuracies vs. Max Depth')
ax.legend()
ax.grid()
plt.show()

In [None]:
# Analyze line shapes and interpret results
print("Line Shape Analysis:")
depth_values = list(max_depths)

# Training curve: compare the two ends of the sweep rather than only the last
# two points, which are dominated by noise.
if train_accuracies[-1] >= train_accuracies[0]:
    print("Training accuracy rises or stays flat as max_depth grows: deeper trees fit the training data more closely.")
else:
    print("Training accuracy falls as max_depth grows, which is unexpected for a decision tree and worth investigating.")

# Test curve: locate the first depth that reaches the best test accuracy.
peak_idx = test_accuracies.index(max(test_accuracies))
peak_depth = depth_values[peak_idx]
if peak_idx == len(test_accuracies) - 1:
    # The best score sits on the edge of the range, so the optimum may lie beyond it.
    print(f"Test accuracy is highest at the largest depth tried (max_depth={peak_depth}); a wider depth range may be needed to find the optimum.")
else:
    print(f"Test accuracy peaks at max_depth={peak_depth} and does not improve beyond it, indicating overfitting at greater depths.")

# A large train/test gap at the deepest setting is the usual sign of overfitting.
final_gap = train_accuracies[-1] - test_accuracies[-1]
print(f"Train/test accuracy gap at max_depth={depth_values[-1]}: {final_gap:.2f}")

In [None]:
# Pick the depth with the highest test accuracy (the first one wins on ties)
best_depth = max_depths[int(np.argmax(test_accuracies))]
print(f'Best Max Depth: {best_depth}')

In [None]:
# Train final model with best max depth
# Fix the seed so the final tree and its reported accuracy are repeatable.
final_model = DecisionTreeClassifier(max_depth=best_depth, random_state=42)
final_model.fit(X_train, y_train)

In [None]:
# Score the depth-limited final tree on the held-out split
y_final_pred = final_model.predict(X_test)
final_accuracy = accuracy_score(y_true=y_test, y_pred=y_final_pred)
print(f'Final Test Set Accuracy: {final_accuracy:.2f}')