# Project 2

 ## Kabore Titanic Data Features


### Project Overview


### Section 1. Import and Inspect the Data

In [None]:
# All imports should be at the top of the notebook

import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning tools
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

In [None]:
# Load the Titanic passenger data through seaborn's example-dataset loader.
# The cells below read from and modify this `titanic` object.
titanic = sns.load_dataset('titanic')

### Display basic information about the dataset using the info() method.



In [None]:
# Print the info() summary of the raw dataset, before any cleaning is applied.
titanic.info()


### Display the first 10 rows

In [None]:
# Show the first 10 rows. Leaving the frame as the cell's last expression lets
# the notebook render it as a table; print() would fall back to plain text.
titanic.head(10)


Check for missing values using the `isnull()` method

In [None]:
# Count the missing (null) entries in each column.
titanic.isnull().sum()


## Summary statistics

In [None]:
# Summary statistics as returned by describe(). Left as the cell's last
# expression so the notebook renders a table instead of printed text.
titanic.describe()


Check for correlations using the corr() method 

In [None]:
# Pairwise correlations; numeric_only=True restricts the calculation to the
# numeric columns. Left as the last expression so it renders as a table.
titanic.corr(numeric_only=True)


## Section 2. Data Exploration and Preparation


### 2.1 Explore Data Patterns and Distributions


### Create a scatter matrix. 



In [None]:
# Scatter matrix of three numeric columns.
# scatter_matrix is reached through the already-imported `pd` namespace, so this
# cell needs no import of its own.
attributes = ['age', 'fare', 'pclass']
pd.plotting.scatter_matrix(titanic[attributes], figsize=(10, 10))
# Render the figure explicitly so the returned array of Axes is not echoed
# as the cell output.
plt.show()

### Create a scatter plot of age vs fare, colored by gender:

In [None]:
# all imports get moved to the top - import each only once
import matplotlib.pyplot as plt


In [None]:
# Colour code per passenger: 0 where sex is 'male', 1 otherwise.
gender_codes = np.where(titanic['sex'] == 'male', 0, 1)

fig, ax = plt.subplots()
ax.scatter(titanic['age'], titanic['fare'], c=gender_codes)
ax.set_xlabel('Age')
ax.set_ylabel('Fare')
ax.set_title('Age vs Fare by Gender')
plt.show()

Reference code for a scatter plot of age vs fare, colored by gender:

```python
plt.scatter(titanic['age'], titanic['fare'], c=titanic['sex'].apply(lambda x: 0 if x == 'male' else 1))
plt.xlabel('Age')
plt.ylabel('Fare')
plt.title('Age vs Fare by Gender')
plt.show()
```

In [None]:
# Create a histogram of age:
# (the caption has to be a comment: bare prose inside a code cell is a SyntaxError)
sns.histplot(titanic['age'], kde=True)  # kde=True is passed to overlay a density curve
plt.title('Age Distribution')
plt.show()

Create a count plot for class and survival:



In [None]:
# Passenger counts per ticket class, with bars split by the `survived` value.
class_survival_ax = sns.countplot(data=titanic, x='class', hue='survived')
class_survival_ax.set_title('Class Distribution by Survival')
plt.show()

## Reflection 2.1:



## 2.2 Handle Missing Values and Clean Data



In [None]:
# Impute missing ages with the median age.
# Assign the result back to the column: fillna(inplace=True) on
# `titanic['age']` is chained assignment, which pandas warns about and which
# no longer updates `titanic` once Copy-on-Write is active.
titanic['age'] = titanic['age'].fillna(titanic['age'].median())


In [None]:
# Impute missing embarkation towns with the most frequent town
# (mode() returns a Series; [0] picks its first entry).
# Assigned back to the column for the same reason as any column-level fill:
# fillna(inplace=True) on `titanic['embark_town']` is chained assignment and
# does not reliably modify `titanic`.
titanic['embark_town'] = titanic['embark_town'].fillna(titanic['embark_town'].mode()[0])



## 2.3 Feature Engineering



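Create a new feature, `family_size` (siblings/spouses + parents/children + the passenger). Note: this snippet sits in a Markdown cell, so it is displayed but not executed; the column must be created in a code cell before it is used.

```python
titanic['family_size'] = titanic['sibsp'] + titanic['parch'] + 1
```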



### Convert categorical data to numeric:



In [None]:
# Encode the categorical columns as numbers for modelling.
# Series.map() turns every value that is not a key of the mapping into NaN, so
# mapping an already-encoded column (e.g. when this cell is re-run) would wipe
# it out. Only columns that are still non-numeric are encoded, which makes the
# cell safe to execute more than once.
# Values absent from a mapping (including missing 'embarked' entries) stay NaN.
CATEGORY_CODES = {
    'sex': {'male': 0, 'female': 1},
    'embarked': {'C': 0, 'Q': 1, 'S': 2},
}

for column, codes in CATEGORY_CODES.items():
    if not pd.api.types.is_numeric_dtype(titanic[column]):
        titanic[column] = titanic[column].map(codes)



### Create a binary feature for 'alone':



In [None]:
# Cast the existing `alone` column to integers so it can be used as a numeric
# feature (presumably a boolean flag becoming 0/1; confirm against the dataset).
titanic['alone'] = titanic['alone'].astype(int)



## Reflection 2.3



## Section 3. Feature Selection and Justification


### 3.1 Choose features and target



### 3.2 Define X and y



In [None]:
# `family_size` is only written out in a Markdown cell earlier in the notebook,
# so it is never created on a top-to-bottom run. Build it here so the feature
# selection below does not raise a KeyError; re-running gives the same values.
titanic['family_size'] = titanic['sibsp'] + titanic['parch'] + 1

# Features (model inputs) and target (the column to predict).
feature_columns = ['age', 'fare', 'pclass', 'sex', 'family_size']
X = titanic[feature_columns]
y = titanic['survived']

## Reflection 3:



## Section 4. Splitting


### Basic Train/Test split 


In [None]:
# train_test_split returns a train/test pair for every array passed in, so
# passing X and y yields four objects; unpacking them into two names raises
# ValueError. train_set/test_set hold the feature rows, y_train/y_test the
# matching target values.
# (train_test_split is already imported in the imports cell at the top.)
train_set, test_set, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

print('Train size:', len(train_set))
print('Test size:', len(test_set))

### Stratified Train/Test split


In [None]:
# Stratified split: split(X, y) stratifies on the target `y`.
# StratifiedShuffleSplit comes from the imports cell at the top of the notebook.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123)

# n_splits=1, so the generator yields exactly one (train, test) pair of
# positional indices; take it directly instead of looping.
train_indices, test_indices = next(splitter.split(X, y))

train_set, test_set = X.iloc[train_indices], X.iloc[test_indices]
# Keep the matching target rows too, so the split can be used for modelling.
y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

print('Train size:', len(train_set))
print('Test size:', len(test_set))

### Compare Results


In [None]:
# The first line reports the distribution of the target `y`, so the train and
# test lines must report the same quantity; comparing against the `pclass`
# feature mixes two unrelated distributions.
# train_set and test_set are row subsets of X and keep its index, which is
# shared with y, so .loc selects the matching target values.
print("Original Class Distribution:\n", y.value_counts(normalize=True))
print("Train Set Class Distribution:\n", y.loc[train_set.index].value_counts(normalize=True))
print("Test Set Class Distribution:\n", y.loc[test_set.index].value_counts(normalize=True))

## Reflection 4:

