# <a id='toc1_'></a>[Titanic Project](#toc0_)

In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [103]:
# Load the training set from the working directory and preview the first rows.
train_data = pd.read_csv("train.csv")
train_data.head()



In [104]:
# Column dtypes and non-null counts of the raw training frame (before imputation).
train_data.info()



In [105]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
train_data.isna().sum()



In [106]:
# Impute missing values in the training frame.
#   Age      -> mean of the observed ages
#   Cabin    -> "N" placeholder category
#   Embarked -> "N" placeholder category
#
# A single DataFrame.fillna with a per-column mapping replaces the previous
# `train_data[col].fillna(..., inplace=True)` calls. Those mutate a column
# selection in place (chained assignment), which emits a FutureWarning on
# pandas 2.x and leaves the frame unchanged under Copy-on-Write.
# Reassigning the result also keeps this cell idempotent on re-run.
train_data = train_data.fillna(
    {
        "Age": train_data["Age"].mean(),
        "Cabin": "N",
        "Embarked": "N",
    }
)

# Verify the imputed columns no longer report missing values.
train_data.isnull().sum()





In [107]:
# Passenger counts per value of "Survived", split by "Sex".
# Keep the returned Axes so the figure can be titled/labelled and so the cell
# does not end on a bare Axes repr.
ax = sns.countplot(data=train_data, x="Survived", hue="Sex")
ax.set(
    title="Passenger count by survival outcome and sex",
    xlabel="Survived",  # presumably 0 = did not survive, 1 = survived — confirm against the data dictionary
    ylabel="Number of passengers",
)
plt.show()





In [108]:
# Re-inspect dtypes and non-null counts now that missing values have been filled.
train_data.info()



In [109]:
# Survival summary per sex:
#   count      - number of non-null "Survived" entries in the group
#   survived   - sum of "Survived" in the group
#   percentage - survived / count, expressed in percent
survivor_by_sex = (
    train_data.groupby("Sex")["Survived"]
    .agg(count="count", survived="sum")
    .assign(percentage=lambda stats: stats["survived"] / stats["count"] * 100)
)

survivor_by_sex



In [110]:
# Bar chart of the survival percentage per sex, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=survivor_by_sex.reset_index(), x="Sex", y="percentage", ax=ax)

# Annotate every bar with its value formatted as a percentage.
for bar_group in ax.containers:
    ax.bar_label(bar_group, fmt="%.1f%%", padding=3)

# Titles and axis labels.
ax.set_title("Survival Rate by Gender")
ax.set_xlabel("Gender")
ax.set_ylabel("Survival Rate (%)")

# Fix the y-axis to the full 0-100 percent range.
ax.set_ylim(0, 100)

plt.show()



In [None]:
# Split the data into a feature matrix (X_train) and a target vector (y_train).

# Drop the target itself plus the free-text columns that are not used as features.
X_train = train_data.drop(columns=["Name", "Survived", "Ticket", "Cabin"])

# PassengerId, when present, is presumably a per-row identifier (confirm against
# the data dictionary). Leaving it in lets the model fit on an arbitrary ID, so
# it is removed from the features. errors="ignore" keeps the cell working if the
# column is absent.
X_train = X_train.drop(columns=["PassengerId"], errors="ignore")

y_train = train_data["Survived"]

X_train.info()

# Object-dtype columns are the ones treated as categorical downstream.
categorical_feature_names = X_train.columns[X_train.dtypes == object].tolist()
print(f"Number of categorical columns: {len(categorical_feature_names)}")
print(f"Categorical columns: {categorical_feature_names}")



## <a id='toc1_1_'></a>[OneHotEncoding for categorical columns](#toc0_)

In [112]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

np.random.seed(42)

# Every object-dtype feature is one-hot encoded; categories not seen during
# fit are ignored at transform time (handle_unknown="ignore").
categorical_columns = [
    column for column in X_train.columns if X_train[column].dtype == object
]
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")

# Encode the categorical columns and pass all remaining columns through untouched.
transform = ColumnTransformer(
    transformers=[("one_hot", one_hot_encoder, categorical_columns)],
    remainder="passthrough",
)

new_x = transform.fit_transform(X_train)


In [113]:
# Compare the feature-matrix shape before and after one-hot encoding.
print(f"Original shape: {X_train.shape}")
print(f"Transformed shape: {new_x.shape}")



## <a id='toc1_2_'></a>[Model Building](#toc0_)

In [114]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the encoded training features.
# random_state is fixed so repeated runs build the same forest; fit() returns
# the estimator itself, so construction and fitting can be chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(new_x, y_train)
model



## <a id='toc1_3_'></a>[Test the model](#toc0_)

In [115]:
# Load the test set from the working directory and inspect dtypes / non-null counts.
test_data = pd.read_csv("test.csv")
test_data.info()



In [9]:
# Impute missing values in the test frame.
#
# Numeric gaps are filled with statistics taken from the TRAINING data rather
# than from the test set, so the test features are prepared with the same
# values the model was fitted around and no test-set information leaks into
# preprocessing. train_data["Age"] has already been mean-imputed at this point,
# and mean imputation leaves the column mean unchanged.
#
# A single DataFrame.fillna with a per-column mapping replaces the previous
# `test_data[col].fillna(..., inplace=True)` calls. Those mutate a column
# selection in place (chained assignment), which emits a FutureWarning on
# pandas 2.x and leaves the frame unchanged under Copy-on-Write.
test_data = test_data.fillna(
    {
        "Age": train_data["Age"].mean(),
        "Cabin": "N",
        "Fare": train_data["Fare"].mean(),
    }
)

# Verify the imputed columns no longer report missing values.
test_data.isnull().sum()

NameError: name 'test_data' is not defined

**Table of contents**<a id='toc0_'></a>    
- [Titanic Project](#toc1_)    
  - [OneHotEncoding for categorical columns](#toc1_1_)    
  - [Model Building](#toc1_2_)    
  - [Test the model](#toc1_3_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [117]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

# Select exactly the columns the transformer was fitted on, in the same order.
# The raw test frame carries extra columns that were dropped from X_train.
# Selecting explicitly avoids relying on ColumnTransformer to ignore them and
# fails with a clear KeyError if an expected feature is missing.
X_test = test_data[X_train.columns]
new_x_test = transform.transform(X_test)

# A bare expression in the middle of a cell is not displayed, so print the shape.
print("Transformed test shape:", new_x_test.shape)

# Predicted "Survived" label for every row of the test set.
y_pred = model.predict(new_x_test)


In [118]:
# Reference labels that the predictions are compared against below.
# NOTE(review): judging by its name, gender_submission.csv looks like a sample /
# baseline submission and not ground-truth outcomes — confirm. If so, the
# scores computed from y_test measure agreement with that baseline, not accuracy.
y_test = pd.read_csv("gender_submission.csv")["Survived"]

In [119]:
# Share of predictions that match the reference labels, shown as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Per-class precision / recall / F1 against the same reference labels.
report_text = classification_report(y_test, y_pred)
print(report_text)




In [120]:
# Side-by-side table of passenger id, reference label and predicted label.
submission_df = pd.read_csv("gender_submission.csv")[["PassengerId"]].assign(
    survived_test=y_test,
    Survived_pred=y_pred,
)

submission_df.head(100)

