<a href="https://www.kaggle.com/code/burakaltunda/titanic-ipynb?scriptVersionId=295696196" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score

# Load Dataset

In [None]:
# Read the two Kaggle Titanic CSV files; the paths are consumed in order, so
# the first frame is bound to train_data and the second to test_data.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

# Print the info() report for the training frame.
train_data.info()

In [None]:
# Display describe() summary statistics for the training frame.
train_data.describe()

# Missing Values

In [None]:
# Per-column count of missing values in the training frame.
train_data.isna().sum()

In [None]:
# Remove the Cabin column from the training frame in place.
train_data.drop(columns=["Cabin"], inplace=True)

In [None]:
# Preview the first 20 rows of the training frame.
train_data.head(n=20)

In [None]:
# Fill missing ages with the mean of the Age column, then re-check the
# per-column missing-value counts.
age_mean = train_data["Age"].mean()
train_data["Age"] = train_data["Age"].fillna(value=age_mean)
train_data.isna().sum()

In [None]:
# Fill missing Embarked values with the column's first mode, then re-check
# the per-column missing-value counts.
embarked_mode = train_data["Embarked"].mode()[0]
train_data["Embarked"] = train_data["Embarked"].fillna(value=embarked_mode)
train_data.isna().sum()

In [None]:
# Pairwise correlation (DataFrame.corr with its default settings) between the
# survival label and the selected columns, drawn as an annotated heatmap.
correlation_columns = ["Survived", "Pclass", "Age", "SibSp", "Parch", "Fare"]
korelasyon_df = train_data[correlation_columns]
korelasyon = korelasyon_df.corr()
sns.heatmap(data=korelasyon, annot=True, fmt=".2f", cmap="coolwarm")

In [None]:
# Name and Ticket are removed together in one in-place call; info() then
# prints the report for the columns that remain.
train_data.drop(columns=["Name", "Ticket"], inplace=True)
train_data.info()

In [None]:
# Remove PassengerId from the training frame in place, then preview the
# first 20 rows of what is left.
train_data.drop(columns=["PassengerId"], inplace=True)
train_data.head(n=20)

# Outliers

In [None]:
# One box plot per feature on a 2x2 grid, used to eyeball outliers.
# Each spec is (grid position, column, box colour, upper y-axis limit);
# a colour of None leaves seaborn's default colour in place.
boxplot_specs = [
    ((0, 0), "Age", None, 100),
    ((0, 1), "SibSp", None, 10),
    ((1, 0), "Parch", "red", 10),
    ((1, 1), "Fare", "green", 500),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# Driving the four panels from the spec table replaces four copy-pasted
# stanzas. Because the cell now ends with a loop rather than a bare
# set_ylim(...) call, the stray "(0.0, 500.0)" repr is no longer displayed.
for position, column, box_color, y_max in boxplot_specs:
    ax = axes[position]
    sns.boxplot(ax=ax, data=train_data, y=column, color=box_color)
    ax.set_title(column)
    ax.set_ylim(0, y_max)

In [None]:
# Show the SibSp / Parch distributions before capping. A notebook cell only
# displays its final expression, so the counts are printed explicitly; as
# bare mid-cell expressions they were computed and then silently discarded.
print(train_data["SibSp"].value_counts())
print(train_data["Parch"].value_counts())

# Collapse the upper tails into a single top bucket: SibSp values above 4
# become 5, and Parch values above 2 become 3.
train_data.loc[train_data["SibSp"] > 4, "SibSp"] = 5
train_data.loc[train_data["Parch"] > 2, "Parch"] = 3

### IQR

In [None]:
# For Age and Fare, report the 1.5 * IQR bounds around the quartiles and the
# number of rows that fall outside them. This loop only prints; it does not
# modify train_data.
check_features = ["Age", "Fare"]
for col in check_features:
    values = train_data[col]
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1
    alt_sinir = Q1 - (1.5 * IQR)
    ust_sinir = Q3 + (1.5 * IQR)

    outside_bounds = (values > ust_sinir) | (values < alt_sinir)
    aykiri_sayisi = int(outside_bounds.sum())
    print(f"---{col} Analizi---")
    print(f"Üst Sınır: {ust_sinir} Alt Sınır: {alt_sinir}")
    print(f"Aykırı Yolcu Sayısı: {aykiri_sayisi}")

# Ages above 55 are set to 55. The threshold is a literal, not the upper
# bound printed by the loop above.
train_data.loc[train_data["Age"] > 55, "Age"] = 55

# Fares above the 95th percentile are set to that percentile. fare_95 stays
# a notebook-level name because the test-set preprocessing reuses it.
fare_95 = train_data["Fare"].quantile(0.95)
above_fare_cap = train_data["Fare"] > fare_95
print(int(above_fare_cap.sum()))

train_data.loc[above_fare_cap, "Fare"] = fare_95

In [None]:
# Count the rows whose Fare is exactly 0, then overwrite those fares with the
# median Fare of the row's Pclass group. The right-hand side is a full-length
# Series; .loc assignment aligns it on the index, so only masked rows change.
zero_fare_mask = train_data["Fare"] == 0
fare_is_zero = int(zero_fare_mask.sum())
print(fare_is_zero)

pclass_median_fare = train_data.groupby("Pclass")["Fare"].transform("median")
train_data.loc[zero_fare_mask, "Fare"] = pclass_median_fare
train_data.describe()

# Train Test Split

In [None]:
# Select the feature columns and the Survived target, then hold out 25% of
# the rows with a fixed random_state.
feature_columns = ["Pclass", "Age", "SibSp", "Parch", "Fare", "Sex", "Embarked"]
X = train_data[feature_columns]
y = train_data["Survived"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Encoding

In [None]:
# Quick look at the frame before encoding. DataFrame.info() writes its report
# itself and returns None, so it is called directly; wrapping it in print()
# emitted a stray "None" line after the report.
train_data.info()
print(train_data.head(20))
print(train_data["Sex"].value_counts())
print(train_data["Embarked"].unique())

# Encode Sex as 0/1 using one shared mapping, so both splits are guaranteed
# to use the same codes.
sex_codes = {"male": 0, "female": 1}
X_train["Sex"] = X_train["Sex"].map(sex_codes)
X_test["Sex"] = X_test["Sex"].map(sex_codes)

# One-hot encode Embarked. The encoder is fitted on the training split only
# and then applied to the hold-out split; drop="first" omits one category
# column. `ohe` stays a notebook-level name because the test-set
# preprocessing reuses the fitted encoder.
ohe = OneHotEncoder(sparse_output=False, drop="first")
emb_encoded_X_train = ohe.fit_transform(X_train[["Embarked"]])
emb_encoded_X_test = ohe.transform(X_test[["Embarked"]])

# Wrap the encoded arrays in frames that carry each split's own row index, so
# the axis=1 concat below lines up row for row.
embarked_columns = ohe.get_feature_names_out(["Embarked"])
emb_df_X_train = pd.DataFrame(emb_encoded_X_train, columns=embarked_columns, index=X_train.index)
emb_df_X_test = pd.DataFrame(emb_encoded_X_test, columns=embarked_columns, index=X_test.index)

# Replace the raw Embarked column with its encoded columns.
X_train = pd.concat([X_train.drop("Embarked", axis=1), emb_df_X_train], axis=1)
X_test = pd.concat([X_test.drop("Embarked", axis=1), emb_df_X_test], axis=1)

# Random Forest Model

In [None]:
# Random forest with 100 trees, depth limited to 5 and a fixed random_state,
# fitted on the encoded training split. fit() returns the estimator itself,
# so construction and fitting are chained.
rf_params = {"n_estimators": 100, "max_depth": 5, "random_state": 42}
rf_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Predictions for the hold-out split, consumed by the evaluation cell.
y_pred = rf_model.predict(X_test)

# Evaluation

In [None]:
# Hold-out metrics: accuracy, confusion matrix and classification report.
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(f"Accuracy: {acc}")
# Each labelled result is printed as a header line followed by its value.
for header, result in (("CM:", cm), ("CR:", cr)):
    print(header)
    print(result)

In [None]:
# Compare the model's score on the training split with its score on the
# hold-out split; each label and value goes on its own line.
train_acc = rf_model.score(X_train, y_train)
test_acc = rf_model.score(X_test, y_test)
print("Train Acc", train_acc, "Test Acc", test_acc, sep="\n")

# Test Dataset Preprocessing

In [None]:
# Apply the training-time preprocessing steps to the Kaggle test set.
# Relies on `fare_95` and the fitted `ohe` created in earlier cells.

# Keep the passenger ids for the submission file, then drop the columns that
# are not used as features.
pess_id = test_data["PassengerId"]
test_data.drop(columns=["Cabin", "Name", "Ticket", "PassengerId"], inplace=True)

# Missing values.
# NOTE(review): these fills use statistics of the test set itself, whereas
# the Fare cap below reuses the training-set percentile — confirm this
# asymmetry is intended.
test_data["Age"] = test_data["Age"].fillna(test_data["Age"].mean())
test_data["Embarked"] = test_data["Embarked"].fillna(test_data["Embarked"].mode()[0])
test_data["Fare"] = test_data["Fare"].fillna(test_data["Fare"].median())

# Same caps as applied to the training data.
test_data.loc[test_data["SibSp"] > 4, "SibSp"] = 5
test_data.loc[test_data["Parch"] > 2, "Parch"] = 3
test_data.loc[test_data["Age"] > 55, "Age"] = 55
test_data.loc[test_data["Fare"] > fare_95, "Fare"] = fare_95

# Zero fares are replaced by the median Fare of the row's Pclass group; the
# full-length right-hand side is aligned on the index by .loc.
test_data.loc[test_data["Fare"] == 0, "Fare"] = test_data.groupby("Pclass")["Fare"].transform("median")

test_data["Sex"] = test_data["Sex"].map({"male": 0, "female": 1})

# One-hot encode Embarked with the already-fitted encoder. The encoded frame
# is given test_data's own index, matching how the train/hold-out frames are
# built: the axis=1 concat aligns rows on index labels, so a fresh 0..n-1
# index would only line up while test_data still has its default index.
emb_encoded_test = ohe.transform(test_data[["Embarked"]])
emb_df_test = pd.DataFrame(
    emb_encoded_test,
    columns=ohe.get_feature_names_out(["Embarked"]),
    index=test_data.index,
)

test_data = pd.concat([test_data.drop("Embarked", axis=1), emb_df_test], axis=1)

# Sanity check: remaining missing values and a preview of the final frame.
print(test_data.isnull().sum())
print(test_data.head(20))


# Final Predictions

In [None]:
# Select the test-set columns by the feature names recorded on the fitted
# model, predict, and write the PassengerId / Survived submission file.
model_columns = rf_model.feature_names_in_
X_test_final = test_data.loc[:, model_columns]
final_predictions = rf_model.predict(X_test_final)

submission = pd.DataFrame(
    data={"PassengerId": pess_id, "Survived": final_predictions}
)
submission.to_csv("submission.csv", index=False)