In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Titanic training CSV from the Kaggle input directory.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")

# Preview the first rows of the loaded frame.
train_data.head()


In [None]:
# Summary statistics (pandas `describe`) for the training frame.
train_data.describe()

In [None]:
# Load the Titanic test CSV from the Kaggle input directory.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

# Preview the first rows of the loaded frame.
test_data.head()


In [None]:
# Count of missing values per column in the training frame.
print(train_data.isnull().sum())

import seaborn as sns
import matplotlib.pyplot as plt

# How many passengers fall into each value of the target.
sns.countplot(x='Survived', data=train_data)
plt.show()

# Bar plot of Survived against each of these features, one figure per feature.
for feature in ('Sex', 'Pclass'):
    sns.barplot(x=feature, y='Survived', data=train_data)
    plt.show()


In [None]:
# Build an all-numeric view of the training data: drop the listed columns,
# one-hot encode Sex and Embarked, then fill remaining gaps with column medians.
numeric_data = pd.get_dummies(
    train_data.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId']),
    columns=['Sex', 'Embarked'],
)
numeric_data = numeric_data.fillna(numeric_data.median())

# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr_matrix = numeric_data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Hold out 20% of the training data, stratified jointly on Survived, Pclass and Sex.
# A fixed random_state makes the split (and everything downstream) reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_indices, test_indices = next(
    split.split(train_data, train_data[["Survived", "Pclass", "Sex"]])
)

# The splitter returns positional indices, so select with .iloc; .copy() gives
# independent frames that can be modified later without touching train_data.
strat_train_set = train_data.iloc[train_indices].copy()
strat_test_set = train_data.iloc[test_indices].copy()


In [None]:
# Side-by-side panels (left: training split, right: held-out split), each
# overlaying the Survived and Pclass histograms on the same axes.
for panel, subset in enumerate((strat_train_set, strat_test_set), start=1):
    plt.subplot(1, 2, panel)
    subset['Survived'].hist()
    subset['Pclass'].hist()

plt.show()

In [None]:
# Print the column summary (pandas `info`) of the training split.
strat_train_set.info()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

class AgeImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in the ``Age`` column using a ``SimpleImputer``.

    Parameters
    ----------
    strategy : str, default="mean"
        Forwarded unchanged to ``SimpleImputer(strategy=...)``.

    Attributes
    ----------
    imputer_ : SimpleImputer
        The imputer fitted on ``X[['Age']]``; created by ``fit``.
    """

    def __init__(self, strategy="mean"):
        # Store the constructor argument under its own name and do nothing else,
        # so BaseEstimator.get_params(), clone() and repr() can read it back.
        self.strategy = strategy

    def fit(self, X, y=None):
        """Fit the imputer on ``X[['Age']]`` and return ``self``. ``y`` is ignored."""
        self.imputer_ = SimpleImputer(strategy=self.strategy)
        self.imputer_.fit(X[['Age']])
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``Age`` column has its missing values filled."""
        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()
        X['Age'] = self.imputer_.transform(X[['Age']])
        return X


In [None]:
from sklearn.preprocessing import OneHotEncoder

class FeatureEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the ``Embarked`` and ``Sex`` columns and remove the originals.

    Each column gets its own ``OneHotEncoder`` configured with ``drop='first'``
    and dense output.
    """

    def __init__(self):
        self.encoder_embarked = OneHotEncoder(drop='first', sparse_output=False)
        self.encoder_sex = OneHotEncoder(drop='first', sparse_output=False)

    def fit(self, X, y=None):
        """Fit both encoders on their respective column of ``X``; ``y`` is ignored."""
        for encoder, source_col in self._encoders():
            encoder.fit(X[[source_col]])
        return self

    def transform(self, X):
        """Return a new frame with the encoded columns appended and the raw ones removed."""
        encoded_frames = [
            pd.DataFrame(
                encoder.transform(X[[source_col]]),
                columns=encoder.get_feature_names_out([source_col]),
                index=X.index,  # keep row alignment with X
            )
            for encoder, source_col in self._encoders()
        ]
        # Original columns first, then the Embarked dummies, then the Sex dummies.
        combined = pd.concat([X, *encoded_frames], axis=1)
        return combined.drop(columns=["Embarked", "Sex"])

    def _encoders(self):
        """Pair each encoder with the column it handles, in output order."""
        return (
            (self.encoder_embarked, "Embarked"),
            (self.encoder_sex, "Sex"),
        )



In [None]:
class FeatureDropper(BaseEstimator, TransformerMixin):
    """Remove columns that are not passed on to the model.

    Columns that are absent from the input are skipped silently
    (``errors="ignore"``).
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; ``X`` and ``y`` are ignored."""
        # Flag the transformer as fitted so scikit-learn's check_is_fitted
        # (which looks for attributes ending in "_") accepts it; recent
        # scikit-learn releases run that check on a Pipeline's last step.
        self.is_fitted_ = True
        return self

    def transform(self, X):
        """Return ``X`` without the listed columns."""
        # NOTE(review): "N" is not produced anywhere in this notebook —
        # presumably a leftover; harmless because of errors="ignore".
        # NOTE(review): PassengerId is not dropped, so it reaches the model
        # as a feature — confirm that this is intended.
        return X.drop(["Embarked", "Name", "Ticket", "Cabin", "Sex", "N"], axis = 1, errors = "ignore")

In [None]:
from sklearn.pipeline import Pipeline

# Fill missing embarkation ports with 'C' in both splits before encoding.
strat_train_set.loc[:, 'Embarked'] = strat_train_set['Embarked'].fillna('C')
strat_test_set.loc[:, 'Embarked'] = strat_test_set['Embarked'].fillna('C')


# Preprocessing chain: impute Age, one-hot encode Embarked/Sex, drop unused columns.
pipeline = Pipeline([
    ("ageimputer", AgeImputer()),
    ("featureencoder", FeatureEncoder()),
    ("featuredropper", FeatureDropper()),  # was misspelled "FeatureDroper" (NameError)
])


In [None]:
# Fit the preprocessing pipeline on the training split and replace the split
# with its transformed version. Because the name is reused, re-running this
# cell on the already-transformed frame fails: the encoder step looks up the
# 'Embarked' column that the first run removed.
strat_train_set = pipeline.fit_transform(strat_train_set)

In [None]:
# Preview the transformed training split (first rows only, not the whole frame).
strat_train_set.head()

In [None]:
# Column summary of the training split after the pipeline has been applied.
strat_train_set.info()

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the target from the features of the transformed training split.
y = strat_train_set['Survived']
X = strat_train_set.drop(columns=['Survived'])

# Fit the scaler on the training features, then scale those same features.
scaler = StandardScaler().fit(X)
X_data = scaler.transform(X)
y_data = y.to_numpy()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Seed the forest so the search result is reproducible from run to run.
clf = RandomForestClassifier(random_state=42)

# Hyper-parameter combinations to evaluate (4 * 3 * 3 = 36 candidates).
param_grid = [
    {"n_estimators" : [10, 100, 200, 500], "max_depth" : [None, 5, 10], "min_samples_split" : [2, 3, 4]}
]

# 3-fold cross-validated search scored on accuracy; n_jobs=-1 evaluates the
# candidates in parallel on all available cores.
grid_search = GridSearchCV(
    clf,
    param_grid,
    cv=3,
    scoring="accuracy",
    return_train_score=True,
    n_jobs=-1,
)
grid_search.fit(X_data, y_data)

In [None]:
# Pull the winning model and its hyper-parameters out of the finished search.
best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_

In [None]:
# Apply the already-fitted pipeline to the held-out split and replace the split
# with its transformed version. As with the training split, the name is reused,
# so this cell cannot be re-run on the already-transformed frame.
strat_test_set = pipeline.transform(strat_test_set)

In [None]:
# Separate the target from the features of the transformed held-out split.
X_test = strat_test_set.drop(['Survived'], axis=1, errors='ignore')
y_test = strat_test_set['Survived']


# Reuse `scaler`, which was fitted on the training features, instead of fitting
# a new one on the held-out data: the model must see held-out features on the
# same scale it was trained on, and no statistics may be learned from this split.
X_data_test = scaler.transform(X_test)
y_data_test = y_test.to_numpy()



In [None]:
# Score the tuned estimator on the held-out split.
best_estimator.score(X_data_test, y_data_test)

In [None]:
# Fill missing embarkation ports with 'C' in the full training and test frames.
for frame in (train_data, test_data):
    frame.loc[:, 'Embarked'] = frame['Embarked'].fillna('C')

# Re-fit the preprocessing pipeline, this time on the complete training data.
final_data = pipeline.fit_transform(train_data)

In [None]:
# Preview the transformed full training data (first rows only).
final_data.head()

In [None]:
# Despite the names, X_test / y_test here hold the features and target of the
# complete training data (final_data), not a held-out split.
y_test = final_data['Survived']
X_test = final_data.drop(columns=['Survived'], errors='ignore')

# Fit a fresh scaler on the complete training features and scale them.
scaler = StandardScaler().fit(X_test)
X_data_final = scaler.transform(X_test)
y_data_final = y_test.to_numpy()



In [None]:
# Seed the forest so the final model is reproducible from run to run.
final_clf = RandomForestClassifier(random_state=42)

# Same hyper-parameter grid as the first search (4 * 3 * 3 = 36 candidates).
param_grid = [
    {"n_estimators" : [10, 100, 200, 500], "max_depth" : [None, 5, 10], "min_samples_split" : [2, 3, 4]}
]

# 3-fold cross-validated search on the complete training data; n_jobs=-1
# evaluates the candidates in parallel on all available cores.
grid_search = GridSearchCV(
    final_clf,
    param_grid,
    cv=3,
    scoring="accuracy",
    return_train_score=True,
    n_jobs=-1,
)
grid_search.fit(X_data_final, y_data_final)

In [None]:
# Keep the best estimator found by the search over the complete training data.
final_final_clf = grid_search.best_estimator_
# Display the selected estimator.
final_final_clf

In [None]:
# Reload the raw test CSV; this replaces the earlier test_data frame,
# including the Embarked fill that had been applied to it.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

# Preview the first rows only, as the earlier load cells do.
test_data.head()

In [None]:
# Apply the pipeline that was fitted on the complete training data. Using
# transform (not fit_transform) keeps the Age fill value and the one-hot
# categories learned from the training data instead of re-learning them
# from the test set.
final_test_data = pipeline.transform(test_data)
final_test_data.info()

In [None]:
X_final_test = final_test_data
# Fill any remaining gaps with the medians of the same columns in the
# transformed training data (final_data). Forward-filling would make the
# imputed value depend on row order and cannot fill a gap in the first row.
X_final_test = X_final_test.fillna(final_data[X_final_test.columns].median())

# Reuse `scaler`, which was fitted on the complete training features, so the
# test features are scaled exactly as the data the model was trained on.
X_data_final_test = scaler.transform(X_final_test)

In [None]:
# Predict the target for every row of the scaled test features.
predictions = final_final_clf.predict(X_data_final_test)
# Display the predictions.
predictions

In [None]:
# Assemble the submission table (PassengerId plus the predicted Survived value)
# and write it to the Kaggle working directory without the index column.
final_df = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
})
final_df.to_csv("/kaggle/working/predict.csv", index=False)

In [None]:
# Preview the submission table (first rows only).
final_df.head()