In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

# Importing data

In [None]:
# Read the Titanic training and test CSV files into DataFrames.
df_train = pd.read_csv('../input/titanic/train.csv')
df_test = pd.read_csv('../input/titanic/test.csv')

# Exploring data

In [None]:
# Summarise the test set: column names, non-null counts and dtypes.
df_test.info()

In [None]:
# Summarise the training set: column names, non-null counts and dtypes.
df_train.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the training columns.
df_train.describe()

In [None]:
# Keep only the training rows whose Survived flag equals 1; the plots below use this subset.
df_plot = df_train.loc[df_train['Survived'] == 1]

## How many people survived, by sex?

In [None]:
# Bar chart with one bar per value of 'Sex', counting the rows of df_plot.
sns.countplot(data=df_plot, x='Sex')
plt.title('Survived per Sex')


## Which passenger class had the most survivors?

In [None]:
# Count survivors per passenger class. The survivor subset (df_plot) is plotted,
# not the whole training set, so the bars match the 'Survived per Pclass' title.
sns.countplot(data=df_plot, x='Pclass')
plt.title('Survived per Pclass')

## How many siblings and spouses did the survivors have aboard?

In [None]:
# Count survivors per SibSp value. The survivor subset (df_plot) is plotted,
# not the whole training set, so the bars match the 'Survived per ...' title.
sns.countplot(data=df_plot, x='SibSp')
plt.title('Survived per Siblings and Spouses amount')

## How many parents and children did the survivors have aboard?

In [None]:
# Count survivors per Parch value. The survivor subset (df_plot) is plotted,
# not the whole training set, so the bars match the 'Survived per Parch' title.
sns.countplot(data=df_plot, x='Parch')
plt.title('Survived per Parch')

## What is the age distribution of the survivors?

In [None]:
# Histogram of the ages in df_plot with the mean age marked by a red vertical line.
# The mean is computed once and reused for both the line position and its label.
survivor_mean_age = df_plot.Age.mean()

sns.displot(df_plot.Age)
# Attach the label to the line itself and let legend() collect it: matplotlib
# discourages the labels-only legend([...]) form because the pairing between
# labels and artists is then only implicit in their order.
plt.axvline(survivor_mean_age, color='red', label=f'mean = {survivor_mean_age:.1f}')
plt.legend()
plt.xticks(rotation=90)
plt.title('Distribution of Survived per Age')
plt.show()

## Splitting data

In [None]:
# Separate the target column ('Survived') from the remaining feature columns.
y = df_train['Survived']
x = df_train.drop(columns='Survived')

In [None]:
# Hold out 25% of the rows for validation; the fixed random_state makes the split repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

## Categorical and numerical Columns

In [None]:
# Columns with dtype 'object' are treated as categorical features.
cat_cols_train = [col for col in x_train.columns if x_train[col].dtype == 'object']

# Columns with dtype int64 or float64 are treated as numerical features.
# A dtype cannot be both at once, so the test must be a membership (either/or)
# check; requiring both would always yield an empty list and leave the
# preprocessor with no numerical columns.
num_cols_train = [col for col in x_train.columns
                  if x_train[col].dtype in ('int64', 'float64')]

In [None]:
# Same column classification for the test set. Both the column names and the
# dtypes are read from df_test itself, so the result describes the test frame.
cat_cols_test = [col for col in df_test.columns if df_test[col].dtype == 'object']

# Numerical means int64 or float64 (either one; a dtype can never be both).
num_cols_test = [col for col in df_test.columns
                 if df_test[col].dtype in ('int64', 'float64')]

## Preprocessing for numerical and categorical data

In [None]:
# Numerical columns: replace missing values with the most frequent value of the column.
numerical_transformer = SimpleImputer(strategy='most_frequent')

# Categorical columns: impute with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories not seen during fit.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column list to its transformer. No `remainder` is given, so columns
# that appear in neither list are not passed on to the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols_train),
        ('cat', categorical_transformer, cat_cols_train),
    ]
)

# Gradient-boosted tree classifier with 100 boosting rounds.
model = XGBClassifier(n_estimators=100, use_label_encoder=False)

# Chain preprocessing and the classifier so fit/predict apply both in order.
pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)


In [None]:
# Fit the preprocessing steps and the classifier on the training split.
pipe.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out validation split.
preds = pipe.predict(x_valid)

In [None]:
# Score the validation predictions, then report both metrics rounded to two decimals.
valid_accuracy = accuracy_score(y_valid, preds)
valid_f1 = f1_score(y_valid, preds)
print(f'Accuracy score: {valid_accuracy:.2f}\nF1 score:\t{valid_f1:.2f}')

In [None]:
# Predict labels for the test set with the fitted pipeline.
pred_submission = pipe.predict(df_test)

In [None]:
# Read gender_submission.csv into df_sub and display it as the cell output.
df_sub = pd.read_csv('../input/titanic/gender_submission.csv')
df_sub

In [None]:
# Build the submission table from the test set's own PassengerId values, so each
# prediction is paired with the row it was computed from rather than with the row
# order of a separately loaded file. Constructing a fresh frame (instead of
# mutating and re-indexing an existing df_sub) also keeps this cell re-runnable:
# calling set_index('PassengerId') on an already re-indexed frame raises KeyError.
df_sub = (
    pd.DataFrame({
        'PassengerId': df_test['PassengerId'],
        'Survived': pred_submission.astype(int),  # integer labels in the CSV
    })
    .set_index('PassengerId')
)

# PassengerId is the index, so it is written as the first CSV column.
df_sub.to_csv('submission.csv')
df_sub