In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input tree,
# then print the paths one per line in walk order.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Read the training split into a DataFrame and preview its first five rows.
train_file = "/kaggle/input/titanic/train.csv"
train = pd.read_csv(filepath_or_buffer=train_file)
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Number of missing values in each column of the training frame.
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score
# Display the training frame's dimensions as a (rows, columns) tuple.
train.shape

(891, 12)

In [5]:
# Column groups used by the preprocessing steps:
#   numerical_features   -> standardized
#   categorical_features -> ordinal-encoded
#   drop_features        -> removed before modelling
numerical_features, categorical_features, drop_features = (
    ['Age', 'Fare'],
    ['Sex', 'Embarked'],
    ['Cabin', 'Ticket', 'Name'],
)

In [6]:
# Fill the 2 missing Embarked values with the most frequent port.
# fillna() returns a new Series, so the result must be assigned back; mode() returns
# a Series (ties are possible), so take its first element to fill with a scalar
# rather than aligning on index.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

# Replace missing Age values with the mean age of the training set.
# Assigning the result avoids the chained in-place fillna on an attribute-accessed column.
train['Age'] = train['Age'].fillna(train['Age'].mean())

# Separate the features from the target column.
X_train = train.drop(columns=['Survived'])
y_train = train['Survived']
X_train.shape, y_train.shape

((891, 11), (891,))

In [7]:
# Preprocess the training features: ordinal-encode the categorical columns and
# standardize the numeric ones. Both transformers are fitted here on X_train.
oe = OrdinalEncoder()
std_s = StandardScaler()
X_train[categorical_features] = oe.fit_transform(X_train[categorical_features])
X_train[numerical_features] = std_s.fit_transform(X_train[numerical_features])

# Remove the columns listed in drop_features.
# NOTE(review): PassengerId is not in drop_features, so it remains a model input —
# confirm that is intended.
X_train = X_train.drop(columns=drop_features)
X_train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,1.0,-0.592481,1,0,-0.502445,2.0
1,2,1,0.0,0.638789,1,0,0.786845,0.0
2,3,3,0.0,-0.284663,0,0,-0.488854,2.0
3,4,1,0.0,0.407926,1,0,0.42073,2.0
4,5,3,1.0,0.407926,0,0,-0.486337,2.0


In [8]:

# Base estimator for the (currently disabled) grid search:
#model = XGBClassifier(objective='binary:logistic', seed=42, eval_metric='aucpr')

# Search space for the disabled grid search.
params_grid = {
    'n_estimators': [60],
    'max_depth': [4],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'gamma': [1, 5, 10, 15],
    'learning_rate': [0.03],
    'lambda': [0, 0.001],
    'alpha': [0, 0.001, 0.003, 0.01, 0.1, 1],
}

# Final model; per the author these are the best parameters found by the grid search.
# NOTE(review): the grid above tunes colsample_bytree while this model sets
# colsample_bylevel — confirm which of the two is intended.
final_model = XGBClassifier(
    n_estimators=60,
    max_depth=4,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bylevel=0.8,
    gamma=1,
    alpha=1,
    reg_lambda=0,
)


In [9]:
# Disabled hyperparameter search, kept for reference:
#grid = GridSearchCV(model, params_grid, cv=5, scoring='accuracy')
#grid.fit(X_train, y_train)
#train_predictions = grid.predict(X_train)
#accuracy_score(y_train, train_predictions)

# Fit the final model on the preprocessed training data; the fitted estimator
# returned by fit() is what the cell displays.
final_model.fit(X=X_train, y=y_train)


XGBClassifier(alpha=1, base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=0.8, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=1, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.03, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=60, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=1, ...)

In [10]:
#grid.best_params_

In [11]:
# Load the test split.
test_file = "/kaggle/input/titanic/test.csv"
test = pd.read_csv(test_file)

# Impute missing ages with the training-set mean, the same statistic used when
# imputing Age in the training data (the original used the median here).
test['Age'] = test['Age'].fillna(train['Age'].mean())

# Apply the encoder and scaler that were fitted on the training data.
# Use transform(), not fit_transform(): refitting on the test set would encode and
# scale it with different statistics than the ones the model was trained on.
test[['Sex', 'Embarked']] = oe.transform(test[['Sex', 'Embarked']])
test[['Age', 'Fare']] = std_s.transform(test[['Age', 'Fare']])

# Remove the same columns that were dropped from the training features.
test = test.drop(columns=drop_features)

# Remaining missing values per column. Fare is not imputed here, so a missing
# test fare stays NaN and is passed to the model as a missing value.
test.isna().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           1
Embarked       0
dtype: int64

In [12]:
# Predict labels for the preprocessed test frame with the fitted final model.
# Grid-search alternative (disabled):
#predictions = grid.best_estimator_.predict(test)
predictions = final_model.predict(X=test)

In [13]:
# Build the submission table: passenger ids from the test frame plus the predicted
# Survived label, then write it to CSV without the index column.
output = test[['PassengerId']].assign(Survived=predictions)
output.to_csv(path_or_buf='submission.csv', index=False)