In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [3]:
#importing data
titanic_train = pd.read_csv('/kaggle/input/titanic/train.csv')
titanic_test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [4]:
titanic_train.head(), len(titanic_train)

(   PassengerId  Survived  Pclass  \
 0            1         0       3   
 1            2         1       1   
 2            3         1       3   
 3            4         1       1   
 4            5         0       3   
 
                                                 Name     Sex   Age  SibSp  \
 0                            Braund, Mr. Owen Harris    male  22.0      1   
 1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
 2                             Heikkinen, Miss. Laina  female  26.0      0   
 3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
 4                           Allen, Mr. William Henry    male  35.0      0   
 
    Parch            Ticket     Fare Cabin Embarked  
 0      0         A/5 21171   7.2500   NaN        S  
 1      0          PC 17599  71.2833   C85        C  
 2      0  STON/O2. 3101282   7.9250   NaN        S  
 3      0            113803  53.1000  C123        S  
 4      0            373450   8.0500

In [5]:
# Remove rows with missing target, separate target from predictors
titanic_train.dropna(axis=0, subset=['Survived'])
y = titanic_train.Survived              
X = titanic_train.drop(['Survived'], axis=1)

In [6]:
#Age has 177 missing values
#Cabin has 687 missing values but it dropped due to high cardinality and large number of missing values
#Embarked has 2 missing values and they are replaced by the mode:
X['Embarked'] = X['Embarked'].fillna(X['Embarked'].mode())
titanic_test['Embarked'] = titanic_test['Embarked'].fillna(titanic_test['Embarked'].mode())

#Encode these and add the fact that values were missing
#X['Age'] = X['Age'].fillna(0)
#titanic_test['Age'] = titanic_test['Age'].fillna(0)

In [7]:
# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

In [8]:
# Categorical columns in the training data
object_cols = [col for col in X_train_full.columns if X_train_full[col].dtype == "object"]

# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numeric columns
numeric_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Columns dropped
dropped_cols = object_cols.copy()
for i in range(0, len(low_cardinality_cols)):
    dropped_cols.remove(low_cardinality_cols[i])

print("Object columns:" + str(object_cols))
print("Dropped columns:" + str(dropped_cols))
print("One-hot-encoded columns:" + str(low_cardinality_cols))
print("Numberic columns:" + str(numeric_cols))

Object columns:['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
Dropped columns:['Name', 'Ticket', 'Cabin']
One-hot-encoded columns:['Sex', 'Embarked']
Numberic columns:['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


In [9]:
# Keep selected columns only
my_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = titanic_test[my_cols].copy()

In [10]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[low_cardinality_cols]))


# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
OH_cols_test.index = X_test.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train[numeric_cols]
num_X_valid = X_valid[numeric_cols]
num_X_test = X_test[numeric_cols]

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

In [11]:
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer(strategy='constant')
OH_imputed_X_train = pd.DataFrame(my_imputer.fit_transform(OH_X_train))
OH_imputed_X_valid = pd.DataFrame(my_imputer.transform(OH_X_valid))
OH_imputed_X_test = pd.DataFrame(my_imputer.transform(OH_X_test))

# Imputation removed column names; put them back
OH_imputed_X_train.columns = OH_X_train.columns
OH_imputed_X_valid.columns = OH_X_valid.columns
OH_imputed_X_test.columns = OH_X_test.columns



In [12]:
OH_imputed_X_train[OH_imputed_X_train['Age'].isna()]

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,0,1,2,3,4,5


In [13]:
from xgboost import XGBClassifier
from sklearn.metrics import mean_absolute_error

# Define the model
model = XGBClassifier(random_state = 0, n_estimators = 600, learning_rate=0.02, n_jobs=4) # Your code here

# Fit the model
model.fit(OH_X_train, y_train) # Your code here

# Get predictions
predictions = model.predict(OH_X_valid) # Your code here

# Calculate MAE
mae = mean_absolute_error(y_valid, predictions) # Your code here

# Uncomment to print MAE
print("Mean Absolute Error:" , mae)

Mean Absolute Error: 0.1452513966480447


In [14]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_valid, predictions))

0.8547486033519553


In [15]:
predictions_test = model.predict(OH_X_test)

In [16]:
output = pd.DataFrame({'PassengerId': OH_X_test.PassengerId, 'Survived': predictions_test})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
