In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'titanic:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F3136%2F26502%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240514%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240514T014830Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1f7050de5d6fce895d06dd625f32714c1ef041cf1b9b6c3bb178b159c3455de8fa32785faa25a9dbec0ee07f91c9ca2522964d7adc1978e775f8012cf4625e88cbdcd5fd42215e0e00a3dabcc961f6aa7d17ec4e8983f301c1a0cd96577b0c0ebb86de3e28526712f121dc4926cb4d290936d7c43f321b2986e9120e2d279449eec1acc516ede64e0550e27e261ba2966f00b7a7f781050d2ba5f0497cfb6c4068e83cb84db0a5ca4871e6b354e196bd09f6ffb2479edd8d6c0d060d75b94461def0b04792c3d7597713b93e981adda4d22880f91449307af3321048eb4fdd31fdcf414a5aa8a6c6462b5765d4fe2d8870357835c7d58ab4b894d675697e7266'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading titanic, 34877 bytes compressed
Data source import complete.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [3]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
train_data.describe(include = 'object')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [7]:
train_data.shape

(891, 12)

In [None]:
train_data.Embarked.unique()

In [9]:
#dir(train_data)
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [10]:
train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [None]:
women = train_data.loc[train_data.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)
print(f"% of women who survived:{rate_women*100:.2f}")

In [None]:
men = train_data.loc[train_data.Sex == 'male']["Survived"]
rate_men = sum(men)/len(men)

print(f"% of men who survived:{rate_men*100:.2f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

#output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
#output.to_csv('submission.csv', index=False)
#print("Your submission was successfully saved!")

In [None]:
model.score(X, y)

In [11]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures

In [None]:
train_data.Cabin.unique()

In [None]:
train_data.Pclass.describe()

In [None]:
sns.color_palette('Set3')
sns.histplot(data = train_data, x = 'Pclass', hue = 'Survived', multiple = 'stack')

In [None]:
features = train_data[['Pclass','Age','SibSp','Parch', 'Fare', 'Survived']]
features.corr()

# Dropping Missing Values
First Drop missing values in the training data for model construction. 

## Missing Values
First, define a function to print missing values

In [12]:
def print_missing(data):
    columns = data.columns
    for column  in columns:
        print(f'Number of NAs in {column} is {sum(pd.isna(data[column]))}')

In [13]:
print_missing(train_data)

Number of NAs in PassengerId is 0
Number of NAs in Survived is 0
Number of NAs in Pclass is 0
Number of NAs in Name is 0
Number of NAs in Sex is 0
Number of NAs in Age is 177
Number of NAs in SibSp is 0
Number of NAs in Parch is 0
Number of NAs in Ticket is 0
Number of NAs in Fare is 0
Number of NAs in Cabin is 687
Number of NAs in Embarked is 2


2 rows in embarked are fare enough to drop. Cabin will not be selected for the Model construction. Age missing values will be treated after data splitting to prevent data leaking

In [14]:
train_data.dropna(subset = 'Embarked', axis = 0, inplace = True)

# Data splitting


In [15]:
features = train_data[['Pclass', 'Sex','Age','SibSp','Parch', 'Fare', 'Embarked', 'Survived']]
f_test = test_data[['PassengerId','Pclass', 'Sex','Age','SibSp','Parch', 'Fare', 'Embarked']]

X = features.iloc[:,0:7]
y = features.iloc[:,7]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
y_train.shape, x_train.shape, y_test.shape, x_test.shape

((711,), (711, 7), (178,), (178, 7))

# Data Cleaning

In [16]:
print_missing(x_train)

Number of NAs in Pclass is 0
Number of NAs in Sex is 0
Number of NAs in Age is 145
Number of NAs in SibSp is 0
Number of NAs in Parch is 0
Number of NAs in Fare is 0
Number of NAs in Embarked is 0


In [17]:
print_missing(f_test)

Number of NAs in PassengerId is 0
Number of NAs in Pclass is 0
Number of NAs in Sex is 0
Number of NAs in Age is 86
Number of NAs in SibSp is 0
Number of NAs in Parch is 0
Number of NAs in Fare is 1
Number of NAs in Embarked is 0


Only one missing value in training data to drop in column Embarked

In [18]:
print_missing(x_test)

Number of NAs in Pclass is 0
Number of NAs in Sex is 0
Number of NAs in Age is 32
Number of NAs in SibSp is 0
Number of NAs in Parch is 0
Number of NAs in Fare is 0
Number of NAs in Embarked is 0


In [19]:
mean_age = np.floor(x_train['Age'].mean())
print(mean_age)
x_train['Age'].replace(np.nan, mean_age, inplace = True)

mean_age_test = np.floor(x_test['Age'].mean())
print(mean_age_test)
x_test['Age'].replace(np.nan, mean_age_test, inplace = True)

mean_age_test_set = np.floor(f_test['Age'].mean())
print(mean_age_test_set)
f_test['Age'].replace(np.nan, mean_age_test_set, inplace = True)

29.0
30.0
30.0


In [20]:
print_missing(x_train)

Number of NAs in Pclass is 0
Number of NAs in Sex is 0
Number of NAs in Age is 0
Number of NAs in SibSp is 0
Number of NAs in Parch is 0
Number of NAs in Fare is 0
Number of NAs in Embarked is 0


In [21]:
print_missing(x_test)

Number of NAs in Pclass is 0
Number of NAs in Sex is 0
Number of NAs in Age is 0
Number of NAs in SibSp is 0
Number of NAs in Parch is 0
Number of NAs in Fare is 0
Number of NAs in Embarked is 0


The Fare missing value in the f_test set will be replaced by the mean in the training set

In [22]:
mean_fare = np.floor(f_test['Fare'].mean())
print(mean_fare)
f_test['Fare'].replace(np.nan, mean_fare, inplace = True)

35.0


In [23]:
print_missing(f_test)

Number of NAs in PassengerId is 0
Number of NAs in Pclass is 0
Number of NAs in Sex is 0
Number of NAs in Age is 0
Number of NAs in SibSp is 0
Number of NAs in Parch is 0
Number of NAs in Fare is 0
Number of NAs in Embarked is 0


# Data Encoding
## Ordinal Encoding
To prevent leaking, encoding is done after splitting and in the training and test data, separetely. The features to encode are the object data types: Sex and Embarked.

In [32]:
import sklearn.preprocessing as preprocessing

enc = preprocessing.OrdinalEncoder()
enc_X= enc.fit_transform(x_train.select_dtypes(include = object))
X_enc =  np.concatenate((enc_X, x_train.select_dtypes(include = [int, float])), axis =1)

enc_X_t = enc.transform(x_test.select_dtypes(include = object))
X_test_enc = np.concatenate((enc_X_t, x_test.select_dtypes(include = [int, float])), axis = 1)

# Normalization

In [33]:
norm = preprocessing.StandardScaler()
norm.fit(X_enc)
X_train_norm = norm.transform(X_enc)
X_test_norm = norm.transform(X_test_enc)

# Evaluation and model functions

In [34]:
#from 2019 project

import sklearn.ensemble
import sklearn.model_selection

from sklearn.ensemble import AdaBoostClassifier

def ada_boost_classifier(x_train, y_train, no_estimators, fold):
    ada_boost = sklearn.ensemble.AdaBoostClassifier(random_state=0)
    params = {
        "n_estimators": range(min(no_estimators,10), no_estimators+1),
        "algorithm": ['SAMME', 'SAMME.R']
    }

    grid_cv = GridSearchCV(ada_boost, param_grid=params, verbose=0,cv=fold)

    print("Training Ada Boost ...")
    grid_cv.fit(x_train, y_train)

    return grid_cv.best_estimator_, grid_cv.best_params_


In [30]:
import json
ACCURACY = "accuracy"
PRECISION = "precision"
RECALL = "recall"
F1 = "f1"
HYPERPARAM = "hyperparam"


def evaluate_classifier(x_train, x_test, y_train,y_test, model, params, dataset_name, model_name):
    y_pred = model.predict(x_test)

    train_accuracy = model.score(x_train, y_train)
    test_accuracy = sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
    #test_accuracy =model.score(x_test, y_test)
    precision = sklearn.metrics.precision_score(y_true=y_test, y_pred=y_pred, average = 'macro')
    recall = sklearn.metrics.recall_score(y_true=y_test, y_pred=y_pred, average = 'macro')

    print("--{0}:".format(model_name))
    print("\tTraining accuracy: {1:.2f}%".format(dataset_name, train_accuracy * 100))
    print("\tTesting accuracy: {1:.2f}%".format(dataset_name, test_accuracy * 100))
    print("\tPrecision: {1:.2f}%".format(dataset_name, precision * 100))
    print("\tRecall: {1:.2f}%".format(dataset_name, recall * 100))
    if params is not None and len(params.keys()) > 0:
        print("\tHyperparam:")
        for hyperparam in params.keys():
            print("\t\t {0}: {1}".format(hyperparam, params[hyperparam]))

    return {
        ACCURACY: test_accuracy,
        PRECISION: precision,
        RECALL: recall,
        HYPERPARAM: params
    }

def export_result(result, filename):
    with open(filename, 'w') as outfile:
        json.dump(result, outfile)


# Adaboost Classifier

In [36]:
#Segundo resultado corrigiendo error al escoger X y y
#tercer resultado cambiando a Grid Search en vez de random search
result = {}
ADABOOST = "ada_boost"
rf_best_model, rf_params = ada_boost_classifier(X_train_norm, y_train, no_estimators=50, fold=5)
result[ADABOOST] = evaluate_classifier(X_train_norm, X_test_norm, y_train, y_test, rf_best_model, rf_params,
                                                    "Titanic", "AdaBoost")

export_result(result, "titanic.json")

Training Ada Boost ...
--AdaBoost:
	Training accuracy: 81.58%
	Testing accuracy: 80.90%
	Precision: 79.64%
	Recall: 81.09%
	Hyperparam:
		 algorithm: SAMME.R
		 n_estimators: 23


# Random Forest

In [37]:
def random_forest(x_train, y_train, max_estimator=100, max_depth = 5, fold=10, iterations=20):
    rf = sklearn.ensemble.RandomForestClassifier(random_state=0, max_depth = max_depth)
    params = {
        "n_estimators": range(1, max_estimator + 1)
    }

    random_search_cv = sklearn.model_selection.RandomizedSearchCV(rf, param_distributions=params, verbose=0, cv=fold,
                                                                  random_state=0, n_iter=min(max_estimator, iterations))

    print("Training random forest classifier ...")
    random_search_cv.fit(x_train, y_train)

    return random_search_cv.best_estimator_, random_search_cv.best_params_

In [38]:
RANDOM_FOREST = "random_forest"

rf_best_model, rf_params = random_forest(X_train_norm, y_train, max_estimator = 50, max_depth = 5, fold=5,
                                                               iterations=20)
result[RANDOM_FOREST] = evaluate_classifier(X_train_norm, X_test_norm, y_train, y_test, rf_best_model, rf_params,
                                                         "Titanic", "RANDOM FOREST")

export_result(result, "titanic.json")

Training random forest classifier ...
--RaNDOM FOREST:
	Training accuracy: 84.67%
	Testing accuracy: 84.83%
	Precision: 84.15%
	Recall: 82.97%
	Hyperparam:
		 n_estimators: 15


## Decision Tree with GridSearchCV

In [44]:
parameters = {'criterion': ['gini', 'entropy'],
     'splitter': ['best', 'random'],
     'max_depth': [2*n for n in range(1,10)],
     'max_features': ['log2', 'sqrt'],
     'min_samples_leaf': [1, 2, 4],
     'min_samples_split': [2, 5, 10],
     'random_state': [2*n for n in range(5,10)]}

def decision_tree(x_train, y_train, parameters, fold=5):
    dt = sklearn.tree.DecisionTreeClassifier()
    params = parameters
    DT_cv = GridSearchCV(dt, param_grid=params, verbose=0, cv=fold)
    print("Training Decisioin Tree classifier ...")
    DT_cv.fit(x_train, y_train)

    return DT_cv.best_estimator_, DT_cv.best_params_

In [45]:
DECISION_TREE = 'Decision Tree'
dt_model, dt_param = decision_tree(X_train_norm, y_train, parameters, fold = 10)
result[DECISION_TREE] = evaluate_classifier(X_train_norm, X_test_norm, y_train, y_test, dt_model, dt_param,
                                                         "Titanic", "DECISION TREE")


Training Decisioin Tree classifier ...
--DECISION TREE:
	Training accuracy: 85.65%
	Testing accuracy: 85.39%
	Precision: 84.24%
	Recall: 85.90%
	Hyperparam:
		 criterion: entropy
		 max_depth: 12
		 max_features: log2
		 min_samples_leaf: 1
		 min_samples_split: 5
		 random_state: 16
		 splitter: random


In [49]:
from sklearn.metrics import classification_report
yhat = dt_model.predict(X_test_norm)
print(classification_report(y_test, yhat))

              precision    recall  f1-score   support

           0       0.92      0.84      0.88       112
           1       0.76      0.88      0.82        66

    accuracy                           0.85       178
   macro avg       0.84      0.86      0.85       178
weighted avg       0.86      0.85      0.86       178



In [50]:
yhat = rf_best_model.predict(X_test_norm)
print(classification_report(y_test, yhat))

              precision    recall  f1-score   support

           0       0.86      0.90      0.88       112
           1       0.82      0.76      0.79        66

    accuracy                           0.85       178
   macro avg       0.84      0.83      0.83       178
weighted avg       0.85      0.85      0.85       178

