In [1]:
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier # classifier
from sklearn.model_selection import GridSearchCV # hyperparam tuning for classifier
from kaggle_secrets import UserSecretsClient # access git

# Recursively walk the Kaggle input mount and echo the full path of every
# file found, one per line.
for folder, _, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)


# Look up the secret stored under the label "github_access_token_for_kaggle"
# (presumably a GitHub personal access token -- confirm in the secrets settings).
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("github_access_token_for_kaggle")

# Publish the account name, repository name and token as process environment
# variables so they can be expanded as ${USER}, ${REPO} and ${TOKEN} by shell
# commands launched from this notebook.
# NOTE(review): this overwrites the process-wide USER variable, which other
# tools may read as the login name -- consider a less generic key.
os.environ['USER'] = 'alexlinkportfolio'
os.environ['REPO'] = 'project_titanic'
os.environ['TOKEN'] = secret_value_0


# Clone the repository over HTTPS; the shell expands TOKEN, USER and REPO from
# the environment, so the token travels as the user part of the clone URL.
# NOTE(review): git normally records the clone URL as the `origin` remote, so
# the token likely ends up in the clone's .git/config inside the working
# directory -- confirm and scrub it if this directory is saved or shared.
!git clone https://${TOKEN}@github.com/${USER}/${REPO}.git

# Enter the cloned repository (the name comes from the REPO environment
# variable), switch to the main branch and show the working-tree state.
%cd ./{os.environ['REPO']}
!git checkout main
!git status

# List the repository contents, including hidden entries.
%ls -al

# Set the git author identity at --global scope, then print the resulting
# global configuration.
!git config --global user.email "alex.link@utexas.edu"
!git config --global user.name "Alex Link"
!git config --global --list

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv
Cloning into 'project_titanic'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.
/kaggle/working/project_titanic
Already on 'main'
Your branch is up to date with 'origin/main'.
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
total 16
drwxr-xr-x 3 root root 4096 May  7 17:03 [0m[01;34m.[0m/
drwxr-xr-x 3 root root 4096 May  7 17:03 [01;34m..[0m/
drwxr-xr-x 8 root root 4096 May  7 17:03 [01;34m.git[0m/
-rw-r--r-- 1 root root   60 May  7 17:03 README.md
user.email=alex.link@utexas.edu
user.name=Alex Link


In [2]:
# import data
# Read the two competition CSVs in one pass: first the training split, then
# the test split.
train_data, test_data = map(
    pd.read_csv,
    ("/kaggle/input/titanic/train.csv", "/kaggle/input/titanic/test.csv"),
)

In [3]:
"""
data manipulation /  feature engineering

current:

future work:
    - cabin numbers are unique and should be converted to location groups
"""

# convert datatype of particular columns to categorical
# Cast the discrete columns to pandas' categorical dtype, applying the same
# conversion to the training and test frames.
for frame in (train_data, test_data):
    for column in ("Pclass", "Sex", "Embarked"):
        frame[column] = frame[column].astype('category')

# convert certain numerical data to categorical
def conv_to_buckets(data):
    """Derive four bucket edges for a numeric column from its min, median and max.

    The value range is split at the median, and each half is split again at
    its midpoint, giving five edges that delimit four buckets.

    Parameters
    ----------
    data : array-like of numbers
        Values to summarise. NaN entries are ignored.

    Returns
    -------
    bins : list of float
        ``[min, min + (median - min) / 2, median, median + (max - median) / 2, max]``.
    labels : list of str
        ``["low", "med_low", "med_high", "high"]``, one label per bucket.

    Raises
    ------
    ValueError
        If ``data`` holds no non-NaN values.
    """
    values = np.asarray(data, dtype=float)
    # Drop NaNs up front: the builtin min()/max() return NaN whenever the
    # first element is NaN (every comparison against NaN is False), which
    # would silently poison the bin edges.
    values = values[~np.isnan(values)]
    if values.size == 0:
        raise ValueError("conv_to_buckets() requires at least one non-NaN value")

    min_data = float(values.min())
    max_data = float(values.max())
    med_data = float(np.median(values))

    bins = [
        min_data,
        min_data + (med_data - min_data) / 2,
        med_data,
        med_data + (max_data - med_data) / 2,
        max_data,
    ]
    labels = ["low", "med_low", "med_high", "high"]
    return bins, labels

# Bucket edges are derived from the training split only and then reused for
# the test split, so both splits are encoded against the same cut points.
age_bins, age_labels = conv_to_buckets(train_data.Age)
fare_bins, fare_labels = conv_to_buckets(train_data.Fare)

# With right=False the intervals are half-open [a, b), so the final edge (the
# training maximum) is excluded and those rows would be binned as NaN, as
# would any test value outside the training range. Opening the outer edges to
# -inf/+inf leaves the interior cut points unchanged while giving those rows
# a real bucket; genuinely missing values still come out as NaN.
age_bins = [-np.inf, *age_bins[1:-1], np.inf]
fare_bins = [-np.inf, *fare_bins[1:-1], np.inf]

train_data.Age = pd.cut(train_data.Age, bins=age_bins, labels=age_labels, right=False)
train_data.Fare = pd.cut(train_data.Fare, bins=fare_bins, labels=fare_labels, right=False)

test_data.Age = pd.cut(test_data.Age, bins=age_bins, labels=age_labels, right=False)
test_data.Fare = pd.cut(test_data.Fare, bins=fare_bins, labels=fare_labels, right=False)

In [4]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print a summary of the top-ranked candidates from a CV results mapping.

    For each rank from 1 to ``n_top`` every candidate holding that rank is
    printed (ties produce several entries) with its mean and standard
    deviation of the validation score and its parameter setting.

    Parameters
    ----------
    results : mapping
        Must provide "rank_test_score", "mean_test_score", "std_test_score"
        and "params", each indexable by candidate position.
    n_top : int, default 3
        Number of ranks to report.
    """
    ranks = results["rank_test_score"]
    for rank in range(1, n_top + 1):
        # Positions of all candidates that share this rank.
        for idx in np.flatnonzero(ranks == rank):
            mean_score = results["mean_test_score"][idx]
            std_score = results["std_test_score"][idx]
            summary = "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score
            )
            print("Model with rank: {0}".format(rank))
            print(summary)
            print("Parameters: {0}".format(results["params"][idx]))
            print("")

In [5]:
# Hyperparameter Tuning

# # define param grid
# param_grid = {
#     "n_estimators":[25,50,75,100,125,150, 175,200],
#     "max_depth":[5,10,15,20]
# }

# # instantiate rf model
# model = RandomForestClassifier(
#     n_estimators=100, # why 100?
#     max_depth=5,      # why 5?
#     random_state=1)


# # outcome
# target = train_data["Survived"]

# # list features of interest
# features = ["Pclass", "Sex","Embarked","Age","SibSp", "Parch", "Fare"]

# # create 1-hot predictor encoding for train and test data
# train_predictors = pd.get_dummies(train_data[features])
# test_predictors = pd.get_dummies(test_data[features])



# grid_search = GridSearchCV(model, param_grid=param_grid)
# grid_search.fit(train_predictors, target)


# report(grid_search.cv_results_)

In [6]:
# outcome
# outcome
target = train_data["Survived"]

# list features of interest
features = ["Pclass", "Sex","Embarked","Age","SibSp", "Parch", "Fare"]

# create 1-hot predictor encoding for train and test data
train_predictors = pd.get_dummies(train_data[features])
test_predictors = pd.get_dummies(test_data[features])

# The two splits are encoded independently, so a category present in only one
# of them would give the frames different columns and break predict(). Force
# the test frame onto the training layout: indicator columns missing from the
# test split are filled with 0 and columns unseen in training are dropped.
# This is a no-op when the columns already match.
test_predictors = test_predictors.reindex(columns=train_predictors.columns, fill_value=0)

# instantiate rf model
model = RandomForestClassifier(
    n_estimators=175, # why 175: best mean accuracy w/ grid search cv
    max_depth=5,      # why 5: best mean accuracy w/ grid search cv
    random_state=1)   # fixed seed so repeated runs build the same forest

# fit model using training data
model.fit(train_predictors, target)

# predict outcome of test data using fitted model
predictions = model.predict(test_predictors)

# package for output
output = pd.DataFrame({'PassengerId': test_data.PassengerId,
                      'Survived': predictions})

# send to csv (written relative to the current working directory)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [7]:
# """
# cat: Pclass, Sex, Cabin, Embarked
# num: Age, SibSp, Parch, Fare

# """

# # outcome
# target = train_data["Survived"]

# # list features of interest
# features = ["Pclass", "Sex","Embarked","Age","SibSp", "Parch", "Fare"]

# # create 1-hot predictor encoding for train and test data
# train_predictors = pd.get_dummies(train_data[features])
# test_predictors = pd.get_dummies(test_data[features])

# # instantiate rf model
# model = RandomForestClassifier(
#     n_estimators=100, # why 100?
#     max_depth=5,      # why 5?
#     random_state=1)

# # fit model using training data
# model.fit(train_predictors, target)

# # predict outcome of test data using fitted model
# predictions = model.predict(test_predictors)

In [8]:
# package for output
# Assemble the submission table: one predicted label per test passenger id.
submission_columns = {
    'PassengerId': test_data["PassengerId"],
    'Survived': predictions,
}
output = pd.DataFrame(submission_columns)

# Write the table without the index so the file holds only the two columns.
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
