# Data Preparation

Build the feature vectors that represent each entity (here, a passenger).

**TODO:** More data preparation should be done in this section.

In [1]:
import pandas as pd
import numpy as np

# Load the labelled training split, then print its column / dtype / non-null summary.
train_df = pd.read_csv(filepath_or_buffer='./data/train.csv')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [2]:
# Load the test split, then print its column / dtype / non-null summary.
test_df = pd.read_csv(filepath_or_buffer='./data/test.csv')
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [None]:
# Sample submission file; later cells overwrite its 'Survived' column with
# model predictions and write it back out under a new name.
submission_df = pd.read_csv(filepath_or_buffer='./data/gender_submission.csv')
submission_df

In [3]:
from preprocess import get_X, get_y

# Turn the raw frames into model inputs with the project's preprocessing helpers:
# features + target for the training split, features only for the test split.
X_trn, y_trn = get_X(train_df), get_y(train_df)
X_tst = get_X(test_df)

# Sanity check: shapes of the three arrays.
X_trn.shape, y_trn.shape, X_tst.shape

((891, 5), (891,), (418, 5))

In [5]:
# Display the training feature matrix produced by get_X(train_df).
X_trn

array([[3., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [3., 0., 0., 1., 0.],
       ...,
       [3., 1., 2., 1., 0.],
       [1., 0., 0., 0., 1.],
       [3., 0., 0., 0., 1.]], dtype=float32)

# Models

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
import warnings
# Blanket filter: suppress warnings of every category from here on.
warnings.simplefilter(action="ignore")

In [None]:
# Metrics collected on every cross-validation fold (also reused by the next cell).
scoring = ['accuracy', 'precision', 'recall', 'f1']

# Baseline random forest: 100 trees, depth capped at 5, fixed seed.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=2023,
)
scores = cross_validate(estimator=clf, X=X_trn, y=y_trn, scoring=scoring, cv=5)

# Per-fold rows followed by 'mean' and 'std' summary rows.
scores_df = pd.DataFrame(data=scores)
pd.concat(objs=[scores_df, scores_df.apply(['mean', 'std'])])

In [None]:
# Same experiment as the previous cell, with shallower trees (`max_depth` 5 -> 3).
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=2023,
)
scores = cross_validate(estimator=clf, X=X_trn, y=y_trn, scoring=scoring, cv=5)

# Per-fold rows followed by 'mean' and 'std' summary rows.
scores_df = pd.DataFrame(data=scores)
pd.concat(objs=[scores_df, scores_df.apply(['mean', 'std'])])

* RandomForestClassifier Test

In [None]:
# Refit the depth-3 forest on the whole training set and predict the test split.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=2023,
).fit(X_trn, y_trn)  # fit() returns the fitted estimator itself
pred = clf.predict(X_tst)
pred.shape

In [None]:
# Overwrite the sample labels with the classifier's predictions and save them
# without the index column.
submission_df['Survived'] = pred
submission_df.to_csv(path_or_buf='./submission_rfc.csv', index=False)

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Negative result kept on purpose: scoring a plain regressor with classification
# metrics yields NaN in every score column. Presumably the scorers reject the
# regressor's continuous predictions and cross_validate falls back to its
# default error_score of NaN — TODO confirm.
scoring = ['accuracy', 'precision', 'recall', 'f1']
reg = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=2023,
)
scores = cross_validate(estimator=reg, X=X_trn, y=y_trn, scoring=scoring, cv=5)

scores_df = pd.DataFrame(data=scores)
pd.concat(objs=[scores_df, scores_df.apply(['mean', 'std'])])  # NaN

In [None]:
from rf import CustomRF

In [None]:
# Cross-validate the project's CustomRF with the same classification metrics.
# `threshold=0.5` is passed explicitly; its exact semantics live in rf.py.
scoring = ['accuracy', 'precision', 'recall', 'f1']
reg = CustomRF(
    n_estimators=200,
    max_depth=50,
    random_state=2023,
    threshold=0.5,
)
scores = cross_validate(estimator=reg, X=X_trn, y=y_trn, scoring=scoring, cv=5)

# Per-fold rows followed by 'mean' and 'std' summary rows.
scores_df = pd.DataFrame(data=scores)
pd.concat(objs=[scores_df, scores_df.apply(['mean', 'std'])])

* RandomForestRegressor Test

In [None]:
# Refit on the full training set with the same hyper-parameters *and* the same
# threshold used in the cross-validation cell, so the submitted model is the
# configuration that was actually evaluated (previously `threshold` was left
# to CustomRF's default here).
reg = CustomRF(n_estimators=200, max_depth=50, random_state=2023, threshold=0.5)
reg.fit(X_trn, y_trn)
pred = reg.predict(X_tst)

# Overwrite the sample labels with the predictions and save without the index.
submission_df['Survived'] = pred
submission_df.to_csv('./submission_rfr.csv', index=False)

## Neural Network

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Pick the compute backend in order of preference: CUDA, then Apple MPS, else CPU.
if torch.cuda.is_available():
  device = 'cuda'
elif torch.backends.mps.is_available():
  device = 'mps'
else:
  device = 'cpu'
device

In [None]:
from nn import ANN
from utils import CustomDataset
from train import train_one_epoch, evaluate
from torchmetrics.classification import BinaryConfusionMatrix, BinaryAccuracy

In [None]:
from sklearn.model_selection import StratifiedKFold
from tqdm.auto import tqdm
from torch.utils.data import TensorDataset


n_splits = 5
n_epochs = 300  # training epochs per fold

# Seed torch's global RNG so weight initialisation and DataLoader shuffling are
# repeatable on a fresh kernel (as far as the backend is deterministic).
# The fold split itself is already seeded through `random_state`.
torch.manual_seed(2023)

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2023)

# One independent, freshly initialised network per fold.
nets = [ANN().to(device) for _ in range(n_splits)]
# One confusion-matrix metric per fold, consumed by the reporting cells below.
history = []

for i, (trn_idx, val_idx) in enumerate(skf.split(X_trn, y_trn)):
  # Targets get a trailing axis (n,) -> (n, 1); presumably to match the shape
  # of ANN's output — TODO confirm against nn.py.
  X, y = torch.tensor(X_trn[trn_idx]), torch.tensor(y_trn[trn_idx]).unsqueeze(-1)
  X_val, y_val = torch.tensor(X_trn[val_idx]), torch.tensor(y_trn[val_idx]).unsqueeze(-1)

  ds = CustomDataset(X, y)
  ds_val = CustomDataset(X_val, y_val)
  dl = DataLoader(ds, batch_size=32, shuffle=True)
  # The whole validation fold is evaluated as a single batch.
  dl_val = DataLoader(ds_val, batch_size=len(ds_val), shuffle=False)

  net = nets[i]
  optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)

  pbar = tqdm(range(n_epochs))
  for j in pbar:
    # Fresh metric object each epoch so accuracy is not accumulated across epochs.
    accuracy = BinaryAccuracy().to(device)
    loss = train_one_epoch(net, nn.functional.binary_cross_entropy, optimizer, dl, device)
    loss_val = evaluate(net, nn.functional.binary_cross_entropy, dl_val, device, accuracy)
    acc_val = accuracy.compute().item()
    pbar.set_postfix(trn_loss=loss, val_loss=loss_val, val_acc=acc_val)

  # After training, record the fold's validation confusion matrix.
  bcm = BinaryConfusionMatrix().to(device)
  evaluate(net, nn.functional.binary_cross_entropy, dl_val, device, bcm)
  history.append(bcm)

In [None]:
from metric import cm_to_metrics

# Per-fold classification metrics derived from each fold's confusion matrix.
metric_names = ('accuracy', 'precision', 'recall', 'f1')
scores = {name: [] for name in metric_names}

for bcm in history:
  bcm.plot()
  cm = bcm.compute().cpu().numpy()

  accuracy, precision, recall, f1 = cm_to_metrics(cm)
  for name, value in zip(metric_names, (accuracy, precision, recall, f1)):
    scores[name].append(value)

# Per-fold rows followed by 'mean' and 'std' summary rows.
scores_df = pd.DataFrame(data=scores)
pd.concat(objs=[scores_df, scores_df.apply(['mean', 'std'])])

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Pool the folds: add up the per-fold confusion matrices element-wise.
cm = sum(fold_bcm.compute().cpu().numpy() for fold_bcm in history)
ConfusionMatrixDisplay(cm).plot()
plt.show()

# Metrics computed once on the pooled confusion matrix, shown as a one-row table.
accuracy, precision, recall, f1 = cm_to_metrics(cm)

pd.DataFrame([{'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}])

* Neural Net Test

In [None]:
# Train a fresh network on the complete training set for the final submission.
X = torch.tensor(X_trn)
y = torch.tensor(y_trn).unsqueeze(-1)  # (n,) -> (n, 1)

ds = TensorDataset(X, y)
dl = DataLoader(dataset=ds, batch_size=32, shuffle=True)

net = ANN().to(device)
optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)

# 300 epochs; the progress bar shows the latest training loss.
pbar = tqdm(range(300))
for j in pbar:
  loss = train_one_epoch(net, nn.functional.binary_cross_entropy, optimizer, dl, device)
  pbar.set_postfix(trn_loss=loss)

In [None]:
# Inference on the test split.
# - eval() switches off training-only behaviour (dropout / batch-norm updates),
#   in case ANN uses any and the training helper left the net in train mode.
# - no_grad() skips building the autograd graph, which is not needed here.
net.eval()
with torch.no_grad():
  prob = net(torch.tensor(X_tst, device=device)).cpu()

# Threshold at 0.5 and store integer 0/1 labels as a NumPy array, so the CSV
# contains "0"/"1" like the sample submission rather than "0.0"/"1.0".
pred = (prob > 0.5).int().flatten().numpy()

submission_df['Survived'] = pred
submission_df.to_csv('./submission_nn.csv', index=False)