# Process Data
The processing consists of three steps:
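1. Remove unnecessary columns, keeping only Pclass, Sex, and Age.
2. Process data: fill missing ages with the median age, scale Age (divided by 100) and Pclass ((Pclass - 1) / 2) to the range 0 to 1, and encode Sex as 0 (female) and 1 (male).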
3. Shuffle and split data for training and validation.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
def ProcessData(data):
    """Build the feature table (Pclass, Sex, Age) from a raw passenger table.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "Pclass", "Sex" and "Age". The input frame
        is not modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly the columns "Pclass", "Sex", "Age" where
        * Age has missing values replaced by the median age of ``data`` and
          is then divided by 100,
        * Pclass is rescaled as ``(Pclass - 1) / 2``,
        * Sex is encoded as 0 for "female" and 1 for "male".

    Raises
    ------
    ValueError
        If the Sex column holds anything other than "female" or "male"
        (missing values included).
    """
    # Fixed codes instead of a LabelEncoder fitted on each call: a fitted
    # encoder numbers whatever labels happen to be present, so a frame that
    # contains only one sex would get a different code than the same label in
    # a frame with both. These codes equal what LabelEncoder yields when both
    # labels are present (it sorts the labels alphabetically).
    sex_codes = {"female": 0, "male": 1}

    processed_data = data[["Pclass", "Sex", "Age"]].copy()  # keep only the necessary columns

    # Fill missing ages with the median of the frame being processed.
    median_age = processed_data['Age'].median()
    processed_data['Age'] = processed_data['Age'].fillna(median_age)
    processed_data['Age'] = processed_data['Age'] / 100  # lands in [0, 1] as long as ages are within 0..100
    processed_data['Pclass'] = (processed_data['Pclass'] - 1) / 2  # lands in [0, 1] for classes 1..3

    encoded_sex = processed_data['Sex'].map(sex_codes)
    unmapped = encoded_sex.isna()
    if unmapped.any():
        # Fail loudly rather than let NaN codes flow into training data.
        unexpected = sorted({str(value) for value in processed_data.loc[unmapped, 'Sex']})
        raise ValueError(
            "Unexpected value(s) in 'Sex' column: " + ", ".join(unexpected)
        )
    processed_data['Sex'] = encoded_sex.astype("int64")

    return processed_data

In [3]:
# Load the raw training and test tables.
original = pd.read_csv("data/original.csv")
original_test = pd.read_csv("data/original_test.csv")

# Labels come from the raw training table; features come from ProcessData.
processed_data_labels = original['Survived']
processed_data = ProcessData(original)
processed_test_data = ProcessData(original_test)
print(
    "Original shape:", original.shape,
    "\nProcessed shape:", processed_data.shape,
)
print(
    "Original test shape:", original_test.shape,
    "\nProcessed test shape:", processed_test_data.shape,
)

# Persist the processed tables (row index is not written).
for _table, _csv_path in (
    (processed_data, "data/processed_data.csv"),
    (processed_data_labels, "data/processed_data_labels.csv"),
    (processed_test_data, "data/processed_test_data.csv"),
):
    _table.to_csv(_csv_path, index=False)

# Shuffle, then hold out 12% of the processed training rows; the fixed seed
# keeps the split reproducible between runs.
(train_data, train_test,
 train_data_labels, train_test_labels) = train_test_split(
    processed_data,
    processed_data_labels,
    test_size=0.12,
    random_state=12,
    shuffle=True,
)
print("Train data shape:", train_data.shape, train_data_labels.shape)
print("Train test shape:", train_test.shape, train_test_labels.shape)

# Persist both sides of the split for the training step.
for _table, _csv_path in (
    (train_data, "data/train_data.csv"),
    (train_data_labels, "data/train_data_labels.csv"),
    (train_test, "data/train_test.csv"),
    (train_test_labels, "data/train_test_labels.csv"),
):
    _table.to_csv(_csv_path, index=False)


Original shape: (891, 12) 
Processed shape: (891, 3)
Original test shape: (418, 11) 
Processed test shape: (418, 3)
Train data shape: (784, 3) (784,)
Train test shape: (107, 3) (107,)
