In [None]:
import pandas as pd

# Locations of the two CSV files this notebook works with
train_path = 'user_data/20250405_114119-38a81897-6191-4924-854a-e4fc9cc3ba8d/train.csv'
test_path = 'user_data/20250405_114119-38a81897-6191-4924-854a-e4fc9cc3ba8d/test.csv'

# Read both splits into DataFrames in one pass
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Preview the top of the training frame
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Exploring the training dataset: print each column's dtype and non-null count
# so that columns with missing values can be spotted before preprocessing.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Preprocessing the data: handle missing values in the training set.
#
# Plain column assignment is used instead of
# `train_data[col].fillna(..., inplace=True)`: the chained in-place form acts on
# an intermediate object, raises a pandas FutureWarning, and will no longer
# update `train_data` at all from pandas 3.0 onwards.

# Fill missing 'Age' values with the median age
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())

# Fill missing 'Embarked' values with the most common value
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

# Drop 'Cabin' due to too many missing values.
# errors='ignore' keeps this cell re-runnable after the column is already gone.
train_data = train_data.drop(columns=['Cabin'], errors='ignore')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)


In [None]:
# Preprocessing the data: handle missing values in the training set.
# Every step below is safe to run more than once: a fill on a column with no
# gaps is a no-op, and the drop tolerates an already-removed column.

# Fill missing 'Age' values with the median age
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())

# Fill missing 'Embarked' values with the most common value
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

# Drop 'Cabin' due to too many missing values.
# errors='ignore' avoids KeyError "['Cabin'] not found in axis" when the column
# has already been removed from `train_data`.
train_data = train_data.drop(columns=['Cabin'], errors='ignore')

KeyError: "['Cabin'] not found in axis"

In [None]:
# Remove 'Cabin' (too many missing values); a frame that no longer has the
# column passes through with its contents unchanged.
train_data = train_data.drop(columns=['Cabin'], errors='ignore')

In [None]:
# Feature engineering: extract the title (e.g. "Mr", "Mrs", "Miss") from each name.
# The pattern captures the run of letters that follows a space and precedes a period.
# - Raw string: '\.' in a normal string literal is an invalid escape sequence
#   and produces a warning on recent Python versions.
# - expand=False: return a Series rather than a one-column DataFrame, which is
#   the natural shape for a single-column assignment.
train_data['Title'] = train_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Display the first few rows to verify the extracted titles
train_data[['Name', 'Title']].head()

Unnamed: 0,Name,Title
0,"Braund, Mr. Owen Harris",Mr
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs
2,"Heikkinen, Miss. Laina",Miss
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs
4,"Allen, Mr. William Henry",Mr


In [None]:
# Turn the categorical columns into numbers on a fresh frame (`train_data` is left as is).

# Binary encoding for 'Sex'
sex_codes = {'male': 0, 'female': 1}
df = train_data.assign(Sex=train_data['Sex'].map(sex_codes))

# Indicator columns for 'Embarked' and 'Title'; the first level of each is
# dropped and acts as the implicit baseline
one_hot_columns = ['Embarked', 'Title']
df = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)

In [None]:
# Hold out part of the labelled data for validation
from sklearn.model_selection import train_test_split

# Target column, plus the columns that are not used as model inputs
target_column = 'Survived'
excluded_columns = [target_column, 'Name', 'Ticket', 'PassengerId']

y = df[target_column]
X = df.drop(columns=excluded_columns)

# 80/20 split with a fixed seed so the partition is reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Choosing the machine learning algorithm for training: a random forest classifier.
# accuracy_score is imported here and used when the model is evaluated on the validation set.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initializing the model with a fixed random_state so repeated runs are reproducible
model = RandomForestClassifier(random_state=42)

In [None]:
# Fit on the training split, then predict the held-out validation rows
# (fit() returns the fitted estimator, so the two calls can be chained)
y_pred = model.fit(X_train, y_train).predict(X_val)

# Share of validation passengers whose survival was predicted correctly
accuracy = accuracy_score(y_val, y_pred)
accuracy

0.8268156424581006

In [None]:
# Making predictions on the test dataset

# Reload the raw test data so this cell starts from an unmodified frame on every run
test_data = pd.read_csv(test_path)

# Apply the same feature engineering and encoding that the training data received.
# Raw string avoids the invalid '\.' escape; expand=False returns a Series.
test_data['Title'] = test_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test_data['Sex'] = test_data['Sex'].map({'male': 0, 'female': 1})

# Deliberately no drop_first here: the test set contains a different set of
# titles than the training set, so drop_first would discard a different
# baseline column (one the model actually expects). Every dummy is kept and
# the reindex below selects exactly the columns used during training.
test_data = pd.get_dummies(test_data, columns=['Embarked', 'Title'])

# Dropping unused columns like Name, Ticket, PassengerId and Cabin.
# `test_data` itself keeps 'PassengerId' for building the submission later.
X_test = test_data.drop(columns=['Name', 'Ticket', 'PassengerId', 'Cabin'])

# Filling missing values in Age and Fare.
# Assignment form instead of chained `X_test[col].fillna(..., inplace=True)`,
# which raises a FutureWarning and stops working in pandas 3.0.
X_test['Age'] = X_test['Age'].fillna(X_test['Age'].median())
X_test['Fare'] = X_test['Fare'].fillna(X_test['Fare'].median())

# Align to the exact feature set and column order the model was fitted on:
# dummies present only in training are added as all-zero columns, dummies
# present only in the test set (e.g. Title_Dona) are discarded. Without this
# step predict() raises "The feature names should match those that were passed
# during fit."
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Making predictions
predictions = model.predict(X_test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['Age'].fillna(X_test['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['Fare'].fillna(X_test['Fare'].median(), inplace=True)


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Title_Dona
Feature names seen at fit time, yet now missing:
- Title_Col
- Title_Countess
- Title_Don
- Title_Jonkheer
- Title_Lady
- ...


In [None]:
# Bring the test features in line with what the model was trained on.

# Fill any gaps in the numeric columns with that column's own median
for numeric_column in ('Age', 'Fare'):
    X_test[numeric_column] = X_test[numeric_column].fillna(X_test[numeric_column].median())

# Match the training layout: columns the test frame lacks are created filled
# with 0, and columns that were not part of training are left out
training_columns = X_train.columns
X_test = X_test.reindex(columns=training_columns, fill_value=0)

# Predict on the aligned test features
predictions = model.predict(X_test)

In [None]:
# Build the submission table and write it to disk
import pandas as pd

# One row per test passenger: the id next to the predicted label
submission_columns = {
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
}
submission = pd.DataFrame(submission_columns)

# index=False keeps the row index out of the CSV
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame as a final sanity check.
# Nothing is uploaded by this notebook; the 'submission.csv' file written
# earlier has to be submitted to Kaggle separately.
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1


In [None]:
# Print a closing status message to mark the end of the notebook.
# This cell only reports; it does not perform any upload itself.
print('The submission file has been prepared successfully. It is ready for submission to Kaggle.')

The submission file has been prepared successfully. It is ready for submission to Kaggle.
