In [155]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

In [156]:
# Load the dataset
df = pd.read_csv("0000000000002429_training_titanic_x_y_train.csv")

# Drop columns that are not useful for prediction
df.drop(["Name", "Ticket"], axis=1, inplace=True)

# Convert 'Survived' to binary values
df['Sex'] = df['Sex'].astype('category').cat.codes
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

# Fill missing values in 'Age' with mean age, then convert to integer
df.fillna({'Age': df['Age'].mean()}, inplace=True)

df['Cabin'] = df['Cabin'].str.extract('([A-Za-z])')[0].astype('category').cat.codes
df.fillna({'Cabin': df['Cabin'].mean()}, inplace=True)

# Feature engineering: create squared terms for each feature to capture non-linear patterns
# for col in df.columns[:-1]:
#     df[f"{col}_{col}"] = df[col] ** 2

In [157]:
# Split dataset into features (X) and target (y)
x = df.drop('Survived', axis=1).values
y = df['Survived'].values

# Standardize features for better model performance
scaler = StandardScaler()
x = scaler.fit_transform(x)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [158]:
# Train logistic regression model
model = LogisticRegression(C=0.1, max_iter=10000, solver='lbfgs', random_state=42)
model.fit(x_train, y_train)

In [159]:
# Print accuracy scores
print("Training Score :", model.score(x_train, y_train))
print("Testing Score :", model.score(x_test, y_test))

y_pred = model.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Training Score : 0.7902621722846442
Testing Score : 0.8208955223880597
[[64  9]
 [15 46]]
              precision    recall  f1-score   support

           0       0.81      0.88      0.84        73
           1       0.84      0.75      0.79        61

    accuracy                           0.82       134
   macro avg       0.82      0.82      0.82       134
weighted avg       0.82      0.82      0.82       134



In [160]:
# # Load the dataset
df = pd.read_csv("0000000000002429_test_titanic_x_test.csv")

# Drop columns that are not useful for prediction
df.drop(["Name", "Ticket", "Cabin"], axis=1, inplace=True)

# Convert categorical variables to numerical codes
df['Sex'] = df['Sex'].astype('category').cat.codes
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

# Fill missing values in 'Age' with mean age, then convert to integer
df.fillna({'Age': df['Age'].mean()}, inplace=True)

# Feature engineering: create squared terms for each feature to capture non-linear patterns
# for col in df.columns:
#     df[f"{col}_{col}"] = df[col] ** 2

In [161]:
# Split dataset into features (X) and target (y)
x = scaler.transform(df.values)

# Make predictions on the test set
predictions = model.predict(x)

# Save predictions to CSV file
np.savetxt("predictions.csv", predictions, delimiter=",", comments='', fmt='%d')

ValueError: X has 8 features, but StandardScaler is expecting 9 features as input.