In [2]:
import io

import pandas as pd
from google.colab import files
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [3]:
# Upload train.csv
# files.upload() returns a dict mapping each uploaded file name to its raw
# bytes. Colab may store the file under a different name when that name is
# already taken, so parse the uploaded bytes directly instead of re-reading a
# hard-coded path that could point at a stale copy from an earlier upload.
print("Please upload 'train.csv'")
uploaded_train = files.upload()
if not uploaded_train:
    # The upload dialog was cancelled; stop here rather than failing later.
    raise FileNotFoundError("No file was uploaded for 'train.csv'.")
train = pd.read_csv(io.BytesIO(next(iter(uploaded_train.values()))))
print("train.csv uploaded successfully!")


Please upload 'train.csv'


Saving train.csv to train.csv
train.csv uploaded successfully!


In [9]:
# Upload test.csv
# files.upload() returns a dict mapping each uploaded file name to its raw
# bytes. Colab may store the file under a different name when that name is
# already taken (e.g. "test (1) (2).csv"), so parse the uploaded bytes directly
# instead of re-reading a hard-coded 'test.csv' that could be a stale copy.
print("Please upload 'test.csv'")
uploaded_test = files.upload()
if not uploaded_test:
    # The upload dialog was cancelled; stop here rather than failing later.
    raise FileNotFoundError("No file was uploaded for 'test.csv'.")
test = pd.read_csv(io.BytesIO(next(iter(uploaded_test.values()))))
print("test.csv uploaded successfully!")


Please upload 'test.csv'


Saving test (1).csv to test (1) (2).csv
test.csv uploaded successfully!


In [10]:
# Upload gender_submission.csv
# files.upload() returns a dict mapping each uploaded file name to its raw
# bytes. Colab may store the file under a different name when that name is
# already taken (e.g. "gender_submission (1).csv"), so parse the uploaded bytes
# directly instead of re-reading a hard-coded path that could be a stale copy.
print("Please upload 'gender_submission.csv'")
uploaded_gender_submission = files.upload()
if not uploaded_gender_submission:
    # The upload dialog was cancelled; stop here rather than failing later.
    raise FileNotFoundError("No file was uploaded for 'gender_submission.csv'.")
gender_submission = pd.read_csv(io.BytesIO(next(iter(uploaded_gender_submission.values()))))
print("gender_submission.csv uploaded successfully!")


Please upload 'gender_submission.csv'


Saving gender_submission.csv to gender_submission (1).csv
gender_submission.csv uploaded successfully!


In [11]:
# Clean and encode the Age, Fare, Sex and Embarked columns of `train` and `test`.
#
# Every imputer and encoder is fit on `train` only and then applied to `test`,
# so both frames go through exactly the same transformation. When `test` lacks
# a column entirely, it is filled with the statistic learned from `train`
# (never with a slice of train rows, which would be unrelated to the test rows).


def _restore_column_case(frame, name):
    """Return `frame` with a lower-cased `name` column renamed to `name`.

    The frame is returned unchanged when `name` already exists or when no
    lower-cased variant of it is present.
    """
    lowered = name.lower()
    if name not in frame.columns and lowered in frame.columns:
        return frame.rename(columns={lowered: name})
    return frame


# Accept lower-cased headers in test (e.g. 'age' instead of 'Age').
for _column in ('Age', 'Fare', 'Sex', 'Embarked'):
    test = _restore_column_case(test, _column)

# --- Age: median imputation learned from train ---
age_imputer = SimpleImputer(strategy='median')
train['Age'] = age_imputer.fit_transform(train[['Age']]).ravel()
if 'Age' not in test.columns:
    # An all-NaN column lets the fitted imputer fill in the train median below.
    test['Age'] = float('nan')
    print("Warning: 'Age' column not found in test DataFrame. Filled with median value from train.")
test['Age'] = age_imputer.transform(test[['Age']]).ravel()

# --- Fare: median imputation learned from train (not from test) ---
fare_imputer = SimpleImputer(strategy='median')
train['Fare'] = fare_imputer.fit_transform(train[['Fare']]).ravel()
if 'Fare' not in test.columns:
    test['Fare'] = float('nan')
    print("Warning: 'Fare' column not found in test DataFrame. Filled with median value from train.")
test['Fare'] = fare_imputer.transform(test[['Fare']]).ravel()

# --- Cabin: not used as a feature ---
train = train.drop(columns=['Cabin'], errors='ignore')
test = test.drop(columns=['Cabin'], errors='ignore')

# --- Sex: label-encode with an encoder dedicated to this column ---
if 'Sex' not in test.columns:
    # Fill BEFORE encoding so the fallback value is a label the encoder knows.
    test['Sex'] = train['Sex'].mode()[0]
    print("Warning: 'Sex' or 'sex' column not found in test DataFrame. Filled with most frequent value from train.")
sex_encoder = LabelEncoder()
train['Sex'] = sex_encoder.fit_transform(train['Sex'])
test['Sex'] = sex_encoder.transform(test['Sex'])

# --- Embarked: most-frequent imputation, then label-encode ---
embarked_imputer = SimpleImputer(strategy='most_frequent')
train['Embarked'] = embarked_imputer.fit_transform(train[['Embarked']]).ravel()
if 'Embarked' in test.columns:
    # Missing ports in test would otherwise be unknown labels for the encoder.
    test['Embarked'] = embarked_imputer.transform(test[['Embarked']]).ravel()
else:
    # statistics_[0] is the most frequent train value learned by the imputer.
    test['Embarked'] = embarked_imputer.statistics_[0]
    print("Warning: 'Embarked' column not found in test DataFrame. Filled with most frequent value from train.")
embarked_encoder = LabelEncoder()
train['Embarked'] = embarked_encoder.fit_transform(train['Embarked'])
test['Embarked'] = embarked_encoder.transform(test['Embarked'])

# Backward-compatible alias: this name previously ended up fitted on Embarked.
label_encoder = embarked_encoder



In [18]:
# Feature Selection
# Build the training matrix X / target y and the matching test matrix X_test.
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = train[feature_cols]
y = train['Survived']

# Detect missing test columns BEFORE filling them, so the warning can actually
# fire; the list keeps the feature order for a stable message.
missing_cols = [col for col in feature_cols if col not in test.columns]
if missing_cols:
    print(f"Warning: The following columns are missing in the test DataFrame: {missing_cols}")
    print("Consider adding them or adjusting your feature selection.")
    # Placeholder value so prediction can still run with the full feature set.
    for col in missing_cols:
        test[col] = 0

# Same columns, same order as the training matrix.
X_test = test[feature_cols]

In [19]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [20]:
# Fit a logistic-regression classifier on the training split.
# max_iter caps the solver's iterations at 200; random_state fixes the seed.
logistic_model = LogisticRegression(
    max_iter=200,
    random_state=42,
)
logistic_model.fit(X_train, y_train)


In [21]:
# Score the fitted model on the held-out validation split.
y_val_pred = logistic_model.predict(X_val)

# Overall fraction of correct predictions.
logistic_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", logistic_accuracy)

# Per-class precision / recall / F1 summary.
validation_report = classification_report(y_val, y_val_pred)
print("Classification Report:")
print(validation_report)


Validation Accuracy: 0.8100558659217877
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [22]:
# Predict the survival label for every row of the prepared test features.
y_test_pred = logistic_model.predict(X=X_test)


In [25]:
# Prepare Submission File
# Use the first identifier column present in `test`, checked in this order.
id_column_candidates = ('PassengerId', 'passengerid', 'passenger_id', 'Loan_ID')
passenger_id_col = next(
    (name for name in id_column_candidates if name in test.columns),
    None,
)
if passenger_id_col is None:
    raise KeyError(f"Could not find Passenger ID column. Available columns: {test.columns}")

# Two-column frame: the identifier plus the predicted label.
submission = test[[passenger_id_col]].assign(Survived=y_test_pred)

# Save submission file
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'")

Submission file saved as 'submission.csv'
