1. Load and Explore the Data

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Load the data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Explore the data
print(train_data.head())
print(train_data.info())
print(train_data.describe())

# Check for missing values
print(train_data.isnull().sum())
print(test_data.isnull().sum())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c

2. Data Preprocessing

In [3]:
# Handle missing values
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
train_data.drop(columns=['Cabin'], inplace=True)

test_data['Age'].fillna(test_data['Age'].median(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)
test_data.drop(columns=['Cabin'], inplace=True)

# Convert categorical variables into numerical format
train_data = pd.get_dummies(train_data, columns=['Sex', 'Embarked'])
test_data = pd.get_dummies(test_data, columns=['Sex', 'Embarked'])

# Drop unnecessary columns
train_data.drop(columns=['Name', 'Ticket', 'PassengerId'], inplace=True)
test_data.drop(columns=['Name', 'Ticket'], inplace=True)


3. Define Features and Target Variable


In [4]:
# Define features and target variable
X = train_data.drop(columns=['Survived'])
y = train_data['Survived']


4. Split the Data


In [5]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)


5. Build and Train the Model


In [6]:
from sklearn.linear_model import LogisticRegression

# Build and train the model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


6. Make Predictions


In [7]:
# Make predictions on the validation set
val_predictions = model.predict(X_val)

# Make predictions on the test set
X_test = test_data.drop(columns=['PassengerId'])
test_predictions = model.predict(X_test)

# Save the predictions for submission
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': test_predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")


Your submission was successfully saved!


7. Evaluate the Model


In [8]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Calculate the accuracy on the validation set
accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy: {accuracy:.2f}")

# Generate a confusion matrix
conf_matrix = confusion_matrix(y_val, val_predictions)
print("Confusion Matrix:")
print(conf_matrix)

# Generate a classification report
class_report = classification_report(y_val, val_predictions)
print("Classification Report:")
print(class_report)


Validation Accuracy: 0.80
Confusion Matrix:
[[92 14]
 [22 51]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.87      0.84       106
           1       0.78      0.70      0.74        73

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179



8. Interpret the Results

In [9]:
# Analyze model coefficients
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_[0]})
coefficients = coefficients.sort_values(by='Coefficient', ascending=False)
print(coefficients)

# Discuss the model’s performance and suggest potential improvements
print("The model's performance can be evaluated based on accuracy, confusion matrix, and classification report.")
print("Potential improvements might include more feature engineering, trying different models, or tuning hyperparameters.")


      Feature  Coefficient
5  Sex_female     1.399883
8  Embarked_Q     0.172893
7  Embarked_C     0.124790
3       Parch     0.066585
4        Fare     0.001034
1         Age    -0.036701
9  Embarked_S    -0.172046
2       SibSp    -0.364222
0      Pclass    -1.134429
6    Sex_male    -1.274246
The model's performance can be evaluated based on accuracy, confusion matrix, and classification report.
Potential improvements might include more feature engineering, trying different models, or tuning hyperparameters.
