# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv("train.csv")

# Impute missing ages with the column median. The result is assigned back
# rather than using fillna(inplace=True) on the column selection: the chained
# in-place form raises a FutureWarning in pandas 2.x and leaves df unchanged
# once copy-on-write is enabled.
df["Age"] = df["Age"].fillna(df["Age"].median())

# Impute missing embarkation ports with the most frequent one before encoding.
# Otherwise the NaN survives the mapping below and the estimator rejects the
# feature matrix. NOTE(review): assumes the CSV has a few missing Embarked
# entries, as the public Titanic training file does -- confirm for this data.
embarked_mode = df["Embarked"].mode()
if not embarked_mode.empty:
    df["Embarked"] = df["Embarked"].fillna(embarked_mode.iloc[0])

# Convert categorical variables into numerical variables
# (labels not listed in a mapping become NaN)
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
df["Embarked"] = df["Embarked"].map({"S": 0, "C": 1, "Q": 2})

# Drop identifier / free-text columns that are not used as features
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Split the dataset into training and testing sets:
# "Survived" is the target, every remaining column is a feature.
X = df.drop("Survived", axis=1)
y = df["Survived"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train a logistic regression model.
# The features are not scaled, so the default budget of 100 solver iterations
# is typically exhausted before convergence (ConvergenceWarning). A larger
# max_iter lets the optimiser finish.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Report overall accuracy, then the per-class precision / recall / F1 table
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    "Classification Report:",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

# Plot the distribution of each feature on a 2x3 grid of panels.
plt.figure(figsize=(10, 6))

# One entry per panel, in grid order: (column, histogram bin count).
# A bin count of None marks a column drawn as a bar chart of the row counts
# for the encoded values 0 and 1 instead of a histogram.
panel_specs = [
    ("Pclass", 3),
    ("Age", 10),
    ("Sex", None),
    ("SibSp", 5),
    ("Parch", 5),
]

for slot, (column, n_bins) in enumerate(panel_specs, start=1):
    plt.subplot(2, 3, slot)
    if n_bins is None:
        code_counts = [int((df[column] == code).sum()) for code in (0, 1)]
        plt.bar([0, 1], code_counts)
    else:
        plt.hist(df[column], bins=n_bins, edgecolor="black")
    plt.title(f"{column} Distribution")