In [2]:
import pandas as pd

In [22]:
csv_file_path = "tested.csv"
data = pd.read_csv('tested.csv')

In [24]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [26]:
# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [28]:
# Drop irrelevant columns (PassengerId, Name, Ticket, Cabin)
data_cleaned = data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

In [60]:
# Handle missing values:
imputer_age = SimpleImputer(strategy="median")
data_cleaned["Age"] = imputer_age.fit_transform(data_cleaned[["Age"]])

imputer_embarked = SimpleImputer(strategy="most_frequent")
data_cleaned["Embarked"] = imputer_embarked.fit_transform(data_cleaned[["Embarked"]]).ravel()

In [62]:
# Encode categorical variables: 'Sex' and 'Embarked'
label_encoder_sex = LabelEncoder()
data_cleaned["Sex"] = label_encoder_sex.fit_transform(data_cleaned["Sex"])

label_encoder_embarked = LabelEncoder()
data_cleaned["Embarked"] = label_encoder_embarked.fit_transform(data_cleaned["Embarked"])

In [64]:
# Split features and target variable
X = data_cleaned.drop(columns=["Survived"])
y = data_cleaned["Survived"]

In [66]:
# Scale numerical features ('Age' and 'Fare') using StandardScaler
scaler = StandardScaler()
X[["Age", "Fare"]] = scaler.fit_transform(X[["Age", "Fare"]])

In [68]:
# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
X_train.head(), y_train.head()

(     Pclass  Sex       Age  SibSp  Parch      Fare  Embarked
 336       2    1  0.189203      0      0 -0.405211         2
 31        2    1 -0.441286      2      0 -0.073910         2
 84        2    1 -0.204852      0      0 -0.446251         1
 287       1    1 -0.441286      1      0  0.835227         2
 317       2    1 -0.835341      0      0 -0.449981         2,
 336    0
 31     0
 84     0
 287    0
 317    0
 Name: Survived, dtype: int64)

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [86]:
# Check for any remaining missing values in the dataset
missing_values = X_train.isnull().sum()
missing_values

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [88]:
# Initialize and train a logistic regression model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)


In [90]:
# Make predictions on the test data
y_pred = model.predict(X_test)

In [92]:
# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [94]:
accuracy, precision, recall, f1, conf_matrix

(1.0,
 1.0,
 1.0,
 1.0,
 array([[50,  0],
        [ 0, 34]], dtype=int64))

In [96]:
imputer_fare = SimpleImputer(strategy="median")
X_train["Fare"] = imputer_fare.fit_transform(X_train[["Fare"]])
X_test["Fare"] = imputer_fare.transform(X_test[["Fare"]])


In [98]:
# Train the logistic regression model again
model.fit(X_train, y_train)

In [100]:
# Make predictions and evaluate performance
y_pred = model.predict(X_test)


In [102]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

accuracy, precision, recall, f1, conf_matrix

(1.0,
 1.0,
 1.0,
 1.0,
 array([[50,  0],
        [ 0, 34]], dtype=int64))

In [104]:
# The logistic regression model achieved perfect performance, with an accuracy, precision, recall, and F1 score of 100%
# matrix shows that all 50 passengers who did not survive and all 34 passengers who survived were correctly predicted.