In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Loading dataset

df = pd.read_csv("tested.csv")

# Drop unnecessary data
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Encode categorical variables
df["Sex"] = LabelEncoder().fit_transform(df["Sex"])
df["Embarked"] = df["Embarked"].fillna("S")
df["Embarked"] = LabelEncoder().fit_transform(df["Embarked"])

# Fill missing numerical values with mean as few places
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Features
X = df.drop("Survived", axis=1)
y = df["Survived"]

# Train-test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=200
                                                   )

# Training the model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Results
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        58
           1       1.00      1.00      1.00        26

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("tested.csv")  # Replace with the actual path

# Drop irrelevant columns
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors='ignore')

# Encode categorical variables
df["Sex"] = LabelEncoder().fit_transform(df["Sex"])
df["Embarked"] = df["Embarked"].fillna("S")
df["Embarked"] = LabelEncoder().fit_transform(df["Embarked"])

# Fill missing numerical values with column means
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Features and target
X = df.drop("Survived", axis=1)
y = df["Survived"]

# Train-test split with shuffle to avoid sequential data issues
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

# Train logistic regression model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Preview random training samples
sample_data = X_train.copy()
sample_data["Survived"] = y_train
print("\nSample training rows:")
print(sample_data.sample(10, random_state=1))

# Check for duplicates between training and test sets
duplicates = pd.merge(X_test, X_train, how='inner')
print(f"\nNumber of duplicate rows between train and test sets: {len(duplicates)}")

# Check feature correlation with target
print("\nCorrelation with Survived:")
print(df.corr(numeric_only=True)["Survived"].sort_values(ascending=False))
# Combine test features with true labels
test_data = X_test.copy()
test_data["Survived"] = y_test

# Display a random sample of test rows
print("\n📋 Sample Test Data with True Labels:")
print(test_data.sample(10, random_state=2))

# Add actual and predicted labels to the test set
comparison_df = X_test.copy()
comparison_df["Actual_Survived"] = y_test
comparison_df["Predicted_Survived"] = model.predict(X_test)

# Show 10 random rows for verification
print(comparison_df.sample(10, random_state=3))




Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84


Sample training rows:
     Pclass  Sex       Age  SibSp  Parch     Fare  Embarked  Survived
83        3    1  30.27259      0      0   7.8958         2         0
49        3    0  36.00000      0      2  15.9000         2         1
107       3    1  30.27259      0      0   7.7500         1         0
35        3    1  18.50000      0      0   7.2292         0         0
328       2    1  29.00000      1      0  26.0000         2         0
281       3    1   0.75000      1      1  13.7750         2         0
262       2    0  29.00000      0      2  23.0000         2         1
85        3    1  30.27259      1      0  14.4542   