In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import warnings

In [None]:
# Read the training and test CSV files; both paths are relative to the
# current working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

In [24]:
# Preprocess `train_data` / `test_data` (loaded in the previous cell):
# report missing values, impute them, derive `Has_Cabin`, drop identifier-like
# columns and one-hot encode the categorical columns.

# Check for null values (printed before imputation so the raw gaps are visible)
print("Train Data Null Values:\n", train_data.isnull().sum())
print("\nTest Data Null Values:\n", test_data.isnull().sum())

# Imputation statistics are taken from the training set only and then applied
# to both frames, so the test set is filled with the same values the model
# was trained around.
age_median = train_data['Age'].median()
fare_median = train_data['Fare'].median()
embarked_mode = train_data['Embarked'].mode()[0]

# Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
# the chained in-place form is deprecated in recent pandas and no longer
# updates the parent frame when Copy-on-Write is enabled.
train_data['Age'] = train_data['Age'].fillna(age_median)
test_data['Age'] = test_data['Age'].fillna(age_median)

# Fill missing values for Embarked with the most common training value
train_data['Embarked'] = train_data['Embarked'].fillna(embarked_mode)
test_data['Embarked'] = test_data['Embarked'].fillna(embarked_mode)

# The test set can contain a missing Fare (the null report above shows it);
# left unfilled it would put NaN in front of any model applied to test_data.
train_data['Fare'] = train_data['Fare'].fillna(fare_median)
test_data['Fare'] = test_data['Fare'].fillna(fare_median)

# Create a new feature 'Has_Cabin' indicating whether Cabin information is available
train_data['Has_Cabin'] = train_data['Cabin'].notnull().astype(int)
test_data['Has_Cabin'] = test_data['Cabin'].notnull().astype(int)

# Drop columns that are not used as model features
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_data = train_data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)

# Convert categorical variables to numerical using one-hot encoding
categorical_columns = ['Sex', 'Embarked']
train_data = pd.get_dummies(train_data, columns=categorical_columns)
test_data = pd.get_dummies(test_data, columns=categorical_columns)

# Give the test frame exactly the training feature columns, in the same order:
# a category absent from the test set becomes an all-zero column, and a
# category unseen during training is dropped.
feature_columns = train_data.columns.drop('Survived')
test_data = test_data.reindex(columns=feature_columns, fill_value=0)

# Build the feature matrix / target vector from `train_data`, hold out a
# validation split, and standardise the numerical columns.

# Columns that get standardised below
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare']

# Fail fast with a clear message if any column to be scaled is absent;
# the scaling step below would otherwise raise a less specific KeyError.
missing_features = [col for col in numerical_features if col not in train_data.columns]
if missing_features:
    raise KeyError(f"Columns missing from train_data: {missing_features}")

# Split data into features (X) and target variable (y)
X_train = train_data.drop('Survived', axis=1)
y_train = train_data['Survived']

# Hold out 20% of the labelled rows for validation (fixed seed for reproducibility)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Work on explicit copies: the split frames are derived from X_train by
# indexing, and assigning columns into them directly can trigger pandas'
# SettingWithCopyWarning.
X_train_split = X_train_split.copy()
X_val_split = X_val_split.copy()

# Scale numerical features: fit the scaler on the training split only, then
# apply the same transformation to the validation split.
scaler = StandardScaler()
X_train_split[numerical_features] = scaler.fit_transform(X_train_split[numerical_features])
X_val_split[numerical_features] = scaler.transform(X_val_split[numerical_features])

# Fit a logistic-regression baseline on the training split. Every warning
# raised while fitting is suppressed by the surrounding filter.
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    logistic_model.fit(X_train_split, y_train_split)

# Accuracy on the rows the model was fitted on
train_predictions = logistic_model.predict(X_train_split)
train_accuracy = accuracy_score(y_train_split, train_predictions)
print(f"Training Accuracy: {train_accuracy}")

# Accuracy on the held-out split. The `test_*` names and the printed label
# refer to the validation split (X_val_split / y_val_split), not to test.csv.
test_predictions = logistic_model.predict(X_val_split)
test_accuracy = accuracy_score(y_val_split, test_predictions)
print(f"Testing Accuracy: {test_accuracy}")


Train Data Null Values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Test Data Null Values:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
Training Accuracy: 0.800561797752809
Testing Accuracy: 0.8212290502793296
