# Loading the Dataset


In [None]:
import pandas as pd

# Read the three Titanic CSV files (paths are relative to the notebook's
# working directory) into DataFrames in a single pass.
train_data, test_data, gender_submission = map(
    pd.read_csv,
    ('Titanic/train.csv', 'Titanic/test.csv', 'Titanic/gender_submission.csv'),
)

# Preview the first rows of the training set as the cell output
train_data.head()

# Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics for the training data. Called with default arguments,
# so describe() reports count/mean/std/min/quartiles/max for the numeric
# columns only.
summary_stats = train_data.describe()
# Bare last expression: the notebook renders the table as the cell output
summary_stats

## Survived

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Class balance of the target: one bar per value of 'Survived'
survival_ax = sns.countplot(data=train_data, x='Survived')
survival_ax.set(title='Distribution of Survival', xlabel='Survived', ylabel='Count')
plt.show()

## Age Distribution

In [None]:
# Histogram (30 bins) with a KDE overlay of the known passenger ages;
# rows with a missing 'Age' are left out of the plot only, not of train_data.
known_ages = train_data['Age'].dropna()
age_ax = sns.histplot(known_ages, bins=30, kde=True)
age_ax.set(title='Age Distribution', xlabel='Age', ylabel='Frequency')
plt.show()

## Survival Count by Gender

In [None]:
# Survival counts with one bar per 'Sex' value inside each 'Survived' group
gender_ax = sns.countplot(data=train_data, x='Survived', hue='Sex')
gender_ax.set(title='Survival Counts by Gender', xlabel='Survived', ylabel='Count')
plt.show()

## Survival Count by Passenger Class

In [None]:
# Survival counts with one bar per 'Pclass' value inside each 'Survived' group
pclass_ax = sns.countplot(data=train_data, x='Survived', hue='Pclass')
pclass_ax.set(title='Survival Counts by Passenger Class', xlabel='Survived', ylabel='Count')
plt.show()

# Data Preprocessing
## Dealing with missing values

In [None]:
# Count the missing entries in every column of the training data
missing_values = train_data.isna().sum()
# Bare last expression: shown as the cell output
missing_values

## Impute missing values

In [None]:
# Impute missing 'Age' values with the column median.
# The result is assigned back rather than calling fillna(..., inplace=True)
# on the column selection: that form is chained assignment through an
# inplace method, which raises a FutureWarning on pandas 2.2+ and leaves
# train_data unchanged once Copy-on-Write is active.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())

# Impute missing 'Embarked' values with the most frequent value (the mode)
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

# Drop the 'Cabin' column due to a high number of missing values.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
train_data = train_data.drop(columns=['Cabin'], errors='ignore')

# Check for missing values again
train_data.isnull().sum()

## Inspect and handle outliers

In [None]:
# Inspecting the 'Fare' column for outliers
sns.boxplot(x=train_data['Fare'])
plt.title('Fare Boxplot')
plt.show()

# Remove 'Fare' outliers with the 1.5 * IQR rule: rows whose fare lies
# below Q1 - 1.5*IQR or above Q3 + 1.5*IQR are discarded.
# NOTE: the quartiles are recomputed from the current train_data, so
# re-running this cell trims the data again; run it once per fresh kernel.
Q1 = train_data['Fare'].quantile(0.25)
Q3 = train_data['Fare'].quantile(0.75)
IQR = Q3 - Q1
outlier_threshold = 1.5 * IQR
is_fare_outlier = (
    (train_data['Fare'] < (Q1 - outlier_threshold))
    | (train_data['Fare'] > (Q3 + outlier_threshold))
)
# .copy() makes the filtered frame independent of the original, so later
# column assignments on train_data do not raise SettingWithCopyWarning.
train_data = train_data.loc[~is_fare_outlier].copy()


## Normalization

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize 'Fare' and 'Age' with StandardScaler: learn the column
# statistics first, then overwrite both columns with the scaled values.
# NOTE(review): the scaler is fitted on all of train_data, i.e. before the
# train/validation/test split made later in the notebook — confirm whether
# that is acceptable or whether it should be fitted on the training split only.
scaled_columns = ['Fare', 'Age']
scaler = StandardScaler()
scaler.fit(train_data[scaled_columns])
train_data[scaled_columns] = scaler.transform(train_data[scaled_columns])

## One-hot encoding

In [None]:
# Replace each categorical column with indicator (dummy) columns;
# drop_first=True omits the first level of every encoded column.
categorical_columns = ['Sex', 'Embarked', 'Pclass']
train_data = pd.get_dummies(train_data, columns=categorical_columns, drop_first=True)


# Machine Learning Model
## Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Separate the features from the target. 'Name', 'Ticket' and 'PassengerId'
# are removed from the feature matrix along with the 'Survived' label.
X = train_data.drop(columns=['Survived', 'Name', 'Ticket', 'PassengerId'])
y = train_data['Survived']

# Two-step split into 70% train / 15% validation / 15% test.
# stratify keeps the proportion of survivors the same in every split, so the
# AUC values of the small validation and test sets are comparable with the
# training AUC; random_state=42 makes both splits reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)


## Training the model with LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Baseline classifier: logistic regression with the iteration cap set to 1000
model = LogisticRegression(max_iter=1000)

# Fit on the training split only; the fitted estimator is the cell output
model.fit(X=X_train, y=y_train)


## Model Evaluation

In [None]:
# Probability of the positive class (second predict_proba column) per split
y_train_pred, y_val_pred, y_test_pred = (
    model.predict_proba(features)[:, 1] for features in (X_train, X_val, X_test)
)

# ROC AUC of those probabilities against the true labels of each split
train_auc, val_auc, test_auc = (
    roc_auc_score(labels, scores)
    for labels, scores in (
        (y_train, y_train_pred),
        (y_val, y_val_pred),
        (y_test, y_test_pred),
    )
)

print(f"Training AUC: {train_auc}")
print(f"Validation AUC: {val_auc}")
print(f"Test AUC: {test_auc}")

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: five values of C crossed with two solvers
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear'],
}

# Exhaustive search over the grid, scored by ROC AUC with 5-fold
# cross-validation, run on the training split only
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination and its cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation AUC: {best_score}")


## Training the Final Model with Best Parameters

In [None]:
# Refit a logistic regression on the training split with the tuned parameters
best_model = LogisticRegression(max_iter=1000, **best_params).fit(X_train, y_train)

# Positive-class probabilities from the tuned model for every split
final_train_pred, final_val_pred, final_test_pred = (
    best_model.predict_proba(features)[:, 1] for features in (X_train, X_val, X_test)
)

# ROC AUC of the tuned model on each split
final_train_auc, final_val_auc, final_test_auc = (
    roc_auc_score(labels, scores)
    for labels, scores in (
        (y_train, final_train_pred),
        (y_val, final_val_pred),
        (y_test, final_test_pred),
    )
)

print(f"Final Training AUC: {final_train_auc}")
print(f"Final Validation AUC: {final_val_auc}")
print(f"Final Test AUC: {final_test_auc}")
