In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [None]:
# Add data labels
def set_data_labels(ax, fmt='.0f'):
    """Write each bar's height just above the bar.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes whose ``patches`` are the bars to label.
    fmt : str, default '.0f'
        Format spec applied to each height (e.g. '.2f' for rates).
    """
    for p in ax.patches:
        height = p.get_height()
        # Skip patches that are not real bars: zero-width rectangles (recent
        # seaborn releases can register these on the axes as legend proxies)
        # and bars whose height is NaN. Labelling them would print stray
        # "0" / "nan" text on the plot.
        if p.get_width() == 0 or pd.isna(height):
            continue
        ax.annotate(format(height, fmt), (p.get_x() + p.get_width() / 2., height),
                    ha='center', va='bottom', fontsize=12, color='black')

# Global plot appearance: dark grid background with a pastel colour cycle
sns.set_style(style="darkgrid")
sns.set_palette(palette="pastel")

In [None]:
# Load the training and testing datasets (in that order) from the Kaggle input folder
train_df, test_df = map(pd.read_csv, (
    '/kaggle/input/titanic-survival-prediction/train.csv',
    '/kaggle/input/titanic-survival-prediction/test.csv',
))

## Exploratory Data Analysis (EDA)

In [None]:
# Preview the first five rows of the training set
train_df.head(n=5)

In [None]:
# Preview the first five rows of the test set
test_df.head(n=5)

In [None]:
# (rows, columns) of the training set
train_df.shape

In [None]:
# (rows, columns) of the test set
test_df.shape

In [None]:
# Number of missing values in each column of the test set
test_df.isna().sum()

In [None]:
# Survival Count: one bar per value of 'Survived', drawn on a tall, narrow figure
plt.figure(figsize=(4, 8))
ax = sns.countplot(data=train_df, x='Survived')
set_data_labels(ax)
ax.set_title('Survival Count')
plt.show()

In [None]:
# Survival Count by Pclass: bars grouped by class, coloured by 'Survived'
ax = sns.countplot(data=train_df, x='Pclass', hue='Survived')
set_data_labels(ax)
ax.set_title('Survival Count by Class')
plt.show()

In [None]:
# Survival Count by Sex: bars grouped by sex, coloured by 'Survived'
ax = sns.countplot(x='Sex', hue='Survived', data=train_df)
set_data_labels(ax)
# Title names the variable on the x-axis of this plot (Sex, not Class)
plt.title('Survival Count by Sex')
plt.show()

In [None]:
# Survival Rate by Class and Gender
ax = sns.barplot(x='Pclass', y='Survived', hue='Sex', data=train_df, errorbar=None)

# Label each bar with its height, using two decimals because the heights are rates
for p in ax.patches:
    height = p.get_height()
    # Skip patches that are not real bars: zero-width rectangles (recent
    # seaborn releases can register these on the axes as legend proxies)
    # and bars whose height is NaN. Labelling them would print stray
    # "0.00" / "nan" text on the plot.
    if p.get_width() == 0 or pd.isna(height):
        continue
    ax.annotate(f'{height:.2f}', (p.get_x() + p.get_width() / 2., height),
                ha='center', va='bottom', fontsize=12, color='black')

plt.title('Survival Rate by Class and Gender')
plt.show()

In [None]:
# Sibling/Spouse Count: how many passengers have each 'SibSp' value
ax = sns.countplot(data=train_df, x='SibSp', color='skyblue')
set_data_labels(ax)
ax.set_title('Sibling/Spouse Count')
plt.show()

In [None]:
# Parent/Child Count: how many passengers have each 'Parch' value
ax = sns.countplot(data=train_df, x='Parch', color='skyblue')
set_data_labels(ax)
ax.set_title('Parent/Child Count')
plt.show()

In [None]:
# Embarked Port Analysis: passenger count per embarkation port
ax = sns.countplot(data=train_df, x='Embarked')
set_data_labels(ax)
ax.set_title('Embarked Port Analysis')
plt.show()

In [None]:
# Age Distribution (rows with a missing age are left out of the histogram)
plt.hist(train_df['Age'].dropna(), bins=20, color='skyblue')
plt.gca().set(title='Age Distribution', xlabel='Age', ylabel='Frequency')
plt.show()

In [None]:
# Fare Distribution: 20-bin histogram of the ticket fare
plt.hist(train_df['Fare'], bins=20, color='skyblue')
plt.gca().set(title='Fare Distribution', xlabel='Fare', ylabel='Frequency')
plt.show()

In [None]:
# Correlation heatmap for the numeric columns
plt.figure(figsize=(10, 6))
numeric_corr = train_df.corr(numeric_only=True)
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.gca().set_title('Correlation Heatmap')
plt.show()

## Data Preprocessing

In [None]:
# Impute the missing values in the 'Age' column.
# The fill value is the training-set mean age (truncated to an int); it is
# computed once and reused for the test set, so no statistic comes from test data.
average_train_age = int(train_df['Age'].mean())
# Assign the result back rather than calling fillna(inplace=True) on the column
# selection: pandas deprecates that chained in-place form under Copy-on-Write.
train_df['Age'] = train_df['Age'].fillna(average_train_age)

# Same value as the training mean; the separate name is kept for compatibility.
average_test_age = average_train_age
test_df['Age'] = test_df['Age'].fillna(average_test_age)

In [None]:
# Remove every row that has no 'Embarked' or no 'Fare' value, in both frames
train_df = train_df.dropna(subset=['Embarked', 'Fare'])
test_df = test_df.dropna(subset=['Embarked', 'Fare'])

In [None]:
# Discard the 'Ticket' and 'Cabin' columns from both frames
train_df = train_df.drop(columns=['Ticket', 'Cabin'])
test_df = test_df.drop(columns=['Ticket', 'Cabin'])

## Feature Engineering

In [None]:
# Extract Title from Name: the run of letters that follows a space and ends at a '.'
# (raw strings keep the '\.' in the pattern from being an invalid string escape)
train_df['Title'] = train_df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test_df['Title'] = test_df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Age-band edges and the label for each band (shared by both frames)
bins = [0, 18, 30, 50, 80]
labels = ['Child', 'Young Adult', 'Adult', 'Senior']

# Apply the identical derivations to the training frame, then the test frame
for frame in (train_df, test_df):
    # Family Size: siblings/spouses + parents/children + the passenger
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

    # Is Alone: 1 when the passenger is the only member of the family group
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype(int)

    # Age Group: bucket the age into the bands above, stored as plain strings
    age_bands = pd.cut(frame['Age'], bins=bins, labels=labels, include_lowest=True)
    frame['AgeGroup'] = age_bands.astype(str)

    # Fare per Person
    frame['FarePerPerson'] = frame['Fare'] / frame['FamilySize']

    # Age Class Interaction
    frame['AgeClassInteraction'] = frame['Age'] * frame['Pclass']

    # Fare Class Interaction
    frame['FareClassInteraction'] = frame['Fare'] * frame['Pclass']

    # Age Group Class Interaction: e.g. 'Adult_3'
    frame['AgeGroupClassInteraction'] = frame['AgeGroup'] + '_' + frame['Pclass'].astype(str)

In [None]:
# Preview the training set with the engineered columns added
train_df.head(n=5)

## Encoding and Splitting the data

In [None]:
# One-hot encode the categorical columns (first level of each is dropped)
train_df = pd.get_dummies(
    train_df,
    columns=['Sex', 'Embarked', 'Title', 'AgeGroup', 'AgeGroupClassInteraction'],
    drop_first=True,
)

# Features are everything except the target, the id and the free-text name
X = train_df.drop(columns=['Survived', 'PassengerId', 'Name'])
y = train_df['Survived']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Model Training and Evaluation

In [None]:
# Logistic Regression: fit on the training split, then score the hold-out split
lg_model = LogisticRegression(max_iter=1500).fit(X_train, y_train)
lg_pred = lg_model.predict(X_test)
lg_accuracy = accuracy_score(y_test, lg_pred)
lg_report = classification_report(y_test, lg_pred)

print(f'Logistic Regression Accuracy: {lg_accuracy:.2f}')
print(f'Logistic Regression Report:\n{lg_report}')

In [None]:
# Random Forest: fit on the training split, then score the hold-out split
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_report = classification_report(y_test, rf_pred)

print(f'Random Forest Accuracy: {rf_accuracy:.2f}')
print(f'Random Forest Report:\n{rf_report}')

In [None]:
# Support Vector Machine (SVM): fit on the training split, then score the hold-out split
svm_model = SVC(random_state=42).fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)
svm_report = classification_report(y_test, svm_pred)

print(f'SVM Accuracy: {svm_accuracy:.2f}')
print(f'SVM Report:\n{svm_report}')

In [None]:
# Gradient Boosting: fit on the training split, then score the hold-out split
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)
gb_report = classification_report(y_test, gb_pred)

print(f'Gradient Boosting Accuracy: {gb_accuracy:.2f}')
print(f'Gradient Boosting Report:\n{gb_report}')

## Generating and Predicting New Data

In [None]:
# Raw attributes for 10 hypothetical passengers.
# Only the columns that exist in the original CSV are typed by hand. Every
# engineered column is derived below with the same formulas as the Feature
# Engineering section, so its values always agree with the raw columns.
data = {
    'PassengerId': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Pclass': [3, 1, 2, 1, 2, 3, 3, 1, 2, 3],
    'Name': [
        'Doe, Mr. John',
        'Smith, Mrs. Jane',
        'Johnson, Miss. Sarah',
        'Brown, Mr. Michael',
        'Davis, Miss. Laura',
        'Clark, Mr. James',
        'Evans, Mrs. Emily',
        'Lee, Mr. David',
        'Hall, Mr. Thomas',
        'Baker, Miss. Jessica'
    ],
    'Sex': ['male', 'female', 'female', 'male', 'female', 'male', 'female', 'male', 'male', 'female'],
    'Age': [30.0, 35.0, 20.0, 45.0, 25.0, 28.0, 35.0, 40.0, 50.0, 22.0],
    'SibSp': [0, 1, 1, 0, 1, 0, 0, 1, 0, 1],
    'Parch': [0, 1, 1, 1, 0, 0, 2, 2, 1, 0],
    'Fare': [8.0, 80.0, 20.0, 120.0, 30.0, 10.0, 40.0, 200.0, 50.0, 15.0],
    'Embarked': ['S', 'C', 'S', 'C', 'Q', 'S', 'S', 'C', 'S', 'S'],
}

# Create the DataFrame
new_data = pd.DataFrame(data)

# Title: the run of letters that follows a space and ends at a '.'
new_data['Title'] = new_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Family size (the passenger included) and the "travelling alone" flag
new_data['FamilySize'] = new_data['SibSp'] + new_data['Parch'] + 1
new_data['IsAlone'] = (new_data['FamilySize'] == 1).astype(int)

# Age bands. NOTE: these edges/labels must stay identical to the ones used in
# the Feature Engineering section. Intervals are right-inclusive, so an age of
# exactly 30 is 'Young Adult' and an age of exactly 50 is 'Adult'.
new_data['AgeGroup'] = pd.cut(
    new_data['Age'],
    bins=[0, 18, 30, 50, 80],
    labels=['Child', 'Young Adult', 'Adult', 'Senior'],
    include_lowest=True,
).astype(str)

# Ratio and interaction features
new_data['FarePerPerson'] = new_data['Fare'] / new_data['FamilySize']
new_data['AgeClassInteraction'] = new_data['Age'] * new_data['Pclass']
new_data['FareClassInteraction'] = new_data['Fare'] * new_data['Pclass']
new_data['AgeGroupClassInteraction'] = new_data['AgeGroup'] + '_' + new_data['Pclass'].astype(str)

In [None]:
# Drop the identifier columns that are not model features
X_new = new_data.drop(['PassengerId', 'Name'], axis=1)

# One-hot encode every categorical column that was encoded for training,
# including 'AgeGroupClassInteraction'.
# drop_first is deliberately NOT used here: it would drop the first category
# present in *this* small frame, which need not be the category that was
# dropped from the training data. The reindex below removes any dummy column
# the training matrix does not have.
X_new = pd.get_dummies(
    X_new,
    columns=['Sex', 'Embarked', 'Title', 'AgeGroup', 'AgeGroupClassInteraction'],
)

# Align to the training feature matrix: same columns, same order. Dummy columns
# for categories absent from new_data are added as all zeros, and columns
# unknown to the model are discarded.
X_new = X_new.reindex(columns=X.columns, fill_value=0)

# Predict using the logistic regression model
predictions = lg_model.predict(X_new)

# Add the predictions to the new_data DataFrame
new_data['PredictedSurvived'] = predictions

# Display the results
new_data

## Conclusion

After thorough analysis and model evaluation, we found that the Logistic Regression model provided the highest accuracy, so we used it to make predictions for the new data.

## Made by: Abdelrahman Eldaba