In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

In [10]:
# Read the training and testing datasets from CSV files
training_dataset = pd.read_csv('/content/test.csv')
testing_dataset = pd.read_csv('/content/test.csv')


In [90]:
# Duplicate the training dataset for later use
complete_data = pd.read_csv('/content/train.csv')

In [91]:
# Total number of entries in the training dataset
total_entries = 891

In [92]:
# Create a random order permutation of the total entries
random_order = np.random.permutation(total_entries)

In [93]:
# Split ratio for training and validation data
ratio = 0.7

In [94]:
# Calculate the index to split the data into training and validation sets
train_end = int(total_entries * ratio)

In [95]:
# Divide the random order indices into training and validation IDs
train_ids = random_order[:train_end]
validate_ids = random_order[train_end:]

In [96]:
# Create training and validation subsets from the complete data
training_subset = complete_data.iloc[train_ids]
validation_subset = complete_data.iloc[validate_ids]

In [97]:
# Print the sizes of the training and validation subsets
print("Training subset size:", len(training_subset))
print("Validation subset size:", len(validation_subset))

Training subset size: 623
Validation subset size: 268


In [98]:
# Add a new column 'IsChild' based on the 'Age' column
complete_data['IsChild'] = (complete_data['Age'] <= 15).astype(int)

In [99]:
# Fill missing values in the 'Embarked' column with the mode value
complete_data['Embarked'].fillna(complete_data['Embarked'].mode()[0], inplace=True)


In [100]:
# Initialize a SimpleImputer to fill missing 'Age' values with the mean
age_imputer = SimpleImputer(strategy='mean')
complete_data['Age'] = age_imputer.fit_transform(complete_data['Age'].values.reshape(-1, 1))


In [101]:
# Replace categorical values in 'Sex' and 'Embarked' columns with numerical values
complete_data['Sex'] = complete_data['Sex'].replace({'male': 0, 'female': 1})
complete_data['Embarked'] = complete_data['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2})


In [102]:
# Drop unnecessary columns from the complete dataset
complete_data.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)


In [103]:
# Define features and target variables
features = complete_data.drop('Survived', axis=1)
target = complete_data['Survived']

In [104]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=11)


In [105]:
# Create a dictionary of models to be tested
model_dict = {
    "Logistic Regression": LogisticRegression(random_state=11, max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=11),
    "Random Forest": RandomForestClassifier(random_state=11)
}

In [106]:
# Loop through the models and fit them to the training data
for name, model in model_dict.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = accuracy_score(y_test, predictions)
    print(f"Accuracy for {name}: {score}")

Accuracy for Logistic Regression: 0.8582089552238806
Accuracy for Decision Tree: 0.7089552238805971
Accuracy for Random Forest: 0.8097014925373134


In [107]:
# Create a Decision Tree model for further analysis
dt_model = DecisionTreeClassifier(random_state=11)

In [108]:
# Perform cross-validation on the Decision Tree model
cross_val_scores = cross_val_score(dt_model, X_train, y_train, cv=10)
print("Decision Tree Cross-Validation Accuracy:", cross_val_scores.mean())


Decision Tree Cross-Validation Accuracy: 0.7560931899641576


In [109]:
# Fit the Decision Tree model to the training data
dt_model.fit(X_train, y_train)

In [110]:
# Calculate feature importances for the Decision Tree model
importances = dt_model.feature_importances_
print("Decision Tree Feature Importance:")
for feature, importance in zip(features.columns, importances):
    print(f"{feature}: {importance}")

Decision Tree Feature Importance:
PassengerId: 0.22726608511844604
Pclass: 0.11020614526611024
Sex: 0.26642838547818015
Age: 0.18017855151093387
SibSp: 0.026352695859398274
Parch: 0.018550874596509104
Fare: 0.13776249019582693
Embarked: 0.021541391521268532
IsChild: 0.01171338045332682


In [111]:
# Define a parameter grid for hyperparameter tuning
parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [112]:
# Create a grid search to find the best hyperparameters for the Decision Tree model
grid = GridSearchCV(dt_model, parameters, cv=5)
grid.fit(X_train, y_train)

In [113]:
# Get the optimized Decision Tree model with the best parameters
optimized_dt = grid.best_estimator_
print("Optimized Decision Tree Parameters:", grid.best_params_)


Optimized Decision Tree Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}


In [114]:
# Make predictions using the optimized Decision Tree model on the test data
optimized_predictions = optimized_dt.predict(X_test)
optimized_score = accuracy_score(y_test, optimized_predictions)
print("Optimized Decision Tree Accuracy:", optimized_score)

Optimized Decision Tree Accuracy: 0.7835820895522388


In [115]:
# Read the submission data from the test dataset
submission_data = pd.read_csv('/content/test.csv')


In [116]:
# Add 'IsChild' column to the submission data
submission_data['IsChild'] = (submission_data['Age'] <= 15).astype(int)


In [119]:
# Prepare the submission data for prediction by dropping unnecessary columns
X_submission = submission_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])


In [120]:
# Fit the optimized Decision Tree model to the complete dataset
optimized_dt.fit(features, target)

In [124]:
# Create the 'IsChild' feature for the submission data
submission_data['IsChild'] = (submission_data['Age'] <= 15).astype(int)


In [125]:
# Replace categorical values with numerical values in the submission data
submission_data['Sex'] = submission_data['Sex'].replace({'male': 0, 'female': 1})
submission_data['Embarked'] = submission_data['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2})


In [126]:
# Impute missing values in the 'Age' column of submission data
submission_data['Age'] = age_imputer.transform(submission_data['Age'].values.reshape(-1, 1))


In [131]:
# Drop unnecessary columns from the submission data
X_submission = submission_data.drop(columns=['Name', 'Ticket', 'Cabin'])


In [132]:
# Ensure that the 'PassengerId' column is present in X_submission
# creating a placeholder 'PassengerId' column with zeros
X_submission['PassengerId'] = 0


In [133]:
# Reorder the columns to match the order used during training
X_submission = X_submission[features.columns]


In [135]:
# Check for and handle any remaining missing values in the dataset
X_submission.fillna(0, inplace=True)  # Fill NaN values with zeros (or you can choose an appropriate value)


In [136]:
# Make predictions on the submission data
submission_predictions = optimized_dt.predict(X_submission)

In [137]:
# Add the 'Survived' column to the submission data
submission_data['Survived'] = submission_predictions

In [138]:
# Save the submission data to a CSV file
submission_data[['PassengerId', 'Survived']].to_csv('Decision_Tree_submission.csv', index=False)


Queries

Data Interpretation: Utilize visual tools such as pie charts or bar graphs to demonstrate survival proportions in different demographic groups. This aids in visually representing the survival ratio within distinct segments.

Data Grouping: For calculating survival rates, segment the data by key demographics like age, gender, and socioeconomic status. An example would be assessing the survival percentage in each segment.

Insight: A significant insight could be the higher survival rates of women and children. This can be substantiated by data indicating greater survival percentages for these groups.

Comparative Survival Across Socioeconomic Tiers:

Data Interpretation: Employ bar graphs to delineate survival differences among various socioeconomic tiers, such as first, second, and third classes. This method effectively illustrates the disparity in survival outcomes.

Data Grouping: To evaluate the survival rate per socioeconomic tier, categorize the data accordingly and ascertain the survival percentage in each tier.

Finding: A notable finding might be the markedly superior survival rate of first-class passengers. This can be inferred from data illustrating a distinct survival advantage in this group.

Age-Based Survival Analysis:

Data Interpretation: To explore the interplay between age and survival, apply histograms that merge age distribution with survival data. This approach facilitates the identification of age-related survival trends.

Data Grouping: To study survival patterns related to age, organize the data into age brackets (such as children, young adults, middle-aged, and seniors) and calculate survival rates for each bracket.

Observation: It might be observed that younger passengers had an enhanced likelihood of survival. Analysis suggests that younger age groups exhibited higher survival rates.

Gender-Based Survival Analysis:

Data Interpretation: To contrast survival rates across genders, employ bar graphs or pie charts to depict the survival percentages of males and females.

Data Grouping: Determine survival rates by gender, calculating them separately for males and females.

Finding: It might be observed that female passengers experienced higher survival rates than male passengers, as indicated by the data.

Impact of Family Ties:

Data Interpretation: Investigate how familial connections affected survival by comparing the survival rates of passengers with families versus those traveling alone.

Data Grouping: Analyze survival rates by categorizing passengers based on family size, differentiating between those with and without families.

Conclusion: A possible conclusion is that passengers traveling with family members had a marginal survival advantage, as suggested by the data indicating slightly better survival rates for these passengers.

Through these analytical methods, valuable insights can be gleaned from the Titanic dataset, uncovering trends and determinants that impacted passenger survival during the catastrophe.