<a href="https://colab.research.google.com/github/Yabudere/Yabu/blob/main/Titanic_ml_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime so that the CSV files stored
# under /content/drive/MyDrive can be read by the loading cell below.
# This import only exists on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# **Load datasets**

In [None]:
# Read the three CSV files from the mounted Drive in one pass; the unpacking
# order matches the order of the paths listed below.
train_df, test_df, gender_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/train (2).csv',
        '/content/drive/MyDrive/test.csv',
        '/content/drive/MyDrive/gender_submission (2).csv',
    )
)

# **Display the first few rows of the training dataset**

In [None]:
# Preview the top of the training frame (head() shows 5 rows by default).
train_df.head()

**Check for missing data**

In [None]:
# info() prints each column's dtype and non-null count.
train_df.info()
# Per-column count of missing values; as the last expression of the cell,
# this is what gets displayed as the cell output.
train_df.isnull().sum()

### **Check the statistical summary to identify potential outliers**

In [None]:
# Summary statistics of the training frame; compare min/max against the
# quartiles to spot candidate outliers.
train_df.describe()

**Visualize distributions to detect outliers**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw one box plot per numeric column of interest; each plot is shown
# before the next is drawn so they render as separate figures.
for column in ('Age', 'Fare'):
    sns.boxplot(data=train_df, x=column)
    plt.show()

**Handle missing values: fill 'Age' with the median age, fill 'Embarked' with the mode, and drop 'Cabin'**

In [None]:
# Fill missing ages with the median age.
# The result is assigned back to the column instead of calling
# `train_df['Age'].fillna(..., inplace=True)`: that form mutates a temporary
# Series, raises a FutureWarning on pandas 2.x, and leaves `train_df`
# untouched once Copy-on-Write is enabled.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())

# Fill missing values in 'Embarked' with the mode
# (mode() returns a Series, so [0] takes its first entry).
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])

# Drop the 'Cabin' column due to a large number of missing values.
# errors='ignore' keeps the cell re-runnable after 'Cabin' is already gone.
train_df = train_df.drop(columns=['Cabin'], errors='ignore')

# Verify that there are no missing values left
train_df.isnull().sum()

**Convert categorical variables into dummy/indicator variables**

In [None]:
# One-hot encode 'Sex' and 'Embarked' (drop_first=True omits the first level
# of each), then remove the columns that are not used as model features.
train_df = (
    pd.get_dummies(train_df, columns=['Sex', 'Embarked'], drop_first=True)
    .drop(columns=['Name', 'Ticket', 'PassengerId'])
)

# Show the first rows of the processed training frame
train_df.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target (y) is the 'Survived' column; features (X) are every other column
y = train_df['Survived']
X = train_df.drop(columns=['Survived'])

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the Logistic Regression model and fit it on the training split in one
# step (fit returns the fitted estimator itself).
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# Bare name so the cell still displays the fitted estimator
logreg

**Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Build the Decision Tree with a fixed seed and fit it on the training split
# in one step (fit returns the fitted estimator itself).
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
# Bare name so the cell still displays the fitted estimator
decision_tree

**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build the Random Forest with a fixed seed and fit it on the training split
# in one step (fit returns the fitted estimator itself).
random_forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)
# Bare name so the cell still displays the fitted estimator
random_forest

**Model Evaluation**

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, X_val, y_val):
    """Score a fitted classifier on a validation set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_val: Validation features passed to ``model.predict``.
        y_val: True labels the predictions are compared against.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` in that order.
    """
    predictions = model.predict(X_val)
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_val, predictions) for scorer in scorers)

# Score each fitted baseline model on the validation split.
# Each result is an (accuracy, precision, recall, f1) tuple.
logreg_metrics = evaluate_model(logreg, X_val, y_val)
decision_tree_metrics = evaluate_model(decision_tree, X_val, y_val)
random_forest_metrics = evaluate_model(random_forest, X_val, y_val)

# Print one summary line per model, in the order the models were trained
for model_label, model_scores in (
    ("Logistic Regression", logreg_metrics),
    ("Decision Tree", decision_tree_metrics),
    ("Random Forest", random_forest_metrics),
):
    print(f"{model_label} - Accuracy: {model_scores[0]}, Precision: {model_scores[1]}, Recall: {model_scores[2]}, F1-score: {model_scores[3]}")

 **Model Tuning**

In [None]:
# Model Tuning

from sklearn.model_selection import GridSearchCV

# Candidate Random Forest hyperparameters: 3 * 4 * 3 * 3 = 108 combinations
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search with 3-fold cross-validation, using all available cores
grid_search = GridSearchCV(
    estimator=random_forest,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep the winning estimator and the parameters that produced it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Score the tuned forest on the held-out validation split
best_model_metrics = evaluate_model(best_model, X_val, y_val)
print(f"Best Random Forest - Accuracy: {best_model_metrics[0]}, Precision: {best_model_metrics[1]}, Recall: {best_model_metrics[2]}, F1-score: {best_model_metrics[3]}")

 **Evaluating and Tuning Random Forest Model Performance Using GridSearchCV**

In [None]:
from sklearn.model_selection import GridSearchCV

# The Random Forest grid search over this same parameter grid was already
# fitted in the "Model Tuning" cell. Repeating it here would refit all
# 108 candidates x 3 folds on the same seeded estimator and data, so the
# fitted search is reused instead of being run a second time.
# Guard against out-of-order execution: `grid_search` is rebound to a
# Logistic Regression search later in the notebook.
assert isinstance(grid_search.best_estimator_, RandomForestClassifier), (
    "grid_search does not hold the Random Forest search; "
    "run the 'Model Tuning' cell before this one"
)

# Best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print(f"Best parameters: {best_params}")
# No `scoring` was passed to GridSearchCV, so candidates were ranked by the
# classifier's default score (mean accuracy across the CV folds).
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Evaluate best model on the held-out validation split
best_model_metrics = evaluate_model(best_model, X_val, y_val)
print(f"Best Random Forest - Accuracy: {best_model_metrics[0]}, Precision: {best_model_metrics[1]}, Recall: {best_model_metrics[2]}, F1-score: {best_model_metrics[3]}")

**Evaluation Metrics for the Best Model**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Use seaborn's "whitegrid" theme for the plots that follow
sns.set(style="whitegrid")

# Tune Logistic Regression over four values of C with 3-fold cross-validation.
# NOTE: this rebinds `param_grid`, `grid_search` and `best_model`, which the
# Random Forest tuning cells above also assign.
param_grid = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid,
    cv=3,
).fit(X_train, y_train)

# Estimator refitted with the best-scoring C
best_model = grid_search.best_estimator_

# Predict once on the validation split, then compute all four metrics
y_pred = best_model.predict(X_val)
accuracy, precision, recall, f1 = (
    score_fn(y_val, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Metric names and their scores, listed in matching order
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score']
values = [accuracy, precision, recall, f1]

# One bar per metric, drawn on an explicitly created 8x6 figure
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=metrics, y=values, palette='viridis', ax=ax)

# Write each score, rounded to two decimals, just above its bar
for bar_index, score in enumerate(values):
    ax.text(bar_index, score + 0.02, f"{score:.2f}", ha='center', va='bottom', fontsize=12)

# Axis labels, title and a fixed 0-1 score range
ax.set_xlabel('Metrics', fontsize=14)
ax.set_ylabel('Score', fontsize=14)
ax.set_title('Evaluation Metrics for the Best Model', fontsize=16)
ax.set_ylim(0, 1)  # all four metrics lie between 0 and 1

# Tighten the layout and render the figure
fig.tight_layout()
plt.show()
