In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Load data
titanic_data = pd.read_csv('titanic.csv')

# Data Cleaning
titanic_data.drop(['Cabin', 'Name', 'Ticket'], axis=1, inplace=True)  # Drop unnecessary columns
titanic_data['Age'].fillna(titanic_data['Age'].median(), inplace=True)  # Fill missing Age with median
titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0], inplace=True)  # Fill missing Embarked

# Feature Encoding
titanic_data = pd.get_dummies(titanic_data, columns=['Sex', 'Embarked'], drop_first=True)
titanic_data.replace({True: 1, False: 0}, inplace=True)
print(titanic_data)
# Define X and y
X = titanic_data.drop('Survived', axis=1)
y = titanic_data['Survived']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

# Fit the model
rf_clf.fit(X_train, y_train)

# Predictions
y_pred = rf_clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Feature Importance
importances = rf_clf.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values(by='Importance', ascending=False)
print("Feature Importances:\n", importance_df)


     PassengerId  Survived  Pclass   Age  SibSp  Parch     Fare  Sex_male  \
0              1         0       3  22.0      1      0   7.2500         1   
1              2         1       1  38.0      1      0  71.2833         0   
2              3         1       3  26.0      0      0   7.9250         0   
3              4         1       1  35.0      1      0  53.1000         0   
4              5         0       3  35.0      0      0   8.0500         1   
..           ...       ...     ...   ...    ...    ...      ...       ...   
886          887         0       2  27.0      0      0  13.0000         1   
887          888         1       1  19.0      0      0  30.0000         0   
888          889         0       3  28.0      1      2  23.4500         0   
889          890         1       1  26.0      0      0  30.0000         1   
890          891         0       3  32.0      0      0   7.7500         1   

     Embarked_Q  Embarked_S  
0             0           1  
1             0

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['Age'].fillna(titanic_data['Age'].median(), inplace=True)  # Fill missing Age with median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0], inplace=True)  # Fill missing Embarked
  titanic_data.replace

In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load data
titanic_data = pd.read_csv('titanic.csv')

# Data Cleaning
titanic_data.drop(['Cabin', 'Name', 'Ticket'], axis=1, inplace=True)  # Drop unnecessary columns
titanic_data['Age'].fillna(titanic_data['Age'].median(), inplace=True)  # Fill missing Age with median
titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0], inplace=True)  # Fill missing Embarked

# Feature Encoding
titanic_data = pd.get_dummies(titanic_data, columns=['Sex', 'Embarked'], drop_first=True)

# Define X and y
X = titanic_data.drop('Survived', axis=1)
y = titanic_data['Survived']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV with cross-validation
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid)

# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator from GridSearchCV
best_params = grid_search.best_params_
best_rf_clf = grid_search.best_estimator_

print("Best Parameters from GridSearchCV:", best_params,best_rf_clf)

# Make predictions with the best model
y_pred = best_rf_clf.predict(X_test)

# Evaluate the best model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['Age'].fillna(titanic_data['Age'].median(), inplace=True)  # Fill missing Age with median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0], inplace=True)  # Fill missing Embarked


Best Parameters from GridSearchCV: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100} RandomForestClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=10,
                       random_state=42)
Accuracy: 0.8156424581005587
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.90      0.85       105
           1       0.83      0.70      0.76        74

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.80       179
weighted avg       0.82      0.82      0.81       179

Confusion Matrix:
 [[94 11]
 [22 52]]
