In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Read the training set into a DataFrame.
# 'train.csv' is a relative path, so it is resolved against the current working directory.
train_data = pd.read_csv(
    filepath_or_buffer='train.csv',
)

In [3]:
# Encode Sex as an integer feature: male -> 1, female -> 0.
#
# Series.map turns every value that is missing from the mapping into NaN, so
# re-running this cell once the column already holds 0/1 would blank out the
# whole column. Guard on the dtype so the mapping is applied only while the
# column still contains the raw text labels; later runs are a no-op.
if not pd.api.types.is_numeric_dtype(train_data['Sex']):
    train_data['Sex'] = train_data['Sex'].map({'male': 1, 'female': 0})

In [4]:
# Keep only the numeric columns of the training frame
numeric_data = train_data.select_dtypes(include='number')

# Pairwise correlation between every pair of numeric columns
correlation_matrix = numeric_data.corr()

# Pull out the 'Survived' column of the matrix and list it from the most
# positive correlation down to the most negative
survived_correlation = (
    correlation_matrix
    .loc[:, 'Survived']
    .sort_values(ascending=False)
)
print(survived_correlation)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Sex           -0.543351
Name: Survived, dtype: float64


## Random Forest

In [5]:
# Modelling frame: numeric columns only, and only rows without any missing value
numeric_train_data = train_data.select_dtypes(include=[np.number]).dropna()

# Target is 'Survived'; every other remaining numeric column is a feature.
# NOTE(review): this keeps PassengerId among the features - confirm that is intended.
y = numeric_train_data['Survived']
X = numeric_train_data.drop(columns='Survived')

# Hold out 20% of the rows for evaluation (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Seeded forest of 510 trees with no depth limit
random_forest_model = RandomForestClassifier(
    max_depth=None,
    random_state=72,
    n_estimators=510,
)

# Fit on the training split, then predict the held-out split
# (fit returns the fitted estimator, so the calls can be chained)
y_pred = random_forest_model.fit(X_train, y_train).predict(X_test)

# Fraction of held-out rows predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.8041958041958042


## Random search

In [6]:
# Modelling frame: numeric columns only, and only rows without any missing value
numeric_train_data = train_data.select_dtypes(include=[np.number]).dropna()

# Target is 'Survived'; every other remaining numeric column is a feature
y = numeric_train_data['Survived']
X = numeric_train_data.drop(columns='Survived')

# Hold out 20% of the rows; the search below only ever sees the training split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Candidate values per hyperparameter; the search samples combinations of these
param_distributions = dict(
    n_estimators=[10, 50, 100, 200, 500],
    max_depth=[None, 10, 20, 30, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Base estimator whose hyperparameters are being tuned
rf = RandomForestClassifier(random_state=42)

# Randomized search: 100 sampled settings, each scored with 5-fold
# cross-validation, run on all available cores, logging every individual fit
rf_random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Run the search on the training split
rf_random_search.fit(X_train, y_train)

# Keep both the winning setting and the estimator built with it
best_rf = rf_random_search.best_estimator_
best_params = rf_random_search.best_params_

# Show the winning hyperparameters as the cell output
best_params

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[CV] END bootstrap=False, max_depth=50, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=False, max_depth=50, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_lea

{'n_estimators': 50,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_depth': 10,
 'bootstrap': True}

## Random Forest with new hyperparameters

In [7]:
# Drop non-numeric columns
numeric_train_data = train_data.select_dtypes(include=[np.number])

# Drop any rows with missing values
numeric_train_data = numeric_train_data.dropna()

# Define the features (X) and the target (y)
X = numeric_train_data.drop('Survived', axis=1)
y = numeric_train_data['Survived']

# Split the dataset into training and testing sets (same seed and ratio as the
# earlier cells, so the accuracies are measured on the same held-out rows)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier with the hyperparameters reported by
# the randomized search.
random_forest_model = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=5,
    min_samples_leaf=2,
    max_depth=10,
    bootstrap=True,
    # A random forest is stochastic (bootstrap samples, feature sub-sampling).
    # Without a seed each run grows a different forest and the accuracy printed
    # below changes from run to run. 42 matches the seed of the estimator that
    # was tuned in the randomized search.
    random_state=42,
)
# Train the model
random_forest_model.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred = random_forest_model.predict(X_test)

# Calculate the accuracy (fraction of held-out rows predicted correctly)
accuracy = accuracy_score(y_test, y_pred)

# Output the accuracy
print(accuracy)

0.8111888111888111
