In [None]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score

In [None]:
# Read the raw dataset into a DataFrame.
# Replace the placeholder path below with the actual location of your CSV file.
csv_path = "the_path_to_the_data.csv"
data = pd.read_csv(csv_path)

# Apply any cleaning / transformation steps the dataset needs here
# ....
# ....
# ....

In [None]:
# Columns (features) that the model should NOT be trained on; they are meant to be removed.
# Replace these placeholders with real column names from your dataset.
# Each entry must name a distinct column -- listing the same name more than once adds nothing.
X_drop_cols = ["variable_1", "variable_2", "variable_3", "variable_4", "variable_5"]

In [None]:
# Build the feature matrix (X) and the label vector (y).
# Replace "target_variable" with the actual target column name.
target_col = "target_variable"

# Every column except the excluded ones and the target becomes a feature.
excluded_cols = X_drop_cols + [target_col]
X = data.drop(columns=excluded_cols)

y = data[target_col]

In [None]:
# Three-way split of the data:
#   - 80% -> training set
#   - 10% -> validation set
#   - 10% -> hold-out test set (stands in for `new data` fed to the already trained model)

# Step 1: set aside 20% of the rows; the remaining 80% is the training set.
X_train, X_test_val, y_train, y_test_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 2: halve the set-aside 20% into the validation set and the test set.
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val,
    y_test_val,
    test_size=0.5,
    random_state=42,
)

# `random_state=42` makes the split reproducible across sessions and computers.

##### Improve model performance by using different combinations of hyperparameters

In [None]:
# Hyperparameter search space: candidate values for each parameter being tuned
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Create a RandomForestClassifier model (seeded so repeated runs build the same forest)
rf_model = RandomForestClassifier(random_state=42)

# Use GridSearchCV for exhaustive hyperparameter tuning with 5-fold cross-validation.
# NOTE: GridSearchCV does not accept a `random_state` argument (that belongs to
# RandomizedSearchCV); passing one raises a TypeError. Every combination in
# `param_grid` is evaluated, so the search itself has no sampling to seed.
grid_search = GridSearchCV(
    rf_model,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
)

# Run the search: fits one model per parameter combination per fold on the training data
grid_search.fit(X_train, y_train)

# Get the best model with tuned hyperparameters (the candidate that achieved the best
# cross-validated accuracy). With the default `refit=True`, the search has already
# retrained this estimator on the entire training set, so no extra `.fit` call is needed.
best_rf_model = grid_search.best_estimator_

# `best_params_` holds only the tuned values selected by the search;
# best_rf_model.get_params() would list every parameter, including untouched defaults.
print("Best Hyperparameters:", grid_search.best_params_)

# Predict labels for the validation set (used for model evaluation)
y_pred_val = best_rf_model.predict(X_val)

# Compare the predictions against the true validation labels using accuracy
validation_accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_val)
print("Random Forest model validation accuracy:", validation_accuracy)

#### New data (test dataset)

In [None]:
# Final evaluation: predict labels for the held-out test set, which the model has never seen
y_pred_test = best_rf_model.predict(X_test)

# Compare the predictions against the true test labels using accuracy
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print("Random Forest model Test Accuracy:", test_accuracy)