In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import os

#from google.colab import drive
#drive.mount("/content/drive")

In [4]:
# Folder containing the input CSV files (relative to the working directory).
directory = 'events'

# os.listdir() returns names in arbitrary, platform-dependent order, so the
# names are sorted: otherwise the positional split below could assign
# different files to train/test from one run or machine to the next.
csv_files = sorted(f for f in os.listdir(directory) if f.endswith('.csv'))

# Split files into training and testing
N_TRAIN_FILES = 12  # the first 12 files (sorted by name) are used for training
N_TEST_FILES = 5    # the following 5 files are held out for testing
train_files = csv_files[:N_TRAIN_FILES]
test_files = csv_files[N_TRAIN_FILES:N_TRAIN_FILES + N_TEST_FILES]

# Function to load data from CSV files
def load_data(files, directory):
    """Read the named CSV files from ``directory`` and stack them row-wise.

    Args:
        files: Iterable of CSV file names, each joined onto ``directory``.
        directory: Folder that contains the files.

    Returns:
        One DataFrame holding the rows of every file, in the order the files
        were given, with a fresh 0..n-1 index.
    """
    frames = [pd.read_csv(os.path.join(directory, name)) for name in files]
    return pd.concat(frames, ignore_index=True)

# Load and split data into features and target variable
train_data = load_data(train_files, directory)
test_data = load_data(test_files, directory)
print(train_data)
print(test_data)

# Per-row training weights: rows whose w_depth is below the threshold get
# weight 1, every other row (including NaN targets) gets weight 2.
# np.where evaluates this in one vectorized pass instead of calling a Python
# lambda once per row.
DEPTH_WEIGHT_THRESHOLD = 0.2
train_data['sample_weight'] = np.where(
    train_data['w_depth'] < DEPTH_WEIGHT_THRESHOLD, 1, 2
)
sample_weight = train_data['sample_weight']

# Columns excluded from the model inputs: the target (w_depth) plus FID_,
# FULLNAME and DateTime. Defined once so train and test drop the same set.
NON_FEATURE_COLUMNS = ['w_depth', 'FID_', 'FULLNAME', 'DateTime']

# sample_weight only exists on the training frame, so it is dropped there only.
X_train = train_data.drop(columns=NON_FEATURE_COLUMNS + ['sample_weight'])
y_train = train_data['w_depth']
X_test = test_data.drop(columns=NON_FEATURE_COLUMNS)
y_test = test_data['w_depth']


# Fit the scaler on the training features only, then apply the same
# transformation to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

          FID_             FULLNAME   RH     DateTime  MAX15      HR_2  \
0            0  E LITTLE CREEK ROAD  0.0  Jun16_05_01    0.0  0.000000   
1            1  E LITTLE CREEK ROAD  0.0  Jun16_05_01    0.0  0.000000   
2            2  E LITTLE CREEK ROAD  0.0  Jun16_05_01    0.0  0.000000   
3            3  E LITTLE CREEK ROAD  0.0  Jun16_05_01    0.0  0.000000   
4            4  E LITTLE CREEK ROAD  0.0  Jun16_05_01    0.0  0.000000   
...        ...                  ...  ...          ...    ...       ...   
6493120  17497       CHILDRENS LANE  0.0  May18_06_23    0.0  0.002941   
6493121  17498       CHILDRENS LANE  0.0  May18_06_23    0.0  0.002891   
6493122  17499       FAIRFAX AVENUE  0.0  May18_06_23    0.0  0.003018   
6493123  17500       FAIRFAX AVENUE  0.0  May18_06_23    0.0  0.003077   
6493124  17501          WOOD STREET  0.0  May18_06_23    0.0  0.006269   

            HR_72  w_depth  TD_HR       ELV          DTW       TWI  
0        0.000000   0.0000 -0.126  3.44965

In [None]:
# Hyperparameters for the forest. max_features, min_samples_split,
# min_samples_leaf and max_samples are not passed, so the library defaults apply.
rf_params = {
    'n_estimators': 50,   # number of trees
    'max_depth': 30,      # maximum depth of each tree
    'random_state': 42,   # fixed seed
}
rf_model = RandomForestRegressor(**rf_params)

# Fit on the scaled training features, weighting rows by sample_weight.
rf_model.fit(X_train_scaled, y_train, sample_weight=sample_weight)



In [None]:
# Training split: predict with the fitted forest, then score against the targets.
y_pred_rf_train = rf_model.predict(X_train_scaled)
rf_train_r2 = r2_score(y_train, y_pred_rf_train)

# Held-out split: same two steps on the test features.
y_pred_rf_test = rf_model.predict(X_test_scaled)
rf_test_r2 = r2_score(y_test, y_pred_rf_test)

# Report both scores, training first.
print(f'Random Forest - Training R^2: {rf_train_r2}')
print(f'Random Forest - Testing R^2: {rf_test_r2}')

Random Forest - Training R^2: 0.9939097277103367
Random Forest - Testing R^2: 0.9396058349358315


In [None]:
# Boost the random forest with AdaBoost.
# scikit-learn renamed `base_estimator` to `estimator` (deprecated in 1.2,
# removed in 1.4), so try the current keyword first and fall back to the
# legacy one on older installations where `estimator` is not accepted.
try:
    ab_model = AdaBoostRegressor(estimator=rf_model, n_estimators=50, random_state=42)
except TypeError:
    ab_model = AdaBoostRegressor(base_estimator=rf_model, n_estimators=50, random_state=42)

# Fit on the scaled training features (no explicit sample weights are passed here).
ab_model.fit(X_train_scaled, y_train)



In [None]:

# AdaBoost predictions for the training features first, then the test features.
y_pred_ab_train, y_pred_ab_test = (
    ab_model.predict(features) for features in (X_train_scaled, X_test_scaled)
)

In [None]:

# R^2 of the AdaBoost predictions on the training and held-out sets.
ab_train_r2 = r2_score(y_train, y_pred_ab_train)
ab_test_r2 = r2_score(y_test, y_pred_ab_test)


# Print the AdaBoost scores computed above (ab_*), not the Random Forest ones.
print(f'AdaBoost - Training R^2: {ab_train_r2}')
print(f'AdaBoost - Testing R^2: {ab_test_r2}')

AdaBoost - Training R^2: 0.7307827983853854
AdaBoost - Testing R^2: 0.7347184852799835
