In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load data
data = pd.read_csv("../sample/2023_smartFarm_AI_hackathon_dataset.csv")

# Drop columns with all zeros
cols_to_drop = data.columns[data.sum(axis=0) == 0]
data.drop(columns=cols_to_drop, inplace=True)

# Convert date column to datetime format
data['date'] = pd.to_datetime(data['date'])

# Convert frmDist to categorical
data['frmDist'] = data['frmDist'].astype('category')

# Interpolate zeros for each zone
for zone in data['frmDist'].cat.categories:
    mask = (data['frmDist'] == zone)
    zone_data = data[mask]
    
    # Set 0 values to NaN
    zone_data = zone_data.where(zone_data != 0, np.nan)
    
    # Apply forward fill
    interpolated_zone_data = zone_data.fillna(method='ffill')
    
    # Apply backward fill for any remaining NaNs
    interpolated_zone_data = interpolated_zone_data.fillna(method='bfill')
    
    # Update the main dataframe
    data.loc[mask] = interpolated_zone_data

# Handle any residual NaN after interpolation
data.fillna(0, inplace=True)

# Splitting data
X = data.drop(columns=['outtrn_cumsum', 'HeatingEnergyUsage_cumsum', 'date', 'frmDist'])
y = data[['outtrn_cumsum', 'HeatingEnergyUsage_cumsum']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions
y_pred = rf.predict(X_test)

# Evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R2 Score: {r2}")


RMSE: 52362.63606104533
R2 Score: 0.9917412381122765


In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load data
data = pd.read_csv("../sample/2023_smartFarm_AI_hackathon_dataset.csv")

# Drop columns with all zeros
cols_to_drop = data.columns[data.sum(axis=0) == 0]
data.drop(columns=cols_to_drop, inplace=True)

# Convert date column to datetime format
data['date'] = pd.to_datetime(data['date'])

# Convert frmDist to categorical
data['frmDist'] = data['frmDist'].astype('category')

# Interpolate zeros for each zone
for zone in data['frmDist'].cat.categories:
    mask = (data['frmDist'] == zone)
    zone_data = data[mask]
    
    # Set 0 values to NaN
    zone_data = zone_data.where(zone_data != 0, np.nan)
    
    # Apply forward fill
    interpolated_zone_data = zone_data.fillna(method='ffill')
    
    # Apply backward fill for any remaining NaNs
    interpolated_zone_data = interpolated_zone_data.fillna(method='bfill')
    
    # Update the main dataframe
    data.loc[mask] = interpolated_zone_data

# Handle any residual NaN after interpolation
data.fillna(0, inplace=True)

# Splitting data
X = data.drop(columns=['outtrn_cumsum', 'HeatingEnergyUsage_cumsum', 'date', 'frmDist'])
y = data[['outtrn_cumsum', 'HeatingEnergyUsage_cumsum']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions
y_pred = rf.predict(X_test)

# Evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R2 Score: {r2}")


RMSE: 52362.63606104533
R2 Score: 0.9917412381122765


In [13]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# Load data
data = pd.read_csv("../sample/2023_smartFarm_AI_hackathon_dataset.csv")

# Drop columns with all zeros
cols_to_drop = data.columns[data.sum(axis=0) == 0]
data.drop(columns=cols_to_drop, inplace=True)

# Convert date column to datetime format
data['date'] = pd.to_datetime(data['date'])

# Convert frmDist to categorical
data['frmDist'] = data['frmDist'].astype('category')

# Interpolation for zeros
for zone in data['frmDist'].cat.categories:
    mask = (data['frmDist'] == zone)
    zone_data = data[mask].copy()
    
    # Replace 0 values with NaN
    zone_data = zone_data.replace(0, np.nan)
    
    # Apply forward fill
    interpolated_zone_data = zone_data.fillna(method='ffill')
    
    # Apply backward fill for any remaining NaNs
    interpolated_zone_data = interpolated_zone_data.fillna(method='bfill')
    
    # Update the main dataframe
    data.loc[mask] = interpolated_zone_data

# Fill any residual NaN after interpolation
data.fillna(0, inplace=True)

# Splitting data
features = data.drop(columns=['outtrn_cumsum', 'HeatingEnergyUsage_cumsum', 'date', 'frmDist'])
target = data[['outtrn_cumsum', 'HeatingEnergyUsage_cumsum']]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train CatBoost for the first target
model1 = CatBoostRegressor(verbose=0)
model1.fit(X_train, y_train['outtrn_cumsum'])

# Predict the first target on train and test sets
X_train['outtrn_pred'] = model1.predict(X_train)
X_test['outtrn_pred'] = model1.predict(X_test)

# Train CatBoost for the second target using the predictions as an additional feature
model2 = CatBoostRegressor(verbose=0)
model2.fit(X_train, y_train['HeatingEnergyUsage_cumsum'])

# Predictions
y_pred1 = model1.predict(X_test.drop(columns=['outtrn_pred']))
y_pred2 = model2.predict(X_test)

# Evaluation
rmse1 = np.sqrt(mean_squared_error(y_test['outtrn_cumsum'], y_pred1))
rmse2 = np.sqrt(mean_squared_error(y_test['HeatingEnergyUsage_cumsum'], y_pred2))
r2_1 = r2_score(y_test['outtrn_cumsum'], y_pred1)
r2_2 = r2_score(y_test['HeatingEnergyUsage_cumsum'], y_pred2)

print(f"RMSE for outtrn_cumsum: {rmse1}")
print(f"R2 Score for outtrn_cumsum: {r2_1}")
print(f"RMSE for HeatingEnergyUsage_cumsum: {rmse2}")
print(f"R2 Score for HeatingEnergyUsage_cumsum: {r2_2}")


RMSE for outtrn_cumsum: 2048.501143499915
R2 Score for outtrn_cumsum: 0.9976682782916031
RMSE for HeatingEnergyUsage_cumsum: 71172.34785581533
R2 Score for HeatingEnergyUsage_cumsum: 0.9865338120408076
