# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the raw hackathon CSV into a DataFrame
data = pd.read_csv('/mnt/data/2023_smartFarm_AI_hackathon_dataset.csv')

# Parse the 'date' column (cast to text, then read as YYYYMMDD) into timestamps
date_as_text = data['date'].astype(str)
data['date'] = pd.to_datetime(date_as_text, format='%Y%m%d')

# Discard every column in which 100% of the rows are equal to zero
zero_counts = (data == 0).sum()
zero_percentage = zero_counts / len(data) * 100
all_zero_columns = [
    name for name, percent in zero_percentage.items() if percent == 100
]
data_cleaned = data.drop(columns=all_zero_columns)

# Calendar month -> season name lookup, built from one month group per season
seasons = {
    month_number: season_name
    for season_name, month_numbers in (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Fall', (9, 10, 11)),
    )
    for month_number in month_numbers
}

# Derive temporal features from the parsed date
timestamps = data_cleaned['date'].dt
data_cleaned['day_of_week'] = timestamps.dayofweek
data_cleaned['month'] = timestamps.month
data_cleaned['season'] = data_cleaned['month'].map(seasons)

# Define predictors and target variable.
# `y_heating` is consumed by the split below but was never assigned in this
# script (NameError when run top to bottom). It is taken here from
# 'HeatingEnergyUsage_cumsum', one of the two cumulative columns that are
# excluded from the predictors.
# NOTE(review): confirm 'HeatingEnergyUsage_cumsum' is the intended heating target.
y_heating = data_cleaned['HeatingEnergyUsage_cumsum']
X = data_cleaned.drop(columns=['HeatingEnergyUsage_cumsum', 'outtrn_cumsum', 'date'])

# Feature Engineering - Interaction Terms (As an example)
X['interaction_term_1'] = X['inTp'] * X['inHd']  # product of the 'inTp' and 'inHd' columns
# You can add more interaction terms or other features based on domain knowledge.

# One-hot encode the categorical / calendar columns
X_encoded = pd.get_dummies(X, columns=['frmDist', 'day_of_week', 'month', 'season'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible, and X and y are split together so their rows stay aligned.
X_train_encoded, X_val_encoded, y_heating_train, y_heating_val = train_test_split(
    X_encoded,
    y_heating,
    test_size=0.2,
    random_state=42,
)

# Features to transform
features_to_transform = ['WaterUsage', 'FertilizerUsage', 'CO2Usage']

# Work on explicit copies so the column assignments below never write into a
# slice of X_encoded (avoids pandas' SettingWithCopyWarning).
X_train_encoded = X_train_encoded.copy()
X_val_encoded = X_val_encoded.copy()

# Apply the logarithmic transformation to BOTH splits. The scaler below is
# fitted on the transformed training data, so the validation data must go
# through the same transformation before it is scaled.
for feature in features_to_transform:
    X_train_encoded[feature] = np.log1p(X_train_encoded[feature])
    X_val_encoded[feature] = np.log1p(X_val_encoded[feature])

# Handling Outliers using IQR for the numeric features.
# The fences are computed from the training split only and then applied to
# both splits, so no validation information leaks into the preprocessing.
for column in X_train_encoded.select_dtypes(include=[np.number]).columns:
    train_values = X_train_encoded[column]

    # Skip 0/1 indicator columns (one-hot output is numeric when get_dummies
    # emits integers): a rare category has IQR == 0, so capping would
    # overwrite every 1 with 0 and erase the feature.
    if train_values.dropna().isin([0, 1]).all():
        continue

    Q1 = train_values.quantile(0.25)
    Q3 = train_values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Capping outliers (values inside the fences are left unchanged)
    X_train_encoded[column] = train_values.clip(lower=lower_bound, upper=upper_bound)
    X_val_encoded[column] = X_val_encoded[column].clip(lower=lower_bound, upper=upper_bound)

# Feature Scaling: fit on the training split only, then reuse the fitted
# mean/variance for the validation split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_val_scaled = scaler.transform(X_val_encoded)

# (If using PCA)
# pca_95 = PCA(n_components=0.95)
# X_train_pca = pca_95.fit_transform(X_train_scaled)
# X_val_pca = pca_95.transform(X_val_scaled)
