In [1]:
import os
import pandas as pd
import geopandas as gpd
from scipy.spatial import KDTree
from sklearn.preprocessing import StandardScaler
import joblib

# Project root: two directory levels above the notebook's working directory.
# os.path.dirname splits on the running platform's separator, so this resolves
# to the same folder on Windows and on POSIX systems (splitting on a literal
# backslash only works on Windows).
main_dir = os.path.dirname(os.path.dirname(os.getcwd()))

In [2]:
# Locations of the three processed CSV inputs under the project root.
osm_csv = f"{main_dir}/data/processed/osm_site_processed.csv"
sentiment_csv = f"{main_dir}/data/processed/combined_sentiment_data.csv"
merged_csv = f"{main_dir}/data/processed/merged_data.csv"

# Load each input into its own DataFrame.
osm_processed_df = pd.read_csv(osm_csv)
sentiment_df = pd.read_csv(sentiment_csv)
merged_df_read = pd.read_csv(merged_csv)

In [3]:
# Parse the 'date' column into pandas datetimes, then print the frame summary.
merged_df_read['date'] = merged_df_read['date'].pipe(pd.to_datetime)
merged_df_read.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63204202 entries, 0 to 63204201
Data columns (total 16 columns):
 #   Column              Dtype         
---  ------              -----         
 0   date                datetime64[ns]
 1   energy_source       object        
 2   power_MW            float64       
 3   maintenance_status  int64         
 4   respondent          object        
 5   latitude            float64       
 6   longitude           float64       
 7   capacity_MW         float64       
 8   year                int64         
 9   plantcode           float64       
 10  plantname           object        
 11  nearest_station     object        
 12  wind_speed          float64       
 13  precipitation       float64       
 14  temperature_avg     float64       
 15  wind_volatility     float64       
dtypes: datetime64[ns](1), float64(9), int64(2), object(4)
memory usage: 7.5+ GB


In [4]:
# Parse the sentiment 'date' column into pandas datetimes, then print the summary.
sentiment_dates = pd.to_datetime(sentiment_df['date'])
sentiment_df['date'] = sentiment_dates
sentiment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257 entries, 0 to 256
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   date             257 non-null    datetime64[ns]
 1   sentiment_score  257 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 4.1 KB


In [5]:
# Merge with sentiment data
#
# The readings carry intra-day timestamps, so an exact-equality join on 'date'
# only matches rows whose timestamp coincides with a sentiment timestamp and
# zero-fills everything else. Join on the calendar day instead.
# NOTE(review): assumes sentiment_df is meant to hold one score per day; if a
# day has several rows they are averaged here - confirm against the data source.
sentiment_by_day = (
    sentiment_df
    .assign(day=pd.to_datetime(sentiment_df['date']).dt.normalize())
    .groupby('day')['sentiment_score']
    .mean()
)

# Day (midnight-normalised timestamp) of every reading, used as the lookup key.
reading_day = pd.to_datetime(merged_df_read['date']).dt.normalize()

# Copy so merged_df_read stays untouched, as it did with pd.merge.
merged_df = merged_df_read.copy()
# Days without a sentiment score are filled with 0, as before.
merged_df['sentiment_score'] = reading_day.map(sentiment_by_day).fillna(0)

In [6]:
# Merge with OSM processed data for site_density
#
# Keep a single row per coordinate pair: a left merge against a lookup with
# repeated (latitude, longitude) keys would silently duplicate readings.
site_lookup = (
    osm_processed_df[['latitude', 'longitude', 'site_density']]
    .drop_duplicates(subset=['latitude', 'longitude'])
)
# validate='many_to_one' makes pandas raise if the lookup keys are not unique.
merged_df = pd.merge(merged_df, site_lookup,
                     on=['latitude', 'longitude'], how='left',
                     validate='many_to_one')
# Readings whose coordinates have no OSM match get a density of 0.
# NOTE(review): the join is an exact float comparison on coordinates - confirm
# both sources store them at identical precision.
merged_df['site_density'] = merged_df['site_density'].fillna(0)

In [7]:
# Feature engineering
# Ratio of produced power to capacity; a capacity of exactly 0 is replaced by 1
# before dividing so the ratio never divides by zero.
merged_df['output_efficiency'] = merged_df['power_MW'].div(
    merged_df['capacity_MW'].replace(0, 1)
)

In [8]:
# Outlier removal
# Keep only rows whose power_MW lies within 1.5 * IQR of the quartiles.
Q1, Q3 = merged_df['power_MW'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR
# between() is inclusive on both ends; rows with missing power_MW are dropped.
within_fences = merged_df['power_MW'].between(lower_fence, upper_fence)
merged_df = merged_df[within_fences]

In [9]:
# Scale numerical features
numerical_cols = [
    'wind_speed',
    'precipitation',
    'temperature_avg',
    'wind_volatility',
    'power_MW',
    'site_density',
    'output_efficiency',
]
scaler = StandardScaler()
# Missing values are set to 0 before the scaler is fitted; the scaled values
# then overwrite the original columns in place.
merged_df[numerical_cols] = (
    merged_df[numerical_cols]
    .fillna(0)
    .pipe(scaler.fit_transform)
)

In [10]:
# Persist the fitted scaler. Create the models directory first so the dump
# does not fail when the folder does not exist yet.
models_dir = f"{main_dir}/models"
os.makedirs(models_dir, exist_ok=True)
joblib.dump(scaler, f"{models_dir}/scaler.pkl")

['c:\\Users\\stuar\\Desktop\\Renewable Energy Maintenance/models/scaler.pkl']

In [14]:
# Preview the first five rows of the scaled frame.
merged_df.head(n=5)

Unnamed: 0,date,energy_source,power_MW,maintenance_status,respondent,latitude,longitude,capacity_MW,year,plantcode,plantname,nearest_station,wind_speed,precipitation,temperature_avg,wind_volatility,sentiment_score,site_density,output_efficiency
0,2025-07-08 04:00:00,solar,-1.014257,1,TEX,31.140744,-98.22948,162956.921267,2025,,,USW00000229,-0.16362,-0.03929,-0.16376,-0.167691,0.0,0.0,-0.793451
1,2025-07-08 04:00:00,wind,0.749863,0,TEX,31.987395,-100.410991,192.115738,2025,,,USW00013962,-0.16362,-0.03929,-0.16376,-0.167691,0.0,0.0,0.893439
2,2025-07-08 03:00:00,solar,-1.014257,1,TEX,31.140744,-98.22948,162956.921267,2025,,,USW00000229,-0.16362,-0.03929,-0.16376,-0.167691,0.0,0.0,-0.793451
3,2025-07-08 03:00:00,wind,0.532338,0,TEX,31.987395,-100.410991,192.115738,2025,,,USW00013962,-0.16362,-0.03929,-0.16376,-0.167691,0.0,0.0,0.685437
4,2025-07-08 02:00:00,solar,-0.875295,0,TEX,31.140744,-98.22948,162956.921267,2025,,,USW00000229,-0.16362,-0.03929,-0.16376,-0.167691,0.0,0.0,-0.793294


In [15]:
# Take an independent copy of the processed frame and write it to CSV
# without the index column.
cleaned_csv_path = f"{main_dir}/data/processed/cleaned_data_final.csv"
cleaned_df = merged_df.copy()
cleaned_df.to_csv(cleaned_csv_path, index=False)