In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from geopy.distance import geodesic
import geopandas as gpd
from textblob import TextBlob
import os

In [19]:
main_dir = os.getcwd().rsplit("\\", 2)[0]

# Cleaning Weather Data - NCEI Texas - Hourly

In [42]:

weather_df = pd.read_csv(f"{main_dir}/data/raw/texas_weather_data.csv")
weather_df.head()

Unnamed: 0,date,station,AWND,PRCP,TMAX,TMIN
0,2015-01-01T00:00:00,GHCND:USC00412114,0.4,9.9,6.1,2.2
1,2015-01-01T00:00:00,GHCND:USC00413340,8.9,1.5,7.2,5.0
2,2015-01-01T00:00:00,GHCND:USC00414792,3.3,3.3,2.8,1.1
3,2015-01-01T00:00:00,GHCND:USC00418040,3.6,0.0,-5.0,-14.4
4,2015-01-01T00:00:00,GHCND:USC00418862,3.6,9.7,4.4,1.1


In [37]:
def clean_weather_data(df):
    # Convert 'date' column to datetime
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Sort by date for time interpolation
    df = df.sort_values('date')

    # Convert required columns to float (if they aren't already)
    df['wind_speed'] = df['AWND'].astype(float)
    df['precipitation'] = df['PRCP'].astype(float)
    df['temperature_avg'] = (df['TMAX'].astype(float) + df['TMIN'].astype(float)) / 2

    # Set date as index for time-based interpolation
    df = df.set_index('date')

    # Interpolate missing values by time
    df[['wind_speed', 'precipitation', 'temperature_avg']] = df[['wind_speed', 'precipitation', 'temperature_avg']].interpolate(method='time')

    # Calculate wind_speed volatility as rolling std over 7 days (assuming data is hourly)
    # Adjust window depending on your data frequency!
    # For daily data use window=7, for hourly 7*24, etc.
    window_size = 7 * 24  # Update this if your data is daily: use window_size=7
    df['wind_volatility'] = df['wind_speed'].rolling(window=window_size).std()

    # Reset index to bring 'date' back as a column
    df = df.reset_index()

    # Select relevant columns
    output_cols = ['date', 'station', 'wind_speed', 'precipitation', 'temperature_avg', 'wind_volatility']

    # Drop rows missing more than 30% of expected data (i.e. keep rows with >=70% non-null)
    # thresh counts non-NA values needed, so calculate accordingly
    thresh_count = int(len(output_cols) * 0.7)

    return df[output_cols].dropna(thresh=thresh_count)

cleaned_weather_df = clean_weather_data(weather_df)
cleaned_weather_df.head()

Unnamed: 0,date,station,wind_speed,precipitation,temperature_avg,wind_volatility
0,2015-01-01,GHCND:USC00412114,0.4,9.9,4.15,
1,2015-01-01,GHCND:USW00053903,2.0,31.8,4.75,
2,2015-01-01,GHCND:USW00053902,2.4,11.9,5.55,
3,2015-01-01,GHCND:USW00023091,2.1,0.8,-4.9,
4,2015-01-01,GHCND:USW00023047,2.6,0.0,-5.45,


In [43]:
cleaned_weather_df.to_csv(f"{main_dir}/data/processed/cleaned_weather_data.csv", index=False)

## Clean Energy Data - EIA Texas - Hourly

In [45]:
clean_energy_hr_df = pd.read_csv(f"{main_dir}/data/raw/eia_hourly_texas_raw.csv")
clean_energy_hr_df.head()

Unnamed: 0,period,respondent,respondent-name,fueltype,type-name,value,value-units
0,2025-07-08T04,TEX,Texas,SUN,Solar,0,megawatthours
1,2025-07-08T04,TEX,Texas,WND,Wind,11946,megawatthours
2,2025-07-08T03,TEX,Texas,SUN,Solar,0,megawatthours
3,2025-07-08T03,TEX,Texas,WND,Wind,10473,megawatthours
4,2025-07-08T02,TEX,Texas,SUN,Solar,941,megawatthours


In [83]:
clean_energy_hr_df_copy = clean_energy_hr_df.copy()

In [84]:
clean_energy_hr_df_copy['operationalstatus'] = clean_energy_hr_df_copy['value'].apply(lambda x: 'offline' if x == 0 else 'online')
clean_energy_hr_df_copy.rename(columns={'value': 'power'}, inplace=True)

In [85]:
clean_energy_hr_df_copy.head()

Unnamed: 0,period,respondent,respondent-name,fueltype,type-name,power,value-units,operationalstatus
0,2025-07-08T04,TEX,Texas,SUN,Solar,0,megawatthours,offline
1,2025-07-08T04,TEX,Texas,WND,Wind,11946,megawatthours,online
2,2025-07-08T03,TEX,Texas,SUN,Solar,0,megawatthours,offline
3,2025-07-08T03,TEX,Texas,WND,Wind,10473,megawatthours,online
4,2025-07-08T02,TEX,Texas,SUN,Solar,941,megawatthours,online


In [89]:

def clean_energy_data(df):
    df['date'] = pd.to_datetime(df['period'])
    df['power_MW'] = df['power'].astype(float)
    df['maintenance_status'] = df['operationalstatus'].map({'online': 0, 'offline': 1}).fillna(0)
    df['energy_source'] = df['fueltype'].map({'WND': 'wind', 'SUN': 'solar'})
    df = df[df['power_MW'].notnull()]
    return df[['date', 'energy_source', 'power_MW', 'maintenance_status', 'respondent']]
final_cleaned_energy_hr_df = clean_energy_data(clean_energy_hr_df_copy)
final_cleaned_energy_hr_df.head()

Unnamed: 0,date,energy_source,power_MW,maintenance_status,respondent
0,2025-07-08 04:00:00,solar,0.0,1,TEX
1,2025-07-08 04:00:00,wind,11946.0,0,TEX
2,2025-07-08 03:00:00,solar,0.0,1,TEX
3,2025-07-08 03:00:00,wind,10473.0,0,TEX
4,2025-07-08 02:00:00,solar,941.0,0,TEX


In [90]:
final_cleaned_energy_hr_df.to_csv(f"{main_dir}/data/processed/final_cleaned_energy_hr.csv", index=False)

## Clean Energy Data - EIA Texas - Annual

In [113]:
cleaned_energy_annual_df = pd.read_csv(f"{main_dir}/data/raw/eia_annual_texas_filtered.csv")
cleaned_energy_annual_df.head(1)

Unnamed: 0,year,plantCode,plantName,fuelType,state,grossGeneration,grossGenerationUnit
0,2024,62562,"High Lonesome Wind Power, LLC Hybrid",WND,TX,1184601.84,megawatthours


In [114]:
cleaned_energy_annual_df.rename(columns={'grossGeneration': 'generation', 'plantCode':'plantcode',
                                         'plantName':'plantname'}, inplace=True)

In [115]:
def clean_annual_energy_data(df):
    df = df[df['state'] == 'TX']
    df['generation'] = df['generation'].astype(float)
    df['energy_source'] = df['fuelType'].map({'WND': 'wind', 'SUN': 'solar'})
    return df[['year', 'plantcode', 'plantname', 'energy_source', 'generation']]
cleaned_energy_annual_df = clean_annual_energy_data(cleaned_energy_annual_df)
cleaned_energy_annual_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6554 entries, 0 to 6553
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   year           6554 non-null   int64  
 1   plantcode      6554 non-null   int64  
 2   plantname      6554 non-null   object 
 3   energy_source  6554 non-null   object 
 4   generation     6554 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 256.1+ KB


In [97]:
cleaned_energy_annual_df.to_csv(f"{main_dir}/data/processed/final_cleaned_energy_year.csv", index=False)

## Clean USGS Turbine Data

In [100]:
df_usgs = pd.read_csv(f"{main_dir}/data/raw/usgs_turbines_raw.csv")
df_usgs.head()

Unnamed: 0,case_id,state,plant_name,longitude,latitude,capacity_MW,year_operational,manufacturer,model,plantcode
0,3011397,TX,Big Spring I,-101.4363,32.19,34.32,1999.0,Vestas,V47-0.66,54979.0
1,3007980,TX,Anacacho,-100.18203,29.18049,99.0,2012.0,Vestas,V100-1.8,58000.0
2,3107784,TX,Amadeus,-100.5874,32.87516,250.12,2020.0,GE Wind,GE1.79-100,62142.0
3,3107783,TX,Amadeus,-100.60146,32.87976,250.12,2020.0,GE Wind,GE1.79-100,62142.0
4,3106735,TX,Amadeus,-100.60215,32.8946,250.12,2020.0,GE Wind,GE1.79-100,62142.0


In [117]:
df_usgs['plantcode'].unique() #58000.0
# drop rows with NaN plantcode before converting to int
df_usgs = df_usgs[df_usgs['plantcode'].notna()]
df_usgs['plantcode'] = df_usgs['plantcode'].astype(int)

In [118]:
def clean_usgs_turbine_data(df):
    df['latitude'] = df['latitude'].astype(float)
    df['longitude'] = df['longitude'].astype(float)
    df['capacity_MW'] = df['capacity_MW'].astype(float)
    df['plantcode'] = df['plantcode'].astype(int)  # Ensure plantcode is string for consistency
    return df[['latitude', 'longitude', 'capacity_MW', 'plantcode']].dropna()

cleaned_usgs_df = clean_usgs_turbine_data(df_usgs)
cleaned_usgs_df.head()

Unnamed: 0,latitude,longitude,capacity_MW,plantcode
0,32.19,-101.4363,34.32,54979
1,29.18049,-100.18203,99.0,58000
2,32.87516,-100.5874,250.12,62142
3,32.87976,-100.60146,250.12,62142
4,32.8946,-100.60215,250.12,62142


In [119]:
cleaned_usgs_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18901 entries, 0 to 19026
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   latitude     18901 non-null  float64
 1   longitude    18901 non-null  float64
 2   capacity_MW  18901 non-null  float64
 3   plantcode    18901 non-null  int64  
dtypes: float64(3), int64(1)
memory usage: 738.3 KB


In [120]:
cleaned_usgs_df.to_csv(f"{main_dir}/data/processed/final_cleaned_usgs.csv", index=False)

## Clean Reddit and News Sentiment Data


In [121]:
reddit_sentiment_df = pd.read_csv(f"{main_dir}/data/raw/reddit_sentiment_raw.csv")
news_sentiment_df = pd.read_csv(f"{main_dir}/data/raw/news_sentiment_raw.csv")

In [123]:
reddit_sentiment_df.head(1)

Unnamed: 0,subreddit,date,text,sentiment_score,title,url
0,renewableenergy,2025-01-13 15:02:26,"Texas leads U.S. in wind, solar, No. 2 in batt...",0.2732,"Texas leads U.S. in wind, solar, No. 2 in batt...",https://www.chron.com/news/houston-texas/artic...


In [124]:
news_sentiment_df.head(1)

Unnamed: 0,date,text,sentiment_score
0,2025-07-22 20:32:04.283424,\n\n\nData centers are building their own gas ...,0.5719


In [132]:
def compute_sentiment(reddit_sentiment_df, news_sentiment_df):
    reddit_sentiment_df['date'] = pd.to_datetime(reddit_sentiment_df['date']).dt.date
    news_sentiment_df['date'] = pd.to_datetime(news_sentiment_df['date']).dt.date
    combined_df = pd.concat([reddit_sentiment_df[['date','sentiment_score']],
                             news_sentiment_df[['date','sentiment_score']]], ignore_index=True)
    return combined_df.groupby('date').mean().reset_index()

combined_sentiment_df = compute_sentiment(reddit_sentiment_df, news_sentiment_df)
combined_sentiment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257 entries, 0 to 256
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             257 non-null    object 
 1   sentiment_score  257 non-null    float64
dtypes: float64(1), object(1)
memory usage: 4.1+ KB


In [133]:
combined_sentiment_df.to_csv(f"{main_dir}/data/processed/combined_sentiment_data.csv", index=False)

## Get Lat Long from OSM - Compute Site Density

In [242]:
geo_df = pd.read_csv(f"{main_dir}/data/raw/osm_lat_long_raw.csv")
geo_df.head(1)

Unnamed: 0,power,generator:source,generator:method,generator:output:electricity,name,latitude,longitude
0,generator,wind,wind_turbine,1.5 MW,,32.338763,-99.984451


In [243]:
geo_df['generator:source'].unique()  # Check unique energy sources

array(['wind', 'solar'], dtype=object)

In [245]:
# def compute_site_density(geo_df):
#     geo_df['site_density'] = 0
#     for i, row in geo_df.iterrows():
#         point = (row['latitude'], row['longitude'])
#         geo_df.at[i, 'site_density'] = sum(
#             geodesic(point, (r['latitude'], r['longitude'])).km < 50
#             for j, r in geo_df.iterrows() if i != j
#         )
#     return geo_df[['latitude', 'longitude', 'site_density']]
# geo_final_df = compute_site_density(geo_df)
# geo_final_df.head(1)

In [246]:
# Use BallTree with haversine metric for memory efficiency
from sklearn.neighbors import BallTree
import numpy as np

def compute_site_density(geo_df, radius_km=50):
    coords = geo_df[['latitude', 'longitude']].to_numpy()
    coords_rad = np.radians(coords)
    earth_radius = 6371.0
    tree = BallTree(coords_rad, metric='haversine')
    # radius in radians
    rad = radius_km / earth_radius
    # Query neighbors within radius
    counts = tree.query_radius(coords_rad, r=rad, count_only=False)
    # Subtract 1 to exclude self
    site_density = [len(c) - 1 for c in counts]
    geo_df = geo_df.copy()
    geo_df['site_density'] = site_density
    return geo_df[['latitude', 'longitude', 'site_density']]

geo_final_df = compute_site_density(geo_df)
geo_final_df.head(1)

Unnamed: 0,latitude,longitude,site_density
0,32.338763,-99.984451,1477


In [247]:
geo_final_df.head()

Unnamed: 0,latitude,longitude,site_density
0,32.338763,-99.984451,1477
1,32.332758,-99.986133,1481
2,32.332738,-99.988417,1486
3,32.332788,-99.990568,1487
4,32.333316,-99.992595,1491


In [248]:
geo_final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59892 entries, 0 to 59891
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   latitude      59892 non-null  float64
 1   longitude     59892 non-null  float64
 2   site_density  59892 non-null  int64  
dtypes: float64(2), int64(1)
memory usage: 1.4 MB


In [250]:

geo_final_df.to_csv(f"{main_dir}/data/processed/osm_site_processed.csv", index=False)

## Clean Lat Long from OSM -

In [259]:
geo_df = pd.read_csv(f"{main_dir}/data/raw/osm_lat_long_raw.csv")
geo_df.head(1)

Unnamed: 0,power,generator:source,generator:method,generator:output:electricity,name,latitude,longitude
0,generator,wind,wind_turbine,1.5 MW,,32.338763,-99.984451


In [260]:
geo_df_copy = geo_df.copy()
geo_df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59892 entries, 0 to 59891
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   power                         59647 non-null  object 
 1   generator:source              59892 non-null  object 
 2   generator:method              57792 non-null  object 
 3   generator:output:electricity  56725 non-null  object 
 4   name                          683 non-null    object 
 5   latitude                      59892 non-null  float64
 6   longitude                     59892 non-null  float64
dtypes: float64(2), object(5)
memory usage: 3.2+ MB


In [262]:
geo_df_copy['energy_source'] = geo_df_copy['generator:source'].str.lower()
# parse generator:output:electricity to capacity_MW
geo_df_copy['capacity_MW'] = geo_df_copy['generator:output:electricity'].str.extract(r'(\d+\.?\d*)').astype(float)
# Fill NaN values in capacity_MW with the mean of the column
geo_df_copy['capacity_MW'] = geo_df_copy['capacity_MW'].fillna(geo_df_copy['capacity_MW'].mean())
geo_df_copy.tail()

Unnamed: 0,power,generator:source,generator:method,generator:output:electricity,name,latitude,longitude,energy_source,capacity_MW
59887,generator,solar,photovoltaic,yes,,29.77356,-95.409866,solar,17.140309
59888,generator,solar,photovoltaic,yes,,30.456382,-97.828254,solar,17.140309
59889,generator,solar,photovoltaic,yes,,30.456346,-97.828255,solar,17.140309
59890,generator,solar,photovoltaic,yes,,30.45631,-97.827954,solar,17.140309
59891,generator,solar,photovoltaic,yes,,30.456278,-97.827964,solar,17.140309


In [263]:
geo_df_copy_filtered = geo_df_copy[['latitude', 'longitude', 'energy_source', 'capacity_MW']].dropna()

In [265]:
geo_df_copy_filtered.to_csv(f"{main_dir}/data/processed/osm_lat_long_all_processed.csv", index=False)