In [1]:
import os
import pandas as pd
import geopandas as gpd


In [2]:
main_dir = os.getcwd().rsplit("\\", 2)[0]

In [41]:
# first read all the cleaned data files
weather_df = pd.read_csv(f"{main_dir}/data/processed/cleaned_weather_data.csv")
energy_df = pd.read_csv(f"{main_dir}/data/processed/final_cleaned_energy_hr.csv")
annual_energy_df = pd.read_csv(f"{main_dir}/data/processed/final_cleaned_energy_year.csv")
usgs_df = pd.read_csv(f"{main_dir}/data/processed/final_cleaned_usgs.csv")
osm_raw_df =   pd.read_csv(f"{main_dir}/data/processed/osm_lat_long_all_processed.csv")
osm_processed_df = pd.read_csv(f"{main_dir}/data/processed/osm_site_processed.csv")
sentiment_df = pd.read_csv(f"{main_dir}/data/processed/combined_sentiment_data.csv")
annual_energy_df.rename(columns={
    'generation': 'capacity_MW'
}, inplace=True)
print(weather_df.columns)
print(energy_df.columns)
print(annual_energy_df.columns)
print(usgs_df.columns)
print(osm_raw_df.columns)
print(osm_processed_df.columns)
print(sentiment_df.columns)

Index(['date', 'station', 'wind_speed', 'precipitation', 'temperature_avg',
       'wind_volatility'],
      dtype='object')
Index(['date', 'energy_source', 'power_MW', 'maintenance_status',
       'respondent'],
      dtype='object')
Index(['year', 'plantcode', 'plantname', 'energy_source', 'capacity_MW'], dtype='object')
Index(['latitude', 'longitude', 'capacity_MW', 'plantcode'], dtype='object')
Index(['latitude', 'longitude', 'energy_source', 'capacity_MW'], dtype='object')
Index(['latitude', 'longitude', 'site_density'], dtype='object')
Index(['date', 'sentiment_score'], dtype='object')


In [4]:
osm_processed_df.head()

Unnamed: 0,latitude,longitude,site_density
0,32.338763,-99.984451,1477
1,32.332758,-99.986133,1481
2,32.332738,-99.988417,1486
3,32.332788,-99.990568,1487
4,32.333316,-99.992595,1491


In [5]:
# Merge USGS with annual energy data to get energy source 
usgs_df = pd.merge(usgs_df, annual_energy_df[['plantcode', 'energy_source','plantname']],
                   on='plantcode', how='left')

In [6]:
usgs_df['energy_source'] = usgs_df['energy_source'].fillna('wind')

In [7]:
# drop rows with missing coordinates
usgs_df = usgs_df.dropna(subset=['latitude', 'longitude'])

In [8]:
usgs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393409 entries, 0 to 393408
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   latitude       393409 non-null  float64
 1   longitude      393409 non-null  float64
 2   capacity_MW    393409 non-null  float64
 3   plantcode      393409 non-null  int64  
 4   energy_source  393409 non-null  object 
 5   plantname      393069 non-null  object 
dtypes: float64(3), int64(1), object(2)
memory usage: 18.0+ MB


In [9]:

## Create coordinate an capacity mappings for wind and solar
wind_coords = usgs_df[usgs_df['energy_source'] == 'wind'][['latitude', 'longitude', 'capacity_MW']].mean().to_dict()


In [10]:
solar_coords = osm_raw_df[osm_raw_df['energy_source'] == 'solar'][['latitude', 'longitude','capacity_MW']].mean().to_dict()

In [11]:
solar_capacity = annual_energy_df[annual_energy_df['energy_source'] == 'solar']['capacity_MW'].mean()

In [12]:
# Assign coordinates and capacity to hourly energy data
energy_df['latitude'] = energy_df['energy_source'].map({
    'wind': wind_coords.get('latitude'),
    'solar': solar_coords.get('latitude',31.9686)
})
energy_df['longitude'] = energy_df['energy_source'].map({
    'wind': wind_coords.get('longitude'),
    'solar': solar_coords.get('longitude',-99.9018)
})
energy_df['capacity_MW'] = energy_df['energy_source'].map({
    'wind': wind_coords.get('capacity_MW'),
    'solar': solar_capacity if not pd.isna(solar_capacity) else osm_raw_df['capacity_MW'].mean()
})

In [13]:
energy_df.head()

Unnamed: 0,date,energy_source,power_MW,maintenance_status,respondent,latitude,longitude,capacity_MW
0,2025-07-08 04:00:00,solar,0.0,1,TEX,31.140744,-98.22948,162956.921267
1,2025-07-08 04:00:00,wind,11946.0,0,TEX,31.987395,-100.410991,192.115738
2,2025-07-08 03:00:00,solar,0.0,1,TEX,31.140744,-98.22948,162956.921267
3,2025-07-08 03:00:00,wind,10473.0,0,TEX,31.987395,-100.410991,192.115738
4,2025-07-08 02:00:00,solar,941.0,0,TEX,31.140744,-98.22948,162956.921267


In [14]:
annual_energy_df.head()

Unnamed: 0,year,plantcode,plantname,energy_source,capacity_MW
0,2024,62562,"High Lonesome Wind Power, LLC Hybrid",wind,1184601.84
1,2024,56291,Horse Hollow Wind Energy Center,wind,2617502.0
2,2024,56291,Horse Hollow Wind Energy Center,wind,2617502.0
3,2024,56395,Mesquite Wind Power LLC,wind,388599.0
4,2024,56395,Mesquite Wind Power LLC,wind,388599.0


In [15]:
# merge with annal energy data for plant names
# 2025-07-08 04:00:00	- get year from date
energy_df['date'] = pd.to_datetime(energy_df['date'])
energy_df['year'] = energy_df['date'].dt.year
energy_df = pd.merge(energy_df, annual_energy_df[['year','plantcode', 'plantname']],
                     on='year', how='left')

In [16]:
energy_df.head()

Unnamed: 0,date,energy_source,power_MW,maintenance_status,respondent,latitude,longitude,capacity_MW,year,plantcode,plantname
0,2025-07-08 04:00:00,solar,0.0,1,TEX,31.140744,-98.22948,162956.921267,2025,,
1,2025-07-08 04:00:00,wind,11946.0,0,TEX,31.987395,-100.410991,192.115738,2025,,
2,2025-07-08 03:00:00,solar,0.0,1,TEX,31.140744,-98.22948,162956.921267,2025,,
3,2025-07-08 03:00:00,wind,10473.0,0,TEX,31.987395,-100.410991,192.115738,2025,,
4,2025-07-08 02:00:00,solar,941.0,0,TEX,31.140744,-98.22948,162956.921267,2025,,


In [17]:
# count nan values in each column
energy_df['plantname'].isna().sum()

np.int64(9034)

## Preprocessing for  Cleaned weather data

In [18]:

weather_df.head()

Unnamed: 0,date,station,wind_speed,precipitation,temperature_avg,wind_volatility
0,2015-01-01,GHCND:USC00412114,0.4,9.9,4.15,
1,2015-01-01,GHCND:USW00053903,2.0,31.8,4.75,
2,2015-01-01,GHCND:USW00053902,2.4,11.9,5.55,
3,2015-01-01,GHCND:USW00023091,2.1,0.8,-4.9,
4,2015-01-01,GHCND:USW00023047,2.6,0.0,-5.45,


In [19]:
### preprocessing for weather data
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459640 entries, 0 to 459639
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   date             459640 non-null  object 
 1   station          459640 non-null  object 
 2   wind_speed       459640 non-null  float64
 3   precipitation    459640 non-null  float64
 4   temperature_avg  459640 non-null  float64
 5   wind_volatility  459473 non-null  float64
dtypes: float64(4), object(2)
memory usage: 21.0+ MB


In [20]:
weather_df['date'] = pd.to_datetime(weather_df['date'])

In [21]:
# add header for this 
#ID            1-11   Character
# LATITUDE     13-20   Real
# LONGITUDE    22-30   Real
# ELEVATION    32-37   Real
# STATE        39-40   Character
# NAME         42-71   Character
# GSN FLAG     73-75   Character
# HCN/CRN FLAG 77-79   Character
# WMO ID 

# ACW00011604	17.1167	-61.7833	10.1	ST JOHNS COOLIDGE FLD	Unnamed: 5	Unnamed: 6	Unnamed: 7


station_cords = pd.read_fwf(
    "https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt")
station_cords.columns = [
    'station_id', 'latitude', 'longitude', 'elevation', 'name', 
    'state', 'gsn_flag', 'wmo_id'
]
station_cords.head()

Unnamed: 0,station_id,latitude,longitude,elevation,name,state,gsn_flag,wmo_id
0,ACW00011647,17.1333,-61.7833,19.2,ST JOHNS,,,
1,AE000041196,25.333,55.517,34.0,SHARJAH INTER. AIRP,,GSN,41196.0
2,AEM00041194,25.255,55.364,10.4,DUBAI INTL,,,41194.0
3,AEM00041217,24.433,54.651,26.8,ABU DHABI INTL,,,41217.0
4,AEM00041218,24.262,55.609,264.9,AL AIN INTL,,,41218.0


In [22]:
station_cords = station_cords[station_cords['station_id'].str.startswith(('USC', 'USW'))]

In [23]:
station_cords.head()

Unnamed: 0,station_id,latitude,longitude,elevation,name,state,gsn_flag,wmo_id
101746,USC00010008,31.5703,-85.2483,139.0,ABBEVILLE,,,
101747,USC00010063,34.2108,-87.1783,239.6,ADDISON,,,
101748,USC00010071,34.4167,-87.3167,302.1,ADDISON CNTRL TWR,,,
101749,USC00010117,34.6833,-86.8833,182.9,ALABAMA STATE FARM,,,
101750,USC00010125,31.1333,-85.0667,34.1,ALAGA,,,


In [24]:
weather_df_copy = weather_df.copy()

In [25]:
weather_df_copy.head()

Unnamed: 0,date,station,wind_speed,precipitation,temperature_avg,wind_volatility
0,2015-01-01,GHCND:USC00412114,0.4,9.9,4.15,
1,2015-01-01,GHCND:USW00053903,2.0,31.8,4.75,
2,2015-01-01,GHCND:USW00053902,2.4,11.9,5.55,
3,2015-01-01,GHCND:USW00023091,2.1,0.8,-4.9,
4,2015-01-01,GHCND:USW00023047,2.6,0.0,-5.45,


In [26]:
#GHCND:USC00412114	
weather_df_copy['station'] = weather_df_copy['station'].str.replace('GHCND:', '', regex=False)
weather_df_copy.head()

Unnamed: 0,date,station,wind_speed,precipitation,temperature_avg,wind_volatility
0,2015-01-01,USC00412114,0.4,9.9,4.15,
1,2015-01-01,USW00053903,2.0,31.8,4.75,
2,2015-01-01,USW00053902,2.4,11.9,5.55,
3,2015-01-01,USW00023091,2.1,0.8,-4.9,
4,2015-01-01,USW00023047,2.6,0.0,-5.45,


In [27]:
weather_df_copy = pd.merge(weather_df_copy, station_cords[['station_id', 'latitude', 'longitude']],
                          left_on='station', right_on='station_id', how='left')

In [28]:
weather_df_copy.drop(columns=['station_id'], inplace=True)

In [29]:
weather_df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459640 entries, 0 to 459639
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   date             459640 non-null  datetime64[ns]
 1   station          459640 non-null  object        
 2   wind_speed       459640 non-null  float64       
 3   precipitation    459640 non-null  float64       
 4   temperature_avg  459640 non-null  float64       
 5   wind_volatility  459473 non-null  float64       
 6   latitude         459640 non-null  float64       
 7   longitude        459640 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 28.1+ MB


### Weather dataframe cleaned up

In [30]:
# merge with weather data
gdf_weather = gpd.GeoDataFrame(weather_df_copy,
                               geometry= gpd.points_from_xy(weather_df_copy['longitude'], weather_df_copy['latitude']))

In [31]:
gdf_weather.head()

Unnamed: 0,date,station,wind_speed,precipitation,temperature_avg,wind_volatility,latitude,longitude,geometry
0,2015-01-01,USC00412114,0.4,9.9,4.15,,31.3072,-95.4508,POINT (-95.4508 31.3072)
1,2015-01-01,USW00053903,2.0,31.8,4.75,,30.7444,-95.5867,POINT (-95.5867 30.7444)
2,2015-01-01,USW00053902,2.4,11.9,5.55,,30.3611,-95.4175,POINT (-95.4175 30.3611)
3,2015-01-01,USW00023091,2.1,0.8,-4.9,,30.9119,102.9172,POINT (102.9172 30.9119)
4,2015-01-01,USW00023047,2.6,0.0,-5.45,,35.2333,101.7092,POINT (101.7092 35.2333)


In [32]:
gdf_energy = gpd.GeoDataFrame(energy_df,
                               geometry=gpd.points_from_xy(energy_df['longitude'], energy_df['latitude']))

In [33]:
gdf_energy.head()

Unnamed: 0,date,energy_source,power_MW,maintenance_status,respondent,latitude,longitude,capacity_MW,year,plantcode,plantname,geometry
0,2025-07-08 04:00:00,solar,0.0,1,TEX,31.140744,-98.22948,162956.921267,2025,,,POINT (-98.22948 31.14074)
1,2025-07-08 04:00:00,wind,11946.0,0,TEX,31.987395,-100.410991,192.115738,2025,,,POINT (-100.41099 31.9874)
2,2025-07-08 03:00:00,solar,0.0,1,TEX,31.140744,-98.22948,162956.921267,2025,,,POINT (-98.22948 31.14074)
3,2025-07-08 03:00:00,wind,10473.0,0,TEX,31.987395,-100.410991,192.115738,2025,,,POINT (-100.41099 31.9874)
4,2025-07-08 02:00:00,solar,941.0,0,TEX,31.140744,-98.22948,162956.921267,2025,,,POINT (-98.22948 31.14074)


In [34]:
print(energy_df.value_counts())
print(gdf_weather.value_counts())

date                 energy_source  power_MW  maintenance_status  respondent  latitude   longitude    capacity_MW    year  plantcode  plantname                       
2020-05-15 09:00:00  wind           15871.0   0                   TEX         31.987395  -100.410991  192.115738     2020  56673.0    Whirlwind Energy Center             4
                                                                                                                           56754.0    Goat Wind LP                        4
                                                                                                                           56763.0    Capricorn Ridge Wind LLC            4
                                                                                                                           56771.0    Silver Star I Wind Power Project    4
                                                                                                                           56773.0    McAdoo Wind

In [35]:
# Extract unique energy locations
unique_energy = energy_df[['latitude', 'longitude']].drop_duplicates()
gdf_unique_energy = gpd.GeoDataFrame(
    unique_energy, geometry=gpd.points_from_xy(unique_energy['longitude'], unique_energy['latitude']), crs="EPSG:4326"
)

In [36]:
# sjoin_nearest on unique locations
unique_merged = gdf_unique_energy.sjoin_nearest(
    gdf_weather[['geometry']], how='left', max_distance=1.0  # Limit to ~111 km
)

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None

  return geopandas.sjoin_nearest(



In [37]:
# Add nearest station to unique_merged
unique_merged['nearest_station'] = gdf_weather.loc[unique_merged['index_right'].values, 'station'].values
unique_merged = unique_merged[['geometry', 'nearest_station']]

In [39]:
gdf_energy.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 63204202 entries, 0 to 63204201
Data columns (total 12 columns):
 #   Column              Dtype         
---  ------              -----         
 0   date                datetime64[ns]
 1   energy_source       object        
 2   power_MW            float64       
 3   maintenance_status  int64         
 4   respondent          object        
 5   latitude            float64       
 6   longitude           float64       
 7   capacity_MW         float64       
 8   year                int32         
 9   plantcode           float64       
 10  plantname           object        
 11  geometry            geometry      
dtypes: datetime64[ns](1), float64(5), geometry(1), int32(1), int64(1), object(3)
memory usage: 5.4+ GB


In [None]:
# Merge back to gdf_energy
gdf_energy = gdf_energy.merge(unique_merged, on='geometry', how='left')

In [None]:
# Now merge with gdf_weather on date and nearest_station
gdf_weather = gdf_weather.rename(columns={'station': 'nearest_station'})
merged_df = gdf_energy.merge(
    gdf_weather[['date', 'nearest_station', 'wind_speed', 'precipitation', 'temperature_avg', 'wind_volatility']],
    on=['date', 'nearest_station'],
    how='left'
)

In [35]:
# merged_df = gpd.sjoin_nearest(gdf_energy,gdf_weather[['geometry', 'date', 'wind_speed', 'precipitation',
#                                                      'temperature_avg','wind_volatility']
#                                                      ], how='left')

# # # Ensure both GeoDataFrames use the same CRS
# # gdf_energy = gdf_energy.set_crs(epsg=4326, allow_override=True)
# # gdf_weather = gdf_weather.set_crs(epsg=4326, allow_override=True)

# # # Perform the spatial join with only necessary columns
# # merged_df = gpd.sjoin_nearest(
# #     gdf_energy,
# #     gdf_weather[['geometry', 'date', 'wind_speed', 'precipitation', 'temperature_avg', 'wind_volatility']],
# #     how='left'
# # )


In [43]:
# merged_df.head()

### Merged weather data done

### Now merging data with sentiment data

In [None]:
merged_df.info()

In [None]:
sentiment_df.info()

In [None]:
merged_df['date'] = pd.to_datetime(merged_df['date']).dt.date
