In [1]:
from scipy.spatial import cKDTree
import pandas as pd
import time

# Toggle for the NA-replacement cell (leave False to skip it)
run_na_rp = False

# Load the sun-info table (df_s) and the weather table (df_w) from csv
df_s, df_w = (
    pd.read_csv(csv_path)
    for csv_path in ("../Data/csvs/location_agg.csv", "../Data/csvs/weather.csv")
)

In [2]:
# Report the min/max of each coordinate column in the sun-info data set,
# then preview the first rows
for label, coord_col in (("lat. range", "lat_2dp"), ("long. range", "lon_2dp")):
    print(label, df_s[coord_col].min(), df_s[coord_col].max())
df_s.head()

lat. range 18.85 59.97
long. range -179.22 -66.86


Unnamed: 0,lat_2dp,lon_2dp,pvo,irradiance,elevation
0,18.85,-155.74,4.988375,238.99313,-2015.588257
1,18.85,-155.7,4.975792,238.057365,-628.072632
2,18.85,-155.66,4.981292,238.224194,-996.744873
3,18.85,-155.62,5.004083,238.767144,-1880.15686
4,18.89,-155.78,4.933083,237.785906,-1788.161011


In [3]:
# Report the min/max of each coordinate column in the weather data set,
# then preview the first rows
for label, coord_col in (("lat. range", "latitude"), ("long. range", "longitude")):
    print(label, df_w[coord_col].min(), df_w[coord_col].max())
df_w.head()

lat. range 25.0 50.0
long. range -126.0 -67.0


Unnamed: 0,latitude,longitude,cvh,sd,msl,tcc,t2m,u100,v100,sf,tp
0,25.0,-126.0,0.0,0.0,101863.086,0.752882,291.8836,-2.912823,-4.691815,0.0,0.000361
1,25.0,-125.75,0.0,0.0,101857.07,0.754288,291.86533,-2.82896,-4.727698,0.0,0.000358
2,25.0,-125.5,0.0,0.0,101850.76,0.752182,291.84247,-2.734917,-4.767798,0.0,0.000347
3,25.0,-125.25,0.0,0.0,101844.49,0.753118,291.81613,-2.635326,-4.807383,0.0,0.000348
4,25.0,-125.0,0.0,0.0,101838.52,0.751761,291.78708,-2.535123,-4.84412,0.0,0.000343


In [4]:
# Keep only the sun-info points that fall inside the lat/lon bounding box
# of the weather data set (bounds inclusive)
w_lat, w_lon = df_w["latitude"], df_w["longitude"]
within_lat = (df_s["lat_2dp"] >= w_lat.min()) & (df_s["lat_2dp"] <= w_lat.max())
within_lon = (df_s["lon_2dp"] >= w_lon.min()) & (df_s["lon_2dp"] <= w_lon.max())
df_s = df_s.loc[within_lat & within_lon].copy()

# Confirm the trimmed ranges
for label, coord_col in (("lat. range", "lat_2dp"), ("long. range", "lon_2dp")):
    print(label, df_s[coord_col].min(), df_s[coord_col].max())

lat. range 25.01 49.45
long. range -124.86 -67.02


In [5]:
# Merging the datasets
## Give both frames the same coordinate column names and order them by latitude
df_s = df_s.rename(columns = {"lat_2dp": "lat", "lon_2dp": "lon"}).sort_values(by = "lat")
df_w = df_w.rename(columns = {"latitude": "lat", "longitude": "lon"}).sort_values(by = "lat")

## Left join: every sun-info row is kept; weather columns are null wherever
## the weather data set has no row with exactly the same (lat, lon)
df_full = df_s.merge(df_w, how = "left", on = ["lat", "lon"])

df_full.head()

Unnamed: 0,lat,lon,pvo,irradiance,elevation,cvh,sd,msl,tcc,t2m,u100,v100,sf,tp
0,25.01,-80.86,4.806,229.68657,-0.1,,,,,,,,,
1,25.01,-80.38,4.742417,227.823326,-5.324144,,,,,,,,,
2,25.01,-80.46,4.754083,225.595268,-5.826279,,,,,,,,,
3,25.01,-80.5,4.768333,229.447935,-1.299589,,,,,,,,,
4,25.01,-80.58,4.764667,226.067122,-0.945366,,,,,,,,,


In [6]:
if run_na_rp:

    # Setting null weather-info values in df_full to the values found at the
    # closest lat/lon point in the weather data set.
    # Done as one vectorised fill instead of a per-row / per-column Python loop.
    start_time = time.time()

    ## Columns to fill: everything after the first five columns of df_full
    weather_cols = list(df_full.columns[5:])
    n_null_before = int(df_full[weather_cols].isna().sum().sum())

    ## Coord arrays for each data set. The query points are taken from df_full
    ## itself so that entry i of the query result always belongs to row i of
    ## df_full, independent of how the merge ordered or duplicated rows.
    hres_coords = df_full[["lat", "lon"]].to_numpy()
    lres_coords = df_w[["lat", "lon"]].to_numpy()

    # Using a KDTree from the weather data coords (lower res df)
    # NOTE(review): distance is plain Euclidean distance in degrees
    nn_t = cKDTree(lres_coords)

    # Positional index (into df_w) of the nearest weather point for each row of df_full
    dists, idxs = nn_t.query(hres_coords, k = 1)

    # Weather values of each row's nearest neighbour, labelled like df_full so
    # that fillna aligns them row-for-row and column-for-column
    nn_vals = pd.DataFrame(
        df_w[weather_cols].to_numpy()[idxs],
        index = df_full.index,
        columns = weather_cols,
    )

    # Existing (non-null) values are kept; only nulls take the neighbour's value
    df_full[weather_cols] = df_full[weather_cols].fillna(nn_vals)

    n_null_after = int(df_full[weather_cols].isna().sum().sum())
    print(f"Filled {n_null_before - n_null_after} null values across {df_full.shape[0]} rows | {time.time() - start_time:.1f} s Elapsed")

In [7]:
if run_na_rp:

    # Sanity checks: total number of null cells left, then (rows, columns)
    remaining_nulls = df_full.isna().sum().sum()
    for check_value in (remaining_nulls, df_full.shape):
        print(check_value)

    # Write the filled table to csv (without the index column)
    df_full.to_csv("../Data/csvs/full_data.csv", encoding = "utf-8", index = False)

In [9]:
# Load the exported table from csv, report its size and preview the first rows
df_full = pd.read_csv("../Data/csvs/full_data.csv")
n_rows, n_cols = df_full.shape
print(f"{n_rows} Data Points | {n_cols} Features")
df_full.head()

421883 Data Points | 14 Features


Unnamed: 0,lat,lon,pvo,irradiance,elevation,cvh,sd,msl,tcc,t2m,u100,v100,sf,tp
0,25.01,-80.86,4.806,229.68657,-0.1,0.0,0.0,101686.95,0.475711,298.0391,-2.719398,-0.084908,0.0,0.002281
1,25.01,-80.38,4.742417,227.823326,-5.324144,0.0,0.0,101687.93,0.483685,298.14987,-2.745007,-0.021706,0.0,0.002306
2,25.01,-80.46,4.754083,225.595268,-5.826279,0.0,0.0,101687.93,0.483685,298.14987,-2.745007,-0.021706,0.0,0.002306
3,25.01,-80.5,4.768333,229.447935,-1.299589,0.0,0.0,101687.93,0.483685,298.14987,-2.745007,-0.021706,0.0,0.002306
4,25.01,-80.58,4.764667,226.067122,-0.945366,0.0,0.0,101687.93,0.483685,298.14987,-2.745007,-0.021706,0.0,0.002306
