## Data cleaning

In [1]:
import pandas as pd 

In [2]:
data = pd.read_csv('/Users/yunlei/Desktop/MGMT 478/combined dataset final.csv')

In [3]:
data.head()

Unnamed: 0,fips_code,station_id,Latitude,Longitude,Elevation,Month,PRCP,AWND,TMAX,TMIN,TAVG
0,6,GHCND:USC00026250,34.1547,-114.291,128.0,2014-04-01T00:00:00,0.5,,31.8,14.54,23.17
1,6,GHCND:USC00026250,34.1547,-114.291,128.0,2016-05-01T00:00:00,0.0,,,,
2,6,GHCND:USC00026250,34.1547,-114.291,128.0,2016-06-01T00:00:00,0.0,,,,
3,6,GHCND:USC00026250,34.1547,-114.291,128.0,2016-07-01T00:00:00,5.8,,,,
4,6,GHCND:USC00026250,34.1547,-114.291,128.0,2017-06-01T00:00:00,0.0,,43.49,23.91,33.7


In [4]:
data.shape

(84039, 11)

In [5]:
data = data.drop("fips_code", axis = 1)#this column has some useless comment

In [6]:
data.shape

(84039, 10)

In [7]:
# Convert DATE column to datetime format
data['Month'] = pd.to_datetime(data['Month'])

In [8]:
# Extract year and month from DATE as new features
data['YEAR'] = data['Month'].dt.year
data['MONTH'] = data['Month'].dt.month

In [9]:
data.head()

Unnamed: 0,station_id,Latitude,Longitude,Elevation,Month,PRCP,AWND,TMAX,TMIN,TAVG,YEAR,MONTH
0,GHCND:USC00026250,34.1547,-114.291,128.0,2014-04-01,0.5,,31.8,14.54,23.17,2014,4
1,GHCND:USC00026250,34.1547,-114.291,128.0,2016-05-01,0.0,,,,,2016,5
2,GHCND:USC00026250,34.1547,-114.291,128.0,2016-06-01,0.0,,,,,2016,6
3,GHCND:USC00026250,34.1547,-114.291,128.0,2016-07-01,5.8,,,,,2016,7
4,GHCND:USC00026250,34.1547,-114.291,128.0,2017-06-01,0.0,,43.49,23.91,33.7,2017,6


In [10]:
# # Drop the 'SNOW' column
# The "SNOW" column have been dropped before we use the excel
# data_cleaned = data.drop(['SNOW'], axis=1)

In [11]:
# Convert non-numeric to numeric
for column in ['Latitude','Longitude','Elevation','AWND', 'TAVG', 'TMAX', 'TMIN','PRCP']:
    data[column] = pd.to_numeric(data[column], errors='coerce')

In [12]:
data.isnull().sum()

station_id        0
Latitude          0
Longitude         0
Elevation         0
Month             0
PRCP           1866
AWND          20609
TMAX           4203
TMIN           4222
TAVG           4261
YEAR              0
MONTH             0
dtype: int64

In [13]:
data = data.dropna(subset=['PRCP'])

In [14]:
data.shape

(82173, 12)

In [15]:
data.isnull().sum()

station_id        0
Latitude          0
Longitude         0
Elevation         0
Month             0
PRCP              0
AWND          20361
TMAX           4171
TMIN           4187
TAVG           4225
YEAR              0
MONTH             0
dtype: int64

In [16]:
from sklearn.impute import SimpleImputer

In [17]:
# Imputer missing data as median of the column
imputer = SimpleImputer(strategy='median')
data[['AWND', 'TAVG', 'TMAX', 'TMIN']] = imputer.fit_transform(data[['AWND', 'TAVG', 'TMAX', 'TMIN']])

In [18]:
data.isnull().sum()

station_id    0
Latitude      0
Longitude     0
Elevation     0
Month         0
PRCP          0
AWND          0
TMAX          0
TMIN          0
TAVG          0
YEAR          0
MONTH         0
dtype: int64

In [19]:
data.head(10)

Unnamed: 0,station_id,Latitude,Longitude,Elevation,Month,PRCP,AWND,TMAX,TMIN,TAVG,YEAR,MONTH
0,GHCND:USC00026250,34.1547,-114.291,128.0,2014-04-01,0.5,3.3,31.8,14.54,23.17,2014,4
1,GHCND:USC00026250,34.1547,-114.291,128.0,2016-05-01,0.0,3.3,17.83,5.47,11.65,2016,5
2,GHCND:USC00026250,34.1547,-114.291,128.0,2016-06-01,0.0,3.3,17.83,5.47,11.65,2016,6
3,GHCND:USC00026250,34.1547,-114.291,128.0,2016-07-01,5.8,3.3,17.83,5.47,11.65,2016,7
4,GHCND:USC00026250,34.1547,-114.291,128.0,2017-06-01,0.0,3.3,43.49,23.91,33.7,2017,6
5,GHCND:USC00026250,34.1547,-114.291,128.0,2018-04-01,0.0,3.3,33.73,15.68,24.7,2018,4
6,GHCND:USC00026250,34.1547,-114.291,128.0,2018-05-01,0.0,3.3,17.83,5.47,11.65,2018,5
7,GHCND:USC00026250,34.1547,-114.291,128.0,2018-06-01,0.0,3.3,41.71,23.74,32.73,2018,6
8,GHCND:USC00026250,34.1547,-114.291,128.0,2018-07-01,4.1,3.3,43.52,29.18,36.35,2018,7
9,GHCND:USC00026250,34.1547,-114.291,128.0,2018-10-01,24.9,3.3,31.48,17.24,24.36,2018,10


In [20]:
unique_months = data['Month'].unique().max()
unique_months

Timestamp('2023-01-01 00:00:00')

## Using a specific weather station to predict

### Get six near weather station around the specific weather station

In [21]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

stations = data[['station_id', 'Latitude', 'Longitude']].drop_duplicates()
coordinates = stations[['Latitude', 'Longitude']]
neighbors_model = NearestNeighbors(n_neighbors=6)
neighbors_model.fit(coordinates)

def six_nearest_weather_stations(latitude, longitude):
    query_coordinates = np.array([[latitude, longitude]])
    distances, indices = neighbors_model.kneighbors(query_coordinates)
    nearest_stations_info = stations.iloc[indices[0]].copy() 
    nearest_stations_info['DISTANCE(°)'] = distances[0]

    return nearest_stations_info

### Get the average value for the near weather station exclude the one used as predicted weather station

In [22]:
def average_values_for_nearest_stations_exclude(latitude, longitude, station_to_exclude):
    nearest_stations_info = six_nearest_weather_stations(latitude, longitude)
    nearest_station_ids = nearest_stations_info['station_id'].tolist()
    
    # Remove the specific station ID from the list
    if station_to_exclude in nearest_station_ids:
        nearest_station_ids.remove(station_to_exclude)
    
    filtered_data = data[data['station_id'].isin(nearest_station_ids)]
    average_values = filtered_data.groupby(['YEAR', 'MONTH'])[['AWND', 'PRCP', 'TAVG', 'TMAX', 'TMIN']].mean().reset_index()
    average_values.rename(columns={
        'AWND': 'AWND_avg',
        'PRCP': 'PRCP_avg',
        'TAVG': 'TAVG_avg',
        'TMAX': 'TMAX_avg',
        'TMIN': 'TMIN_avg'
    }, inplace=True)
    return average_values

### Expand the dataset

In [23]:
def get_analysis_data(latitude, longitude, weather_station):
    weather_station_data = data[data['station_id']== weather_station]
    
    for var in ['AWND', 'PRCP', 'TAVG', 'TMAX','TMIN']:
        for year in range(1, 4):
            year_lag = year*12
            weather_station_data[f'{var}_lag_{year}_year'] = weather_station_data[var].shift(year_lag)
            
    neighbor_data = average_values_for_nearest_stations_exclude(latitude, longitude, weather_station)
    merged_data = pd.merge(weather_station_data, neighbor_data, on=['YEAR', 'MONTH'], how='inner')
    
    for var in ['AWND_avg', 'PRCP_avg', 'TAVG_avg', 'TMAX_avg','TMIN_avg']:
        for year in range(1, 4):
            year_lag = year*12
            merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)
    
    merged_data_final = merged_data.drop(columns=['AWND', 'TAVG', 'TMAX', 'TMIN', 'AWND_avg', 'PRCP_avg', 'TAVG_avg','TMAX_avg','TMIN_avg'])
    merged_data_final = merged_data_final.dropna()
    return merged_data_final

In [24]:
data[data['station_id']=='GHCND:USW00014835']

Unnamed: 0,station_id,Latitude,Longitude,Elevation,Month,PRCP,AWND,TMAX,TMIN,TAVG,YEAR,MONTH
31245,GHCND:USW00014835,40.4124,-86.9474,181.7,2014-01-01,38.9,4.5,-1.98,-12.31,-7.14,2014,1
31246,GHCND:USW00014835,40.4124,-86.9474,181.7,2014-02-01,61.8,3.7,-2.14,-12.60,-7.37,2014,2
31247,GHCND:USW00014835,40.4124,-86.9474,181.7,2014-03-01,38.2,4.1,6.36,-5.25,0.55,2014,3
31248,GHCND:USW00014835,40.4124,-86.9474,181.7,2014-04-01,106.7,4.3,16.74,4.22,10.48,2014,4
31249,GHCND:USW00014835,40.4124,-86.9474,181.7,2014-05-01,96.2,3.5,22.61,10.78,16.70,2014,5
...,...,...,...,...,...,...,...,...,...,...,...,...
31349,GHCND:USW00014835,40.4124,-86.9474,181.7,2022-09-01,28.2,2.3,25.15,12.68,18.92,2022,9
31350,GHCND:USW00014835,40.4124,-86.9474,181.7,2022-10-01,44.5,3.1,18.68,5.01,11.85,2022,10
31351,GHCND:USW00014835,40.4124,-86.9474,181.7,2022-11-01,37.6,3.4,11.49,0.31,5.90,2022,11
31352,GHCND:USW00014835,40.4124,-86.9474,181.7,2022-12-01,13.7,3.9,3.59,-4.29,-0.35,2022,12


In [25]:
data[data['station_id']=='GHCND:USW00014835']['PRCP'].max()

233.8

In [26]:
data[data['station_id']=='GHCND:USW00014835']['PRCP'].min()

11.0

In [27]:
pd.set_option('display.max_columns', None)

In [28]:
get_analysis_data(40.41236,-86.94739,'GHCND:USW00014835')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_station_data[f'{var}_lag_{year}_year'] = weather_station_data[var].shift(year_lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_station_data[f'{var}_lag_{year}_year'] = weather_station_data[var].shift(year_lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_station_data[f'

Unnamed: 0,station_id,Latitude,Longitude,Elevation,Month,PRCP,YEAR,MONTH,AWND_lag_1_year,AWND_lag_2_year,AWND_lag_3_year,PRCP_lag_1_year,PRCP_lag_2_year,PRCP_lag_3_year,TAVG_lag_1_year,TAVG_lag_2_year,TAVG_lag_3_year,TMAX_lag_1_year,TMAX_lag_2_year,TMAX_lag_3_year,TMIN_lag_1_year,TMIN_lag_2_year,TMIN_lag_3_year,AWND_avg_lag_1_year,AWND_avg_lag_2_year,AWND_avg_lag_3_year,PRCP_avg_lag_1_year,PRCP_avg_lag_2_year,PRCP_avg_lag_3_year,TAVG_avg_lag_1_year,TAVG_avg_lag_2_year,TAVG_avg_lag_3_year,TMAX_avg_lag_1_year,TMAX_avg_lag_2_year,TMAX_avg_lag_3_year,TMIN_avg_lag_1_year,TMIN_avg_lag_2_year,TMIN_avg_lag_3_year
36,GHCND:USW00014835,40.4124,-86.9474,181.7,2017-01-01,101.7,2017,1,4.0,3.4,4.5,27.4,33.1,38.9,-2.78,-4.93,-7.14,1.95,-0.42,-1.98,-7.51,-9.44,-12.31,4.400,3.850,4.850,20.725000,39.425,47.800,-1.985000,-3.8375,-7.0600,2.610000,0.6450,-1.6925,-6.572500,-8.3150,-12.4250
37,GHCND:USW00014835,40.4124,-86.9474,181.7,2017-02-01,23.0,2017,2,4.1,3.9,3.7,41.9,11.0,61.8,-0.07,-8.00,-7.37,4.65,-2.80,-2.14,-4.78,-13.20,-12.60,4.550,4.150,3.900,39.450000,25.325,58.175,1.022500,-7.3875,-6.3150,5.602500,-2.1975,-1.3975,-3.550000,-12.5775,-11.2300
38,GHCND:USW00014835,40.4124,-86.9474,181.7,2017-03-01,93.3,2017,3,3.9,3.6,4.1,109.0,20.0,38.2,8.00,2.33,0.55,13.32,7.93,6.36,2.68,-3.26,-5.25,4.150,3.650,4.275,86.425000,67.550,38.475,8.805000,3.1025,1.3575,14.092500,8.4825,7.2575,3.520000,-2.2775,-4.5425
39,GHCND:USW00014835,40.4124,-86.9474,181.7,2017-04-01,116.8,2017,4,4.0,3.9,4.3,79.7,58.6,106.7,10.26,10.79,10.48,16.41,17.13,16.74,4.11,4.44,4.22,4.075,4.125,4.450,113.200000,100.575,130.675,11.315000,11.7275,11.1175,17.522500,17.6475,17.2400,5.102500,5.8025,4.9900
40,GHCND:USW00014835,40.4124,-86.9474,181.7,2017-05-01,184.0,2017,5,3.1,3.1,3.5,49.7,104.8,96.2,15.65,17.69,16.70,21.72,23.74,22.61,9.57,11.64,10.78,3.500,3.725,3.750,71.375000,78.200,98.750,16.580000,18.5375,17.0650,22.132500,24.1525,22.8200,11.027500,12.9225,11.3025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,GHCND:USW00014835,40.4124,-86.9474,181.7,2022-09-01,28.2,2022,9,2.7,2.6,2.2,69.7,55.6,58.5,20.89,19.81,21.52,27.47,26.12,27.63,14.32,13.50,15.41,3.200,2.925,2.650,65.375000,20.350,56.425,21.120000,19.1175,22.4275,27.232500,25.3950,28.4525,15.010000,12.8425,16.4000
105,GHCND:USW00014835,40.4124,-86.9474,181.7,2022-10-01,44.5,2022,10,2.6,3.2,3.1,233.8,74.0,88.9,14.96,12.57,12.35,19.68,18.83,18.46,10.24,6.32,6.25,3.150,3.350,3.500,162.125000,115.500,116.675,15.880000,11.9575,13.0000,20.625000,17.9600,18.7225,11.135000,5.9500,7.2775
106,GHCND:USW00014835,40.4124,-86.9474,181.7,2022-11-01,37.6,2022,11,3.0,3.5,3.1,32.1,67.8,42.7,4.07,9.15,2.44,9.15,14.84,7.27,-1.02,3.47,-2.39,3.600,4.100,3.600,31.800000,83.825,65.150,4.572500,8.4800,2.9375,9.747500,14.1225,7.5850,-0.605000,2.8350,-1.7100
107,GHCND:USW00014835,40.4124,-86.9474,181.7,2022-12-01,13.7,2022,12,3.4,3.3,3.2,73.8,51.7,60.3,4.25,1.80,1.90,9.59,6.09,7.02,-1.09,-2.50,-3.22,3.950,3.575,3.675,96.400000,46.650,63.550,4.905000,4.1225,2.4950,10.220000,8.8850,7.3875,-0.410000,-0.6375,-2.3975


In [29]:
merged_data_final = get_analysis_data(40.41236,-86.94739,'GHCND:USW00014835')
merged_data_final.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_station_data[f'{var}_lag_{year}_year'] = weather_station_data[var].shift(year_lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_station_data[f'{var}_lag_{year}_year'] = weather_station_data[var].shift(year_lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_station_data[f'

Index(['station_id', 'Latitude', 'Longitude', 'Elevation', 'Month', 'PRCP',
       'YEAR', 'MONTH', 'AWND_lag_1_year', 'AWND_lag_2_year',
       'AWND_lag_3_year', 'PRCP_lag_1_year', 'PRCP_lag_2_year',
       'PRCP_lag_3_year', 'TAVG_lag_1_year', 'TAVG_lag_2_year',
       'TAVG_lag_3_year', 'TMAX_lag_1_year', 'TMAX_lag_2_year',
       'TMAX_lag_3_year', 'TMIN_lag_1_year', 'TMIN_lag_2_year',
       'TMIN_lag_3_year', 'AWND_avg_lag_1_year', 'AWND_avg_lag_2_year',
       'AWND_avg_lag_3_year', 'PRCP_avg_lag_1_year', 'PRCP_avg_lag_2_year',
       'PRCP_avg_lag_3_year', 'TAVG_avg_lag_1_year', 'TAVG_avg_lag_2_year',
       'TAVG_avg_lag_3_year', 'TMAX_avg_lag_1_year', 'TMAX_avg_lag_2_year',
       'TMAX_avg_lag_3_year', 'TMIN_avg_lag_1_year', 'TMIN_avg_lag_2_year',
       'TMIN_avg_lag_3_year'],
      dtype='object')

In [30]:
years = merged_data_final['YEAR'].unique()
years[0]

2017

### Lasso: top 5 feature selection

In [31]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

def lasso_mse(latitude, longitude, weather_station):
    
    merged_data_final = get_analysis_data(latitude, longitude, weather_station)
        
    # Define the training and testing sets
    train_df = merged_data_final[merged_data_final['YEAR'].between(2017, 2021)]
    test_df = merged_data_final[merged_data_final['YEAR'] == 2022]

    # Remove rows with missing values
    train_df = train_df.dropna()
    test_df = test_df.dropna()

    # Select features and target variable
    X_train = train_df.drop(columns=['station_id', 'Latitude', 'Longitude', 'Elevation', 'Month', 'PRCP', 'YEAR'])
    y_train = train_df['PRCP']
    X_test = test_df[X_train.columns]
    y_test = test_df['PRCP']

    # Use LassoCV for feature selection and model fitting
    ## expand the cross validation to 10 and max iteration to 10000
    lasso = LassoCV(cv=10, max_iter=10000).fit(X_train, y_train)

    # Predict using the model
    y_pred_train = lasso.predict(X_train)
    y_pred_test = lasso.predict(X_test)

    # Calculate and store the MSE value
    mse_test = mean_squared_error(y_test, y_pred_test)
    mse_train = mean_squared_error(y_train, y_pred_train)
        
    X_pred = test_df.drop(columns=['station_id', 'Latitude', 'Longitude', 'Elevation', 'Month', 'PRCP', 'YEAR'])
    Y_pred = np.round(lasso.predict(X_pred), 4)
    
    return mse_train, mse_test, Y_pred.tolist()

In [32]:
six_nearest_weather_stations(40.41236,-86.94739)



Unnamed: 0,station_id,Latitude,Longitude,DISTANCE(°)
31245,GHCND:USW00014835,40.4124,-86.9474,4.1e-05
54850,GHCND:USW00053842,39.8248,-86.2958,0.87738
70199,GHCND:USW00093819,39.7252,-86.2816,0.956799
70526,GHCND:USW00093823,39.4429,-87.3221,1.039356
9758,GHCND:USW00003868,39.4429,-87.3221,1.039356
16847,GHCND:USW00004846,41.4535,-86.998,1.042369


In [33]:
lasso_mse(40.41236,-86.94739,'GHCND:USW00014835') # Purdue Airport

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_station_data[f'{var}_lag_{year}_year'] = weather_station_data[var].shift(year_lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_station_data[f'{var}_lag_{year}_year'] = weather_station_data[var].shift(year_lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_station_data[f'

(1267.6962728433825,
 2177.7718671731786,
 [89.9137,
  67.1152,
  86.1237,
  78.0798,
  109.7686,
  97.0245,
  96.4973,
  68.4899,
  56.729,
  103.2333,
  75.6619,
  69.5969])

In [34]:
six_nearest_weather_stations(41.96017,-87.93164)



Unnamed: 0,station_id,Latitude,Longitude,DISTANCE(°)
16738,GHCND:USW00004845,42.5953,-87.9383,0.635165
78719,GHCND:USW00094818,42.7586,-87.8178,0.806505
31571,GHCND:USW00014839,42.9549,-87.9045,0.9951
16847,GHCND:USW00004846,41.4535,-86.998,1.062261
79618,GHCND:USW00094871,42.1289,-86.4152,1.525798
32225,GHCND:USW00014848,41.7072,-86.3163,1.635028


In [35]:
data[data['station_id']=='GHCND:USW00094846']

Unnamed: 0,station_id,Latitude,Longitude,Elevation,Month,PRCP,AWND,TMAX,TMIN,TAVG,YEAR,MONTH


In [36]:
#lasso_mse(41.96017,-87.93164,'GHCND:USW00094846') # Chicago Ohare:No record

In [37]:
station_counts = data.groupby('station_id').size()

In [38]:
station_counts

station_id
GHCND:USC00026250     32
GHCND:USC00040010    108
GHCND:USC00040014      9
GHCND:USC00040029    109
GHCND:USC00040136    109
                    ... 
GHCND:USW00096405     47
GHCND:USW00096406     73
GHCND:USW00096407     79
GHCND:USW00096408     83
GHCND:USW00096409     54
Length: 891, dtype: int64

### Loop all the valid weather stations

Filter weather stations if it has records from 2010 to 2023.

In [39]:
stations_with_enough_records = station_counts[station_counts >= 109].index

In [40]:
valid_stations = data[
    (data['YEAR'] == 2014) & 
    (data['station_id'].isin(stations_with_enough_records))
][['station_id', 'Latitude', 'Longitude']].drop_duplicates()

In [41]:
valid_stations 

Unnamed: 0,station_id,Latitude,Longitude
149,GHCND:USC00040029,41.1933,-120.9450
258,GHCND:USC00040136,32.8358,-116.7770
367,GHCND:USC00040144,34.1821,-118.1380
637,GHCND:USC00040192,33.8647,-117.8420
746,GHCND:USC00140010,38.9266,-97.2129
...,...,...,...
81677,GHCND:USW00094958,41.6234,-98.9489
81894,GHCND:USW00094961,48.7261,-94.6122
82112,GHCND:USW00094967,46.8997,-95.0668
82480,GHCND:USW00094978,41.7630,-96.1797


In [42]:
# Apply the function and create a new column 'results' to store the tuples
valid_stations['results'] = valid_stations.apply(lambda row: lasso_mse(row['Latitude'], row['Longitude'], row['station_id']), axis=1)

# Now, split the 'results' column into two separate columns
valid_stations[['overall_mse_train','overall_mse_test', 'Pred_2023']] = pd.DataFrame(valid_stations['results'].tolist(), index=valid_stations.index)

# Optionally, drop the 'results' column if it's no longer needed
valid_stations.drop(columns=['results'], inplace=True)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

In [43]:
pd.set_option('display.width', None) 
pd.set_option('display.max_colwidth', None) 

In [44]:
valid_stations

Unnamed: 0,station_id,Latitude,Longitude,overall_mse_train,overall_mse_test,Pred_2023
149,GHCND:USC00040029,41.1933,-120.9450,639.919298,949.003635,"[53.8477, 53.896, 50.5062, 38.8573, 29.2056, 19.572, 13.5941, 14.0819, 13.7476, 19.6699, 46.0836, 36.67]"
258,GHCND:USC00040136,32.8358,-116.7770,1995.381327,703.955014,"[40.0327, 23.433, 78.1416, 59.2307, 25.4007, 17.0783, 11.2837, 12.1438, 14.7395, 30.8742, 18.0329, 53.3168]"
367,GHCND:USC00040144,34.1821,-118.1380,4214.146254,1189.875790,"[28.922, 27.6956, 67.7121, 74.0895, 29.5528, 26.8897, 26.8897, 26.8897, 26.8897, 26.8897, 27.6956, 40.1701]"
637,GHCND:USC00040192,33.8647,-117.8420,1915.085703,430.731950,"[20.5993, 17.0773, 44.3866, 34.3471, 18.8227, 16.7099, 16.618, 16.618, 17.2372, 17.0365, 17.3835, 32.4906]"
746,GHCND:USC00140010,38.9266,-97.2129,3157.797167,2290.340205,"[54.2027, 46.5818, 68.0146, 64.0767, 128.0819, 79.5512, 96.506, 91.5762, 63.3111, 61.1996, 49.1478, 46.1179]"
...,...,...,...,...,...,...
81677,GHCND:USW00094958,41.6234,-98.9489,1718.092668,1861.300558,"[31.2671, 14.128, 72.2063, 45.2925, 81.5006, 84.6225, 120.9977, 65.7544, 85.1416, 47.1193, 35.9639, 24.9682]"
81894,GHCND:USW00094961,48.7261,-94.6122,837.108818,679.133085,"[7.4281, 6.9981, 17.3398, 45.2885, 47.0399, 54.6063, 56.9798, 79.6224, 45.8182, 53.6015, 24.8969, 11.2106]"
82112,GHCND:USW00094967,46.8997,-95.0668,730.047748,1313.327990,"[0.4555, 4.9434, 30.5534, 44.4873, 79.1403, 88.6703, 96.3506, 70.3823, 108.2983, 58.8049, 37.709, 14.3132]"
82480,GHCND:USW00094978,41.7630,-96.1797,1549.503572,1005.412358,"[29.5275, 28.9905, 75.3481, 41.3022, 103.9502, 66.7867, 77.2848, 70.9389, 74.4404, 72.199, 28.9788, 34.472]"


In [45]:
valid_stations['overall_mse_train'].mean()

2200.8116737074993

In [46]:
valid_stations['overall_mse_test'].mean()

2068.4377266984015

In [47]:
six_nearest_weather_stations(47.78000,-121.70000) #ALPINE MEADOWS, WA US



Unnamed: 0,station_id,Latitude,Longitude,DISTANCE(°)
76762,GHCND:USW00094290,47.6872,-122.255,0.562705
76326,GHCND:USW00094248,47.4951,-122.214,0.587677
14507,GHCND:USW00004205,48.1606,-122.159,0.59627
45443,GHCND:USW00024222,47.9232,-122.283,0.600329
46532,GHCND:USW00024237,47.2767,-121.337,0.620548
46314,GHCND:USW00024234,47.5455,-122.315,0.658191


In [48]:
six_nearest_weather_stations(48.88000,-121.26000) #BEAVER PASS, WA US



Unnamed: 0,station_id,Latitude,Longitude,DISTANCE(°)
14574,GHCND:USW00004223,48.5405,-121.446,0.387113
14507,GHCND:USW00004205,48.1606,-122.159,1.151407
45007,GHCND:USW00024217,48.7991,-122.541,1.283552
45443,GHCND:USW00024222,47.9232,-122.283,1.400712
76762,GHCND:USW00094290,47.6872,-122.255,1.553318
46532,GHCND:USW00024237,47.2767,-121.337,1.605148
