## Data cleaning

In [1]:
import pandas as pd

In [2]:
# Load the combined weather-station workbook into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact file exists.
data = pd.read_excel('/Users/yunlei/Desktop/MGMT 478/Combined dataset_nonsort 2 .xlsx')

In [3]:
data.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,TAVG,TMAX,TMIN,Unnamed: 11
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-01,8.5,1.01,23.5,29.5,17.5,
1,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-02,7.6,0.61,26.0,32.8,19.1,
2,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-03,7.2,3.22,44.8,55.1,34.6,
3,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-04,8.1,2.49,58.1,70.4,45.8,
4,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-05,6.7,5.55,64.7,75.2,54.2,


In [4]:
data.shape

(4992, 12)

In [5]:
# "Unnamed: 11" only holds stray comments from the spreadsheet, so discard it.
data = data.drop(columns=["Unnamed: 11"])

In [6]:
data.shape

(4992, 11)

In [7]:
from sklearn.impute import SimpleImputer

In [8]:
# Parse the DATE strings into pandas datetime values
data = data.assign(DATE=pd.to_datetime(data['DATE']))

In [9]:
# Derive calendar YEAR and MONTH columns from the parsed DATE
data = data.assign(
    YEAR=data['DATE'].dt.year,
    MONTH=data['DATE'].dt.month,
)

In [10]:
data.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,TAVG,TMAX,TMIN,YEAR,MONTH
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-01-01,8.5,1.01,23.5,29.5,17.5,2010,1
1,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-02-01,7.6,0.61,26.0,32.8,19.1,2010,2
2,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-03-01,7.2,3.22,44.8,55.1,34.6,2010,3
3,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-04-01,8.1,2.49,58.1,70.4,45.8,2010,4
4,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-05-01,6.7,5.55,64.7,75.2,54.2,2010,5


In [11]:
# # Drop the 'SNOW' column
# Not needed: the "SNOW" column was already removed from the Excel file before it was loaded.
# data_cleaned = data.drop(['SNOW'], axis=1)

In [12]:
# Coerce every measurement/coordinate column to a numeric dtype; entries that
# cannot be parsed become NaN (errors='coerce').
# PRCP is included as well: it is imputed together with the other measurements
# and used as the prediction target, so it must be numeric too. For a column
# that is already numeric this conversion is a no-op.
numeric_columns = ['LATITUDE', 'LONGITUDE', 'ELEVATION', 'AWND', 'PRCP', 'TAVG', 'TMAX', 'TMIN']
for column in numeric_columns:
    data[column] = pd.to_numeric(data[column], errors='coerce')

In [13]:
data.isnull().sum()

STATION       0
NAME          0
LATITUDE      0
LONGITUDE     0
ELEVATION     0
DATE          0
AWND          5
PRCP         37
TAVG         17
TMAX         25
TMIN         25
YEAR          0
MONTH         0
dtype: int64

In [14]:
# Impute missing measurements with the median of each column
# (the median is computed over all rows of the column, i.e. across all stations).
impute_cols = ['AWND', 'TAVG', 'TMAX', 'TMIN', 'PRCP']
imputer = SimpleImputer(strategy='median')
data[impute_cols] = imputer.fit_transform(data[impute_cols])

In [15]:
data.isnull().sum()

STATION      0
NAME         0
LATITUDE     0
LONGITUDE    0
ELEVATION    0
DATE         0
AWND         0
PRCP         0
TAVG         0
TMAX         0
TMIN         0
YEAR         0
MONTH        0
dtype: int64

In [16]:
data.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,TAVG,TMAX,TMIN,YEAR,MONTH
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-01-01,8.5,1.01,23.5,29.5,17.5,2010,1
1,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-02-01,7.6,0.61,26.0,32.8,19.1,2010,2
2,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-03-01,7.2,3.22,44.8,55.1,34.6,2010,3
3,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-04-01,8.1,2.49,58.1,70.4,45.8,2010,4
4,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-05-01,6.7,5.55,64.7,75.2,54.2,2010,5


In [17]:
# Largest PRCP value recorded for station USW00014835
data.loc[data['STATION'] == 'USW00014835', 'PRCP'].max()

9.61

In [19]:
# Smallest PRCP value recorded for station USW00014835
data.loc[data['STATION'] == 'USW00014835', 'PRCP'].min()

0.43

## Using a specific weather station to predict

### Get the six nearest weather stations around the specific weather station

In [17]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

# One row per distinct (station, name, coordinates) combination
stations = data.loc[:, ['STATION', 'NAME', 'LATITUDE', 'LONGITUDE']].drop_duplicates()
# Coordinates are used as plain 2-D points (degrees) for the neighbour search
coordinates = stations.loc[:, ['LATITUDE', 'LONGITUDE']]
neighbors_model = NearestNeighbors(n_neighbors=6).fit(coordinates)

def six_nearest_weather_stations(latitude, longitude):
    """Return the six stations closest to a (latitude, longitude) point.

    The lookup uses the module-level ``neighbors_model`` fitted on the
    LATITUDE/LONGITUDE columns of ``stations``. Distances are whatever that
    model computes on the raw coordinate values, so they are expressed in
    degrees rather than kilometres. If the query point is itself a station,
    that station is part of the result with distance 0.

    Parameters
    ----------
    latitude, longitude : float
        Coordinates of the query point, in degrees.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``stations`` (nearest first) with an extra
        'DISTANCE(°)' column.
    """
    # Query with the same column labels the model was fitted on; passing a bare
    # ndarray to a model fitted on a DataFrame makes scikit-learn emit a
    # "X does not have valid feature names" warning.
    query_coordinates = pd.DataFrame(
        [[latitude, longitude]], columns=['LATITUDE', 'LONGITUDE']
    )
    distances, indices = neighbors_model.kneighbors(query_coordinates)

    # kneighbors returns positional indices, hence iloc; copy so that adding
    # the distance column never touches the shared `stations` frame.
    nearest_stations_info = stations.iloc[indices[0]].copy()
    nearest_stations_info['DISTANCE(°)'] = distances[0]

    return nearest_stations_info

### Get the average values for the nearest weather stations, excluding the station being predicted

In [18]:
def average_values_for_nearest_stations_exclude(latitude, longitude, station_to_exclude):
    """Average the neighbours' measurements per (YEAR, MONTH).

    Looks up the six nearest stations to the given point, removes
    ``station_to_exclude`` from that list when present, and averages
    AWND, PRCP, TAVG, TMAX and TMIN over the remaining stations for every
    (YEAR, MONTH) pair found in ``data``.

    Returns a DataFrame with columns YEAR, MONTH and the five averages,
    each suffixed with ``_avg``.
    """
    measure_cols = ['AWND', 'PRCP', 'TAVG', 'TMAX', 'TMIN']

    neighbour_ids = six_nearest_weather_stations(latitude, longitude)['STATION'].tolist()

    # The station being predicted must not contribute to its own neighbour average
    if station_to_exclude in neighbour_ids:
        neighbour_ids.remove(station_to_exclude)

    neighbour_rows = data[data['STATION'].isin(neighbour_ids)]
    monthly_means = neighbour_rows.groupby(['YEAR', 'MONTH'])[measure_cols].mean()
    monthly_means = monthly_means.reset_index()

    return monthly_means.rename(columns={col: f'{col}_avg' for col in measure_cols})

### Expand the dataset

In [19]:
def get_analysis_data(latitude, longitude, weather_station):
    """Build the modelling table for one station.

    For the rows of ``weather_station`` the function adds 1- to 5-year lags of
    the station's own measurements, joins the (YEAR, MONTH) averages of the
    neighbouring stations, adds 1- to 5-year lags of those averages, and then
    drops the same-period predictors so that only PRCP (the target), MONTH,
    the identifying columns and the lagged features remain.

    Parameters
    ----------
    latitude, longitude : float
        Coordinates used to look up the neighbouring stations.
    weather_station : str
        STATION id of the station to model; it is excluded from the
        neighbour averages.

    Returns
    -------
    pandas.DataFrame
        One row per station month that has every lag available (rows with any
        missing value are dropped).

    Notes
    -----
    Lags are produced with ``shift(12 * k)``, i.e. by row position. They equal
    "same month, k years earlier" only when the rows are consecutive monthly
    records; a missing or duplicated month shifts every later lag.
    NOTE(review): consider a calendar-keyed lag if stations have gaps.
    """
    station_vars = ['AWND', 'PRCP', 'TAVG', 'TMAX', 'TMIN']

    # .copy() gives an independent frame, so adding the lag columns below no
    # longer raises SettingWithCopyWarning. Sorting by DATE makes the row-based
    # shift independent of the row order of `data`.
    weather_station_data = (
        data[data['STATION'] == weather_station]
        .sort_values('DATE', kind='stable')
        .copy()
    )

    for var in station_vars:
        for year in range(1, 6):
            year_lag = year * 12  # monthly rows -> 12 rows per year
            weather_station_data[f'{var}_lag_{year}_year'] = weather_station_data[var].shift(year_lag)

    neighbor_data = average_values_for_nearest_stations_exclude(latitude, longitude, weather_station)
    # The inner join keeps only the months present for both the station and its neighbours
    merged_data = pd.merge(weather_station_data, neighbor_data, on=['YEAR', 'MONTH'], how='inner')

    neighbor_vars = [f'{var}_avg' for var in station_vars]
    for var in neighbor_vars:
        for year in range(1, 6):
            year_lag = year * 12
            merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)

    # Remove same-period measurements (station and neighbours); PRCP stays as the target
    same_period_cols = ['AWND', 'TAVG', 'TMAX', 'TMIN'] + neighbor_vars
    merged_data_final = merged_data.drop(columns=same_period_cols).dropna()
    return merged_data_final

### Lasso: top 5 feature selection

In [20]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

def lasso_mse(latitude, longitude, weather_station):
    """Fit rolling-window LassoCV models for one station and score them.

    For every ``year`` from (first available year + 4) up to 2022 a LassoCV
    model is trained on the rows whose YEAR lies in ``[year - 4, year]`` and
    evaluated on the rows of ``year + 1``. Predictors are MONTH plus all the
    lagged columns produced by ``get_analysis_data``; the target is PRCP.

    Parameters
    ----------
    latitude, longitude : float
        Coordinates of the station (used to find its neighbours).
    weather_station : str
        STATION id of the station to model.

    Returns
    -------
    tuple
        ``(overall_mse_train, overall_mse_test, Pred_2024, best_year)`` where
        the two MSEs are computed over the pooled predictions of all windows,
        ``Pred_2024`` is a list of predictions rounded to 4 decimals, and
        ``best_year`` is the last training year of the window whose model had
        the lowest test MSE.

    Raises
    ------
    ValueError
        If the station has no usable rows or no window could be fitted.

    Notes
    -----
    * Training windows overlap, so a given row is counted in several windows
      when the pooled training MSE is computed.
    * NOTE(review): the "best" model is selected by its *test* MSE, so the
      selection itself uses the evaluation data.
    * NOTE(review): ``Pred_2024`` is obtained by predicting on the rows with
      YEAR == 2023, whose lag features refer to 2022 and earlier; confirm this
      is the intended input for a 2024 forecast.
    """
    merged_data_final = get_analysis_data(latitude, longitude, weather_station)
    if merged_data_final.empty:
        raise ValueError(f"No usable rows for station {weather_station!r}")

    # Identifier and target columns that must not be used as predictors
    non_feature_cols = ['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'PRCP', 'YEAR']
    cv_folds = 10

    # Start from +inf so the first fitted window always becomes the current best
    best_mse = float('inf')
    best_model = None  # holds the model with the lowest test MSE
    best_year = None

    # Pooled actual/predicted values over all windows
    all_actuals_train = []
    all_predictions_train = []
    all_actuals_test = []
    all_predictions_test = []

    # First window needs five years of data: [first_year, first_year + 4]
    start_year = int(merged_data_final['YEAR'].min()) + 4
    end_year = 2022

    for year in range(start_year, end_year + 1):
        # Five-year training window and the following year as the test set
        train_df = merged_data_final[merged_data_final['YEAR'].between(year - 4, year)].dropna()
        test_df = merged_data_final[merged_data_final['YEAR'] == year + 1].dropna()

        # Skip windows that cannot be cross-validated or scored
        if len(train_df) < cv_folds or test_df.empty:
            continue

        X_train = train_df.drop(columns=non_feature_cols)
        y_train = train_df['PRCP']
        X_test = test_df[X_train.columns]
        y_test = test_df['PRCP']

        # 10-fold cross-validation chooses the regularisation strength;
        # max_iter is raised to 10000 for the coordinate-descent solver
        lasso = LassoCV(cv=cv_folds, max_iter=10000).fit(X_train, y_train)

        y_pred_train = lasso.predict(X_train)
        y_pred_test = lasso.predict(X_test)

        mse = mean_squared_error(y_test, y_pred_test)

        # Keep the model with the lowest test MSE seen so far
        if mse < best_mse:
            best_mse = mse
            best_model = lasso
            best_year = year

        all_actuals_train.extend(y_train.tolist())
        all_predictions_train.extend(y_pred_train.tolist())
        all_actuals_test.extend(y_test.tolist())
        all_predictions_test.extend(y_pred_test.tolist())

    if best_model is None:
        raise ValueError(
            f"No training window could be fitted for station {weather_station!r}"
        )

    # Predict from the YEAR == 2023 rows with the best model (four decimals)
    pred_df = merged_data_final[merged_data_final['YEAR'] == 2023]
    X_pred = pred_df.drop(columns=non_feature_cols)
    if X_pred.empty:
        Pred_2024 = []
    else:
        Pred_2024 = np.round(best_model.predict(X_pred), 4).tolist()

    # Pooled MSE over every window
    overall_mse_train = mean_squared_error(all_actuals_train, all_predictions_train)
    overall_mse_test = mean_squared_error(all_actuals_test, all_predictions_test)

    return overall_mse_train, overall_mse_test, Pred_2024, best_year

In [21]:
six_nearest_weather_stations(40.41236,-86.94739)



Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,DISTANCE(°)
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,0.0
671,USW00093819,"INDIANAPOLIS INTERNATIONAL AIRPORT, IN US",39.72515,-86.2816,0.956835
336,USC00120784,"BLOOMINGTON INDIANA UNIVERSITY, IN US",39.17399,-86.52076,1.309799
839,USW00053866,"SHELBYVILLE MUNICIPAL AIRPORT, IN US",39.58545,-85.79982,1.41446
168,USW00014848,"SOUTH BEND AIRPORT, IN US",41.70722,-86.31628,1.440473
503,USW00014827,"FORT WAYNE INTERNATIONAL AIRPORT, IN US",40.97248,-85.20636,1.828912


In [22]:
lasso_mse(40.41236,-86.94739,'USW00014835') # Purdue Airport

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_station_data[f'{var}_lag_{year}_year'] = weather_station_data[var].shift(year_lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_station_data[f'{var}_lag_{year}_year'] = weather_station_data[var].shift(year_lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_station_data[f'

(1.7862871041920205,
 2.9166415310835068,
 [2.2772,
  2.4038,
  2.7387,
  3.2746,
  3.842,
  4.248,
  4.4493,
  4.2201,
  4.0378,
  3.5163,
  2.7213,
  2.6714],
 2019)

In [23]:
six_nearest_weather_stations(41.96017,-87.93164)



Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,DISTANCE(°)
1345,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.96017,-87.93164,0.0
1176,USW00094892,"CHICAGO WEST CHICAGO DUPAGE AIRPORT, IL US",41.89641,-88.25119,0.325849
168,USW00014848,"SOUTH BEND AIRPORT, IN US",41.70722,-86.31628,1.635045
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,1.834247
671,USW00093819,"INDIANAPOLIS INTERNATIONAL AIRPORT, IN US",39.72515,-86.2816,2.778119
503,USW00014827,"FORT WAYNE INTERNATIONAL AIRPORT, IN US",40.97248,-85.20636,2.898738


In [24]:
lasso_mse(41.96017,-87.93164,'USW00094846') # Chicago Ohare

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_station_data[f'{var}_lag_{year}_year'] = weather_station_data[var].shift(year_lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_station_data[f'{var}_lag_{year}_year'] = weather_station_data[var].shift(year_lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_station_data[f'

(3.4122847790249393,
 3.931411206861204,
 [2.7187,
  2.41,
  3.4443,
  3.8959,
  4.5658,
  4.9497,
  4.2622,
  4.3353,
  4.4737,
  4.4536,
  2.5781,
  2.5009],
 2021)

In [25]:
# Number of rows in `data` for each STATION id
station_counts = data.groupby('STATION').size()

In [26]:
station_counts

STATION
USC00114355    168
USC00116011    120
USC00120784    167
USC00174193    119
USC00174927    121
USC00213104    135
USC00251825    169
USC00459376    170
USC00473186    167
USS0021A01S    170
USS0021B48S    170
USS0021C28S    170
USW00003947    170
USW00004205     82
USW00013866    170
USW00014606    121
USW00014827    168
USW00014835    168
USW00014848    168
USW00023174    169
USW00023188    169
USW00023234    169
USW00053866    168
USW00054772    121
USW00093009    169
USW00093193    169
USW00093225    169
USW00093810    169
USW00093819    168
USW00094626    121
USW00094846    169
USW00094892    169
dtype: int64

### Loop over all the valid weather stations

Keep only the weather stations that have records from 2010 to 2023.

In [27]:
# Keep the ids of stations with at least 167 rows.
# NOTE(review): 14 years x 12 months = 168, so 167 presumably tolerates one
# missing month — confirm the intended threshold.
stations_with_enough_records = station_counts[station_counts >= 167].index

In [28]:
# Stations that pass the record-count threshold and have at least one row in 2010,
# reduced to one line per distinct (id, name, coordinates) combination
valid_stations = data.loc[
    data['YEAR'].eq(2010) & data['STATION'].isin(stations_with_enough_records),
    ['STATION', 'NAME', 'LATITUDE', 'LONGITUDE'],
].drop_duplicates()

In [29]:
valid_stations 

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739
168,USW00014848,"SOUTH BEND AIRPORT, IN US",41.70722,-86.31628
336,USC00120784,"BLOOMINGTON INDIANA UNIVERSITY, IN US",39.17399,-86.52076
503,USW00014827,"FORT WAYNE INTERNATIONAL AIRPORT, IN US",40.97248,-85.20636
671,USW00093819,"INDIANAPOLIS INTERNATIONAL AIRPORT, IN US",39.72515,-86.2816
839,USW00053866,"SHELBYVILLE MUNICIPAL AIRPORT, IN US",39.58545,-85.79982
1007,USW00093810,"CARBONDALE SOUTHERN ILLINOIS AIRPORT, IL US",37.78329,-89.24533
1176,USW00094892,"CHICAGO WEST CHICAGO DUPAGE AIRPORT, IL US",41.89641,-88.25119
1345,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.96017,-87.93164
1514,USC00114355,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094


In [30]:
# Run the rolling Lasso evaluation for every valid station; each call returns a 4-tuple
station_results = valid_stations.apply(
    lambda row: lasso_mse(row['LATITUDE'], row['LONGITUDE'], row['STATION']),
    axis=1,
)

# Spread the four tuple elements over four named columns of valid_stations
result_columns = ['overall_mse_train','overall_mse_test', 'Pred_2024', 'best_year_model']
valid_stations[result_columns] = pd.DataFrame(station_results.tolist(), index=valid_stations.index)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

In [31]:
# Remove the row limit so pandas displays every row instead of truncating
pd.set_option('display.max_rows', None)

In [32]:
valid_stations

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,overall_mse_train,overall_mse_test,Pred_2024,best_year_model
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,1.786287,2.916642,"[2.2772, 2.4038, 2.7387, 3.2746, 3.842, 4.248,...",2019
168,USW00014848,"SOUTH BEND AIRPORT, IN US",41.70722,-86.31628,3.476347,3.217687,"[2.7212, 2.8951, 3.3304, 3.5049, 3.9736, 4.257...",2021
336,USC00120784,"BLOOMINGTON INDIANA UNIVERSITY, IN US",39.17399,-86.52076,3.706983,4.732532,"[4.5468, 3.7993, 4.9849, 4.8165, 4.2581, 4.703...",2021
503,USW00014827,"FORT WAYNE INTERNATIONAL AIRPORT, IN US",40.97248,-85.20636,2.154908,2.470251,"[2.623, 2.7478, 3.1722, 3.4459, 4.0376, 4.2598...",2021
671,USW00093819,"INDIANAPOLIS INTERNATIONAL AIRPORT, IN US",39.72515,-86.2816,3.248176,3.548488,"[3.7479, 3.5343, 4.0997, 4.2245, 4.3929, 4.579...",2021
839,USW00053866,"SHELBYVILLE MUNICIPAL AIRPORT, IN US",39.58545,-85.79982,1.982944,3.916168,"[5.5498, 3.5935, 4.6305, 3.5995, 3.5644, 5.402...",2022
1007,USW00093810,"CARBONDALE SOUTHERN ILLINOIS AIRPORT, IL US",37.78329,-89.24533,3.739967,4.304,"[3.5162, 3.3926, 3.8378, 3.9955, 4.1959, 4.863...",2019
1176,USW00094892,"CHICAGO WEST CHICAGO DUPAGE AIRPORT, IL US",41.89641,-88.25119,2.632566,3.863737,"[1.6651, 2.5883, 3.2159, 3.6096, 4.727, 3.7291...",2021
1345,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.96017,-87.93164,3.412285,3.931411,"[2.7187, 2.41, 3.4443, 3.8959, 4.5658, 4.9497,...",2021
1514,USC00114355,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,3.067713,1.935305,"[1.6867, 1.4748, 2.603, 2.925, 3.7847, 4.2598,...",2021


In [33]:
print(valid_stations['Pred_2024'])

0       [2.2772, 2.4038, 2.7387, 3.2746, 3.842, 4.248,...
168     [2.7212, 2.8951, 3.3304, 3.5049, 3.9736, 4.257...
336     [4.5468, 3.7993, 4.9849, 4.8165, 4.2581, 4.703...
503     [2.623, 2.7478, 3.1722, 3.4459, 4.0376, 4.2598...
671     [3.7479, 3.5343, 4.0997, 4.2245, 4.3929, 4.579...
839     [5.5498, 3.5935, 4.6305, 3.5995, 3.5644, 5.402...
1007    [3.5162, 3.3926, 3.8378, 3.9955, 4.1959, 4.863...
1176    [1.6651, 2.5883, 3.2159, 3.6096, 4.727, 3.7291...
1345    [2.7187, 2.41, 3.4443, 3.8959, 4.5658, 4.9497,...
1514    [1.6867, 1.4748, 2.603, 2.925, 3.7847, 4.2598,...
2405    [1.9988, 2.0666, 1.6991, 1.0653, 0.7855, 0.098...
2574    [2.982, 2.6975, 2.3494, 1.972, 1.3836, 0.5592,...
2743    [1.5309, 1.6294, 1.3115, 0.9014, 0.806, 0.2303...
2912    [3.6826, 3.3463, 2.7914, 1.901, 0.9337, 0.0892...
3081    [1.7559, 1.5876, 1.2918, 1.0185, 0.742, 0.1792...
3250    [0.9079, 0.9055, 1.6182, 1.8744, 2.7474, 3.287...
3419    [1.4258, 1.2574, 2.4217, 2.7309, 3.4947, 4.217...
3586    [1.264

In [34]:
valid_stations['overall_mse_train'].mean()

4.702646421402193

In [35]:
valid_stations['overall_mse_test'].mean()

5.7429272177287745

In [36]:
six_nearest_weather_stations(47.78000,-121.70000) #ALPINE MEADOWS, WA US



Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,DISTANCE(°)
4652,USS0021B48S,"ALPINE MEADOWS, WA US",47.78,-121.7,0.0
4570,USW00004205,"ARLINGTON MUNICIPAL AIRPORT, WA US",48.16056,-122.15889,0.596159
4822,USS0021C28S,"WHITE PASS E.S., WA US",46.64,-121.38,1.184061
4230,USS0021A01S,"BEAVER PASS, WA US",48.88,-121.26,1.184736
4400,USC00459376,"WINTHROP 1 WSW, WA US",48.47429,-120.18874,1.663113
2912,USW00093225,"SACRAMENTO METROPOLITAN AIRPORT, CA US",38.70069,-121.59479,9.07992


In [37]:
six_nearest_weather_stations(48.88000,-121.26000) #BEAVER PASS, WA US



Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,DISTANCE(°)
4230,USS0021A01S,"BEAVER PASS, WA US",48.88,-121.26,0.0
4400,USC00459376,"WINTHROP 1 WSW, WA US",48.47429,-120.18874,1.145512
4570,USW00004205,"ARLINGTON MUNICIPAL AIRPORT, WA US",48.16056,-122.15889,1.151346
4652,USS0021B48S,"ALPINE MEADOWS, WA US",47.78,-121.7,1.184736
4822,USS0021C28S,"WHITE PASS E.S., WA US",46.64,-121.38,2.243212
2912,USW00093225,"SACRAMENTO METROPOLITAN AIRPORT, CA US",38.70069,-121.59479,10.184814
