## Data cleaning

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('/Users/yunlei/Desktop/MGMT 478/Combined dataset_nonsort.csv')

In [3]:
data.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,SNOW,TAVG,TMAX,TMIN
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-01,8.5,1.01,,23.5,29.5,17.5
1,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-02,7.6,0.61,,26.0,32.8,19.1
2,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-03,7.2,3.22,,44.8,55.1,34.6
3,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-04,8.1,2.49,,58.1,70.4,45.8
4,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-05,6.7,5.55,,64.7,75.2,54.2


In [4]:
from sklearn.impute import SimpleImputer

In [5]:
# Convert DATE column to datetime format
data['DATE'] = pd.to_datetime(data['DATE'])

In [6]:
# Extract year and month from DATE as new features
data['YEAR'] = data['DATE'].dt.year
data['MONTH'] = data['DATE'].dt.month

In [7]:
# Drop the 'SNOW' column
data_cleaned = data.drop(['SNOW'], axis=1)

In [8]:
# Convert non-numeric to numeric
for column in ['LATITUDE','LONGITUDE','ELEVATION','AWND', 'TAVG', 'TMAX', 'TMIN']:
    data_cleaned[column] = pd.to_numeric(data_cleaned[column], errors='coerce')

In [9]:
# Imputer missing data as median of the column
imputer = SimpleImputer(strategy='median')
data_cleaned[['AWND', 'TAVG', 'TMAX', 'TMIN']] = imputer.fit_transform(data_cleaned[['AWND', 'TAVG', 'TMAX', 'TMIN']])

In [10]:
data_cleaned.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,TAVG,TMAX,TMIN,YEAR,MONTH
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-01-01,8.5,1.01,23.5,29.5,17.5,2010,1
1,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-02-01,7.6,0.61,26.0,32.8,19.1,2010,2
2,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-03-01,7.2,3.22,44.8,55.1,34.6,2010,3
3,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-04-01,8.1,2.49,58.1,70.4,45.8,2010,4
4,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-05-01,6.7,5.55,64.7,75.2,54.2,2010,5


In [11]:
data_cleaned.isnull().sum()

STATION      0
NAME         0
LATITUDE     0
LONGITUDE    0
ELEVATION    0
DATE         0
AWND         0
PRCP         8
TAVG         0
TMAX         0
TMIN         0
YEAR         0
MONTH        0
dtype: int64

## Using KNN for finding the nearest stations

In [12]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    '''
    Define the Haversack formula function to calculate distance by latitude and longitude
    '''

    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Mean radius of the Earth in kilometers
    return c * r

In [13]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

stations = data_cleaned[['STATION', 'LATITUDE', 'LONGITUDE']].drop_duplicates()
coordinates = stations[['LATITUDE', 'LONGITUDE']]
neighbors_model = NearestNeighbors(n_neighbors=6)
neighbors_model.fit(coordinates)

def six_nearest_weather_stations(latitude, longitude):
    query_coordinates = np.array([[latitude, longitude]])
    distances, indices = neighbors_model.kneighbors(query_coordinates)
    nearest_stations_info = stations.iloc[indices[0]].copy() 
    nearest_stations_info['DISTANCE(°)'] = distances[0]
    
    # Calculation of actual distance (km)
    nearest_stations_info['DISTANCE(KM)'] = nearest_stations_info.apply(
        lambda row: haversine(longitude, latitude, row['LONGITUDE'], row['LATITUDE']), 
        axis=1
    )

    return nearest_stations_info

In [14]:
# Simple average
def average_values_for_nearest_stations(latitude, longitude):
    nearest_stations_info = six_nearest_weather_stations(latitude, longitude)
    nearest_station_ids = nearest_stations_info['STATION'].tolist()
    filtered_data = data_cleaned[data_cleaned['STATION'].isin(nearest_station_ids)]
    average_values = filtered_data.groupby(['YEAR', 'MONTH'])[['AWND', 'PRCP', 'TAVG', 'TMAX', 'TMIN']].mean().reset_index()
    return average_values

In [15]:
average_values_for_nearest_stations(40,-86)



Unnamed: 0,YEAR,MONTH,AWND,PRCP,TAVG,TMAX,TMIN
0,2010,1,9.316667,1.213333,24.000000,30.216667,17.783333
1,2010,2,8.500000,1.191667,25.700000,32.316667,19.083333
2,2010,3,7.866667,2.888333,44.133333,54.483333,33.833333
3,2010,4,9.216667,3.263333,57.683333,69.633333,45.750000
4,2010,5,8.116667,5.873333,64.466667,74.383333,54.550000
...,...,...,...,...,...,...,...
163,2023,8,6.783333,3.235000,72.400000,82.266667,62.516667
164,2023,9,6.100000,1.390000,67.950000,79.350000,56.550000
165,2023,10,7.983333,3.301667,56.416667,66.266667,46.583333
166,2023,11,8.466667,0.793333,43.583333,54.400000,32.766667


## Using Purdue Airport to predict

### Expand the dataset

In [16]:
purdue_data = data_cleaned[data_cleaned['STATION']=='USW00014835']

In [17]:
purdue_data

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,TAVG,TMAX,TMIN,YEAR,MONTH
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-01-01,8.5,1.01,23.5,29.5,17.5,2010,1
1,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-02-01,7.6,0.61,26.0,32.8,19.1,2010,2
2,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-03-01,7.2,3.22,44.8,55.1,34.6,2010,3
3,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-04-01,8.1,2.49,58.1,70.4,45.8,2010,4
4,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-05-01,6.7,5.55,64.7,75.2,54.2,2010,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2023-08-01,5.1,3.54,71.9,81.8,61.9,2023,8
164,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2023-09-01,4.9,0.59,67.5,79.8,55.1,2023,9
165,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2023-10-01,6.3,3.21,55.7,66.3,45.2,2023,10
166,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2023-11-01,6.5,0.48,42.6,53.8,31.5,2023,11


In [18]:
purdue_data['DATE'].unique() # each year recorded

<DatetimeArray>
['2010-01-01 00:00:00', '2010-02-01 00:00:00', '2010-03-01 00:00:00',
 '2010-04-01 00:00:00', '2010-05-01 00:00:00', '2010-06-01 00:00:00',
 '2010-07-01 00:00:00', '2010-08-01 00:00:00', '2010-09-01 00:00:00',
 '2010-10-01 00:00:00',
 ...
 '2023-03-01 00:00:00', '2023-04-01 00:00:00', '2023-05-01 00:00:00',
 '2023-06-01 00:00:00', '2023-07-01 00:00:00', '2023-08-01 00:00:00',
 '2023-09-01 00:00:00', '2023-10-01 00:00:00', '2023-11-01 00:00:00',
 '2023-12-01 00:00:00']
Length: 168, dtype: datetime64[ns]

In [19]:
for var in ['AWND', 'PRCP', 'TAVG', 'TMAX', 'TMIN']:
    for year in range(1, 14):
        year_lag = year*12
        purdue_data[f'{var}_lag_{year}_year'] = purdue_data[var].shift(year_lag)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  purdue_data[f'{var}_lag_{year}_year'] = purdue_data[var].shift(year_lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  purdue_data[f'{var}_lag_{year}_year'] = purdue_data[var].shift(year_lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  purdue_data[f'{var}_lag_{year}_year'] = purdue_data[var].sh

In [20]:
purdue_data.columns

Index(['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'AWND',
       'PRCP', 'TAVG', 'TMAX', 'TMIN', 'YEAR', 'MONTH', 'AWND_lag_1_year',
       'AWND_lag_2_year', 'AWND_lag_3_year', 'AWND_lag_4_year',
       'AWND_lag_5_year', 'AWND_lag_6_year', 'AWND_lag_7_year',
       'AWND_lag_8_year', 'AWND_lag_9_year', 'AWND_lag_10_year',
       'AWND_lag_11_year', 'AWND_lag_12_year', 'AWND_lag_13_year',
       'PRCP_lag_1_year', 'PRCP_lag_2_year', 'PRCP_lag_3_year',
       'PRCP_lag_4_year', 'PRCP_lag_5_year', 'PRCP_lag_6_year',
       'PRCP_lag_7_year', 'PRCP_lag_8_year', 'PRCP_lag_9_year',
       'PRCP_lag_10_year', 'PRCP_lag_11_year', 'PRCP_lag_12_year',
       'PRCP_lag_13_year', 'TAVG_lag_1_year', 'TAVG_lag_2_year',
       'TAVG_lag_3_year', 'TAVG_lag_4_year', 'TAVG_lag_5_year',
       'TAVG_lag_6_year', 'TAVG_lag_7_year', 'TAVG_lag_8_year',
       'TAVG_lag_9_year', 'TAVG_lag_10_year', 'TAVG_lag_11_year',
       'TAVG_lag_12_year', 'TAVG_lag_13_year', 'TMAX_lag_1_year',
 

In [21]:
six_nearest_weather_stations(40.41236,-86.94739)



Unnamed: 0,STATION,LATITUDE,LONGITUDE,DISTANCE(°),DISTANCE(KM)
0,USW00014835,40.41236,-86.94739,0.0,0.0
671,USW00093819,39.72515,-86.2816,0.956835,95.125159
336,USC00120784,39.17399,-86.52076,1.309799,142.442545
839,USW00053866,39.58545,-85.79982,1.41446,134.198553
168,USW00014848,41.70722,-86.31628,1.440473,153.395988
503,USW00014827,40.97248,-85.20636,1.828912,159.449673


In [22]:
def average_values_for_nearest_stations_exclude(latitude, longitude, station_to_exclude):
    nearest_stations_info = six_nearest_weather_stations(latitude, longitude)
    nearest_station_ids = nearest_stations_info['STATION'].tolist()
    
    # Remove the specific station ID from the list
    if station_to_exclude in nearest_station_ids:
        nearest_station_ids.remove(station_to_exclude)
    
    filtered_data = data_cleaned[data_cleaned['STATION'].isin(nearest_station_ids)]
    average_values = filtered_data.groupby(['YEAR', 'MONTH'])[['AWND', 'PRCP', 'TAVG', 'TMAX', 'TMIN']].mean().reset_index()
    average_values.rename(columns={
        'AWND': 'AWND_avg',
        'PRCP': 'PRCP_avg',
        'TAVG': 'TAVG_avg',
        'TMAX': 'TMAX_avg',
        'TMIN': 'TMIN_avg'
    }, inplace=True)
    return average_values

In [23]:
average_values_for_nearest_stations_exclude(40.41236,-86.94739,'USW00014835')



Unnamed: 0,YEAR,MONTH,AWND_avg,PRCP_avg,TAVG_avg,TMAX_avg,TMIN_avg
0,2010,1,9.48,1.2540,24.100,30.36,17.84
1,2010,2,8.68,1.3080,25.640,32.22,19.08
2,2010,3,8.00,2.8220,44.000,54.36,33.68
3,2010,4,9.44,3.4180,57.600,69.48,45.74
4,2010,5,8.40,5.9380,64.420,74.22,54.62
...,...,...,...,...,...,...,...
163,2023,8,7.12,3.1740,72.500,82.36,62.64
164,2023,9,6.34,1.5500,68.040,79.26,56.84
165,2023,10,8.32,3.3200,56.560,66.26,46.86
166,2023,11,8.86,0.8560,43.780,54.52,33.02


In [24]:
neighbor_data = average_values_for_nearest_stations_exclude(40.41236,-86.94739,'USW00014835')



In [25]:
merged_data = pd.merge(purdue_data, neighbor_data, on=['YEAR', 'MONTH'], how='inner')

In [26]:
merged_data

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,TAVG,TMAX,...,TMIN_lag_9_year,TMIN_lag_10_year,TMIN_lag_11_year,TMIN_lag_12_year,TMIN_lag_13_year,AWND_avg,PRCP_avg,TAVG_avg,TMAX_avg,TMIN_avg
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-01-01,8.5,1.01,23.5,29.5,...,,,,,,9.48,1.2540,24.100,30.36,17.84
1,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-02-01,7.6,0.61,26.0,32.8,...,,,,,,8.68,1.3080,25.640,32.22,19.08
2,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-03-01,7.2,3.22,44.8,55.1,...,,,,,,8.00,2.8220,44.000,54.36,33.68
3,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-04-01,8.1,2.49,58.1,70.4,...,,,,,,9.44,3.4180,57.600,69.48,45.74
4,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-05-01,6.7,5.55,64.7,75.2,...,,,,,,8.40,5.9380,64.420,74.22,54.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2023-08-01,5.1,3.54,71.9,81.8,...,62.4,60.4,58.9,63.1,66.0,7.12,3.1740,72.500,82.36,62.64
164,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2023-09-01,4.9,0.59,67.5,79.8,...,51.2,54.8,51.8,55.1,55.0,6.34,1.5500,68.040,79.26,56.84
165,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2023-10-01,6.3,3.21,55.7,66.3,...,42.8,43.1,41.4,42.5,42.4,8.32,3.3200,56.560,66.26,46.86
166,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2023-11-01,6.5,0.48,42.6,53.8,...,26.6,28.6,29.3,39.6,32.0,8.86,0.8560,43.780,54.52,33.02


In [27]:
for var in ['AWND_avg', 'PRCP_avg', 'TAVG_avg', 'TMAX_avg', 'TMIN_avg']:
    for year in range(1, 14):
        year_lag = year*12
        merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)

  merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)
  merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)
  merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)
  merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)
  merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)
  merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)
  merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)
  merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)
  merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)
  merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)
  merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)
  merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)
  merged_data[f'{var}_lag_{year}_year'] = merged_data[var].shift(year_lag)
  merged_data[f'{var}_lag

In [28]:
merged_data

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,TAVG,TMAX,...,TMIN_avg_lag_4_year,TMIN_avg_lag_5_year,TMIN_avg_lag_6_year,TMIN_avg_lag_7_year,TMIN_avg_lag_8_year,TMIN_avg_lag_9_year,TMIN_avg_lag_10_year,TMIN_avg_lag_11_year,TMIN_avg_lag_12_year,TMIN_avg_lag_13_year
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-01-01,8.5,1.01,23.5,29.5,...,,,,,,,,,,
1,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-02-01,7.6,0.61,26.0,32.8,...,,,,,,,,,,
2,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-03-01,7.2,3.22,44.8,55.1,...,,,,,,,,,,
3,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-04-01,8.1,2.49,58.1,70.4,...,,,,,,,,,,
4,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-05-01,6.7,5.55,64.7,75.2,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2023-08-01,5.1,3.54,71.9,81.8,...,59.32,65.32,59.68,67.42,61.32,63.48,61.80,60.62,62.72,65.44
164,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2023-09-01,4.9,0.59,67.5,79.8,...,57.66,60.38,56.16,59.74,57.98,53.20,55.70,54.10,54.76,55.32
165,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2023-10-01,6.3,3.21,55.7,66.3,...,45.36,44.94,48.16,49.32,45.26,44.26,44.10,42.80,43.04,43.18
166,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2023-11-01,6.5,0.48,42.6,53.8,...,28.66,30.50,33.64,37.38,37.18,27.84,31.02,30.34,38.94,31.78


In [29]:
merged_data_final = merged_data.drop(columns=['AWND', 'TAVG', 'TMAX', 'TMIN', 'AWND_avg', 'PRCP_avg', 'TAVG_avg', 'TMAX_avg', 'TMIN_avg'])

### Lasso: top 5 feature selection

In [30]:
years = merged_data_final['YEAR'].unique()

In [31]:
years

array([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020,
       2021, 2022, 2023], dtype=int32)

In [32]:
features_df = merged_data_final.drop(columns=['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'PRCP'])

In [33]:
features_df

Unnamed: 0,YEAR,MONTH,AWND_lag_1_year,AWND_lag_2_year,AWND_lag_3_year,AWND_lag_4_year,AWND_lag_5_year,AWND_lag_6_year,AWND_lag_7_year,AWND_lag_8_year,...,TMIN_avg_lag_4_year,TMIN_avg_lag_5_year,TMIN_avg_lag_6_year,TMIN_avg_lag_7_year,TMIN_avg_lag_8_year,TMIN_avg_lag_9_year,TMIN_avg_lag_10_year,TMIN_avg_lag_11_year,TMIN_avg_lag_12_year,TMIN_avg_lag_13_year
0,2010,1,,,,,,,,,...,,,,,,,,,,
1,2010,2,,,,,,,,,...,,,,,,,,,,
2,2010,3,,,,,,,,,...,,,,,,,,,,
3,2010,4,,,,,,,,,...,,,,,,,,,,
4,2010,5,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,2023,8,4.9,4.9,4.7,5.1,4.9,4.5,4.7,4.9,...,59.32,65.32,59.68,67.42,61.32,63.48,61.80,60.62,62.72,65.44
164,2023,9,5.1,6.0,5.8,4.9,5.8,4.9,5.4,5.4,...,57.66,60.38,56.16,59.74,57.98,53.20,55.70,54.10,54.76,55.32
165,2023,10,6.9,5.8,7.2,6.9,6.9,6.7,6.5,8.3,...,45.36,44.94,48.16,49.32,45.26,44.26,44.10,42.80,43.04,43.18
166,2023,11,7.6,6.7,7.8,6.9,7.8,7.4,6.3,8.1,...,28.66,30.50,33.64,37.38,37.18,27.84,31.02,30.34,38.94,31.78


In [34]:
train_df = merged_data_final[merged_data_final['YEAR'] == 2011]
test_df = merged_data_final[merged_data_final['YEAR'] == 2011 + 1]

In [35]:
pd.set_option('display.max_rows', None)


In [36]:
train_df.isnull().sum()

STATION                  0
NAME                     0
LATITUDE                 0
LONGITUDE                0
ELEVATION                0
DATE                     0
PRCP                     0
YEAR                     0
MONTH                    0
AWND_lag_1_year          0
AWND_lag_2_year         12
AWND_lag_3_year         12
AWND_lag_4_year         12
AWND_lag_5_year         12
AWND_lag_6_year         12
AWND_lag_7_year         12
AWND_lag_8_year         12
AWND_lag_9_year         12
AWND_lag_10_year        12
AWND_lag_11_year        12
AWND_lag_12_year        12
AWND_lag_13_year        12
PRCP_lag_1_year          0
PRCP_lag_2_year         12
PRCP_lag_3_year         12
PRCP_lag_4_year         12
PRCP_lag_5_year         12
PRCP_lag_6_year         12
PRCP_lag_7_year         12
PRCP_lag_8_year         12
PRCP_lag_9_year         12
PRCP_lag_10_year        12
PRCP_lag_11_year        12
PRCP_lag_12_year        12
PRCP_lag_13_year        12
TAVG_lag_1_year          0
TAVG_lag_2_year         12
T

In [37]:
train_df = train_df.dropna(axis=1, how='all')

In [38]:
train_df

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,PRCP,YEAR,MONTH,AWND_lag_1_year,PRCP_lag_1_year,TAVG_lag_1_year,TMAX_lag_1_year,TMIN_lag_1_year,AWND_avg_lag_1_year,PRCP_avg_lag_1_year,TAVG_avg_lag_1_year,TMAX_avg_lag_1_year,TMIN_avg_lag_1_year
12,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2011-01-01,0.73,2011,1,8.5,1.01,23.5,29.5,17.5,9.48,1.254,24.1,30.36,17.84
13,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2011-02-01,2.66,2011,2,7.6,0.61,26.0,32.8,19.1,8.68,1.308,25.64,32.22,19.08
14,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2011-03-01,2.15,2011,3,7.2,3.22,44.8,55.1,34.6,8.0,2.822,44.0,54.36,33.68
15,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2011-04-01,7.89,2011,4,8.1,2.49,58.1,70.4,45.8,9.44,3.418,57.6,69.48,45.74
16,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2011-05-01,1.32,2011,5,6.7,5.55,64.7,75.2,54.2,8.4,5.938,64.42,74.22,54.62
17,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2011-06-01,6.51,2011,6,5.6,9.61,74.6,84.3,64.8,7.64,8.366,74.12,83.52,64.74
18,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2011-07-01,3.41,2011,7,4.3,4.04,76.8,86.8,66.9,6.8,2.906,77.18,87.1,67.3
19,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2011-08-01,4.97,2011,8,3.8,2.23,76.8,87.7,66.0,5.9,1.334,76.34,87.26,65.44
20,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2011-09-01,3.83,2011,9,6.0,1.81,67.5,80.0,55.0,8.18,1.024,67.56,79.8,55.32
21,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2011-10-01,2.49,2011,10,6.3,0.97,56.4,70.4,42.4,8.36,1.248,56.18,69.2,43.18


In [39]:
features_df.columns

Index(['YEAR', 'MONTH', 'AWND_lag_1_year', 'AWND_lag_2_year',
       'AWND_lag_3_year', 'AWND_lag_4_year', 'AWND_lag_5_year',
       'AWND_lag_6_year', 'AWND_lag_7_year', 'AWND_lag_8_year',
       ...
       'TMIN_avg_lag_4_year', 'TMIN_avg_lag_5_year', 'TMIN_avg_lag_6_year',
       'TMIN_avg_lag_7_year', 'TMIN_avg_lag_8_year', 'TMIN_avg_lag_9_year',
       'TMIN_avg_lag_10_year', 'TMIN_avg_lag_11_year', 'TMIN_avg_lag_12_year',
       'TMIN_avg_lag_13_year'],
      dtype='object', length=132)

In [43]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
import numpy as np

mse_values = {}
top_features_per_year = {} 

for year in years[1:-1]:  # Exclude the last year as it has no next year for testing
    # Splitting the dataset into current year (for training) and next year (for testing)
    train_df = merged_data_final[merged_data_final['YEAR'] == year]
    test_df = merged_data_final[merged_data_final['YEAR'] == year + 1]
    
    # Dropping rows with missing values in both train and test datasets
    train_df = train_df.dropna(axis=1, how='all')
    test_df = test_df.dropna(axis=1, how='all')
    
    features_df = train_df.drop(columns=['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'PRCP', 'YEAR'])
    
    # Extracting features and target for training and testing
    X_train = train_df[features_df.columns]
    y_train = train_df['PRCP']
    X_test = test_df[features_df.columns]
    y_test = test_df['PRCP']
    
    # Check if there's enough data to proceed
    if X_train.empty or X_test.empty:
        continue
    
    # Using LassoCV for feature selection and to fit the model
    lasso = LassoCV(cv=5).fit(X_train, y_train)
    
    # Predicting on the next year's data
    y_pred = lasso.predict(X_test)
    
    # Calculating MSE for the prediction
    mse = mean_squared_error(y_test, y_pred)
    
    # Storing the MSE value
    mse_values[year] = mse
    
    # Get the feature coefficients and their corresponding feature names
    feature_importance = np.abs(lasso.coef_)
    feature_names = X_train.columns
    features_coef = zip(feature_names, feature_importance)
    
    # Sort the features by the absolute value of their coefficient
    top_features = sorted(features_coef, key=lambda x: x[1], reverse=True)[:5]
    
    # Store the top features for the year
    top_features_per_year[year] = top_features

# Displaying the results
(top_features_per_year, mse_values)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

({2011: [('MONTH', 0.0),
   ('AWND_lag_1_year', 0.0),
   ('PRCP_lag_1_year', 0.0),
   ('TAVG_lag_1_year', 0.0),
   ('TMAX_lag_1_year', 0.0)],
  2012: [('TMAX_lag_2_year', 0.013655749030638698),
   ('MONTH', 0.0),
   ('AWND_lag_1_year', 0.0),
   ('AWND_lag_2_year', 0.0),
   ('PRCP_lag_1_year', 0.0)],
  2013: [('MONTH', 0.0),
   ('AWND_lag_1_year', 0.0),
   ('AWND_lag_2_year', 0.0),
   ('AWND_lag_3_year', 0.0),
   ('PRCP_lag_1_year', 0.0)],
  2014: [('TMAX_lag_1_year', 0.04181335401256917),
   ('TMAX_lag_4_year', 0.00942895929187475),
   ('MONTH', 0.0),
   ('AWND_lag_1_year', 0.0),
   ('AWND_lag_2_year', 0.0)],
  2015: [('PRCP_avg_lag_2_year', 0.7575897830589432),
   ('PRCP_lag_5_year', 0.3943092826465581),
   ('PRCP_lag_3_year', 0.30070920813789115),
   ('TAVG_lag_5_year', 0.18692508037282163),
   ('TMIN_avg_lag_3_year', 0.16913884236835708)],
  2016: [('TMAX_lag_4_year', 0.051297560155605534),
   ('MONTH', 0.0),
   ('AWND_lag_1_year', 0.0),
   ('AWND_lag_2_year', 0.0),
   ('AWND_lag_3_