In [1]:
import pandas as pd

In [3]:
# Location of the combined weather CSV; change this path to match your machine
csv_path = '/Users/yunlei/Desktop/MGMT 478/Combined dataset_nonsort.csv'
data = pd.read_csv(csv_path)

In [4]:
# Count the missing values in every column of the raw data
data.isna().sum()

STATION         0
NAME            0
LATITUDE        0
LONGITUDE       0
ELEVATION       0
DATE            0
AWND          700
PRCP            8
SNOW         1713
TAVG           27
TMAX           10
TMIN           25
dtype: int64

In [5]:
## data preprocessing
# Parse DATE into real datetimes so its calendar parts can be read off
data['DATE'] = pd.to_datetime(data['DATE'])
# Expose the year and the month of each record as separate feature columns
date_parts = data['DATE'].dt
data['YEAR'] = date_parts.year
data['MONTH'] = date_parts.month
# Coerce the location and weather columns to numbers; values that cannot be
# parsed become NaN
numeric_columns = ['LATITUDE', 'LONGITUDE', 'ELEVATION', 'AWND', 'TAVG', 'TMAX', 'TMIN']
numeric_values = {
    name: pd.to_numeric(data[name], errors='coerce')
    for name in numeric_columns
}
# Work on a copy without the SNOW column, with the numeric versions swapped in
data_cleaned = data.drop(columns='SNOW').assign(**numeric_values)

In [6]:
# Impute missing wind and temperature values with the median of each column
from sklearn.impute import SimpleImputer
impute_columns = ['AWND', 'TAVG', 'TMAX', 'TMIN']
imputer = SimpleImputer(strategy='median')
# Learn the per-column medians, then write the filled values back in place
imputer.fit(data_cleaned[impute_columns])
data_cleaned[impute_columns] = imputer.transform(data_cleaned[impute_columns])

In [7]:
# Preview the first five rows of the cleaned data
data_cleaned.head(n=5)

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,TAVG,TMAX,TMIN,YEAR,MONTH
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-01-01,8.5,1.01,23.5,29.5,17.5,2010,1
1,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-02-01,7.6,0.61,26.0,32.8,19.1,2010,2
2,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-03-01,7.2,3.22,44.8,55.1,34.6,2010,3
3,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-04-01,8.1,2.49,58.1,70.4,45.8,2010,4
4,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-05-01,6.7,5.55,64.7,75.2,54.2,2010,5


In [8]:
# Re-count the missing values per column on the cleaned data
data_cleaned.isna().sum()

STATION      0
NAME         0
LATITUDE     0
LONGITUDE    0
ELEVATION    0
DATE         0
AWND         0
PRCP         8
TAVG         0
TMAX         0
TMIN         0
YEAR         0
MONTH        0
dtype: int64

<font color='red'>Note: I did some basic data cleaning: missing values in AWND, TAVG, TMAX and TMIN are imputed with the median of each column.</font>

In [9]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

# One row per distinct (STATION, NAME, LATITUDE, LONGITUDE) combination
stations = data_cleaned.loc[:, ['STATION', 'NAME', 'LATITUDE', 'LONGITUDE']].drop_duplicates()
# Only the coordinates are used to measure how close two stations are
coordinates = stations.loc[:, ['LATITUDE', 'LONGITUDE']]
# Fit a 6-nearest-neighbour lookup on the station coordinates
neighbors_model = NearestNeighbors(n_neighbors=6).fit(coordinates)

def six_nearest_weather_stations(latitude, longitude):
    """Look up the stations closest to a point.

    Queries the module-level ``neighbors_model`` (fitted with
    ``n_neighbors=6`` on ``coordinates``) and returns the matching rows of
    ``stations``.

    Parameters
    ----------
    latitude, longitude : float
        Query point, in the same units as the LATITUDE / LONGITUDE columns.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching ``stations`` rows (STATION, NAME, LATITUDE,
        LONGITUDE) in the order returned by ``kneighbors``, plus a
        ``DISTANCE(°)`` column holding the distance reported by the model.
        NOTE(review): with the default NearestNeighbors metric this is a
        straight-line distance in coordinate degrees, not a ground distance.
    """
    # The model was fitted on a DataFrame, so query it with the same column
    # labels; a bare ndarray makes scikit-learn warn about missing feature
    # names on every call.
    query_coordinates = pd.DataFrame(
        [[latitude, longitude]], columns=coordinates.columns
    )
    distances, indices = neighbors_model.kneighbors(query_coordinates)
    # kneighbors returns one row per query point; there is a single query here.
    # Copy so that adding the distance column never touches ``stations``.
    nearest_stations_info = stations.iloc[indices[0]].copy()
    nearest_stations_info['DISTANCE(°)'] = distances[0]

    return nearest_stations_info

In [10]:
def average_values_for_nearest_stations_exclude(latitude, longitude, station_to_exclude):
    """Average the weather variables of the stations nearest to a point.

    Parameters
    ----------
    latitude, longitude : float
        Query point passed to ``six_nearest_weather_stations``.
    station_to_exclude : str
        STATION id that must not contribute to the averages (typically the
        station being modelled).

    Returns
    -------
    pandas.DataFrame
        One row per (YEAR, MONTH) with the columns ``AWND_avg``, ``PRCP_avg``,
        ``TAVG_avg``, ``TMAX_avg`` and ``TMIN_avg``: the mean of each variable
        over the remaining neighbouring stations, taken from ``data_cleaned``.
    """
    nearest_stations_info = six_nearest_weather_stations(latitude, longitude)

    # Filter instead of list.remove(): remove() drops only the first match, so
    # a station that appears more than once among the neighbours (possible
    # when its NAME or coordinates vary between rows) would still leak its
    # own data into the "neighbour" average.
    neighbor_station_ids = {
        station_id
        for station_id in nearest_stations_info['STATION']
        if station_id != station_to_exclude
    }

    filtered_data = data_cleaned[data_cleaned['STATION'].isin(neighbor_station_ids)]
    value_columns = ['AWND', 'PRCP', 'TAVG', 'TMAX', 'TMIN']
    average_values = (
        filtered_data
        .groupby(['YEAR', 'MONTH'])[value_columns]
        .mean()
        .add_suffix('_avg')   # AWND -> AWND_avg, PRCP -> PRCP_avg, ...
        .reset_index()
    )
    return average_values

In [12]:
def _add_yearly_lags(frame, columns, lag_years=range(1, 6)):
    """Return ``frame`` with a ``<col>_lag_<k>_year`` column (shifted 12*k rows) per column and lag."""
    lagged = {
        f'{var}_lag_{year}_year': frame[var].shift(year * 12)
        for var in columns
        for year in lag_years
    }
    # assign() returns a new frame, so the caller's data is never modified
    return frame.assign(**lagged)


def get_analysis_data(latitude, longitude, weather_station):
    """Build the lagged feature table for one station and its neighbours.

    Steps:
      1. take the rows of ``data_cleaned`` for ``weather_station`` in date
         order and add 1- to 5-year lags of AWND, PRCP, TAVG, TMAX and TMIN;
      2. inner-join the neighbour averages from
         ``average_values_for_nearest_stations_exclude`` on (YEAR, MONTH) and
         add 1- to 5-year lags of those averages;
      3. drop the un-lagged columns except PRCP, then drop every row that
         still contains a missing value.

    A lag of k years is a shift of 12*k rows.
    NOTE(review): this equals k calendar years only if the station has
    exactly one row per month with no gaps — TODO confirm for every station.

    Parameters
    ----------
    latitude, longitude : float
        Location used to look up the neighbouring stations.
    weather_station : str
        STATION id of the station to build features for; it is excluded from
        the neighbour averages.

    Returns
    -------
    pandas.DataFrame
        Rows with a complete set of lag features. Rows lacking a 5-year
        history (NaN after the 60-row shift) are removed by the final dropna.
    """
    # shift() lags by row position, so the rows must be in date order before
    # any lag is computed; the input file is not guaranteed to be sorted.
    # sort_values() also returns a new frame, which avoids writing new columns
    # onto a slice of data_cleaned (SettingWithCopyWarning).
    weather_station_data = (
        data_cleaned[data_cleaned['STATION'] == weather_station]
        .sort_values('DATE', kind='mergesort')
    )
    weather_station_data = _add_yearly_lags(
        weather_station_data, ['AWND', 'PRCP', 'TAVG', 'TMAX', 'TMIN']
    )

    neighbor_data = average_values_for_nearest_stations_exclude(latitude, longitude, weather_station)
    # Re-sort after the join so the positional lags of the neighbour averages
    # do not depend on the row order the merge happens to produce.
    merged_data = (
        pd.merge(weather_station_data, neighbor_data, on=['YEAR', 'MONTH'], how='inner')
        .sort_values('DATE', kind='mergesort')
        .reset_index(drop=True)
    )
    merged_data = _add_yearly_lags(
        merged_data, ['AWND_avg', 'PRCP_avg', 'TAVG_avg', 'TMAX_avg', 'TMIN_avg']
    )

    # PRCP is deliberately not in this list, so it is the only un-lagged
    # weather variable left in the result.
    merged_data_final = merged_data.drop(columns=['AWND', 'TAVG', 'TMAX', 'TMIN', 'AWND_avg', 'PRCP_avg', 'TAVG_avg', 'TMAX_avg', 'TMIN_avg'])
    merged_data_final = merged_data_final.dropna()
    return merged_data_final

In [None]:
def model_analysis(latitude, longitude, weather_station, end_year=2022, lookback_years=4):
    '''
    Walk a rolling train/test window over the feature table of one station.

    Takes a specific weather station and its latitude and longitude as input.
    This part of the code is incomplete: the windows are built, but no model
    is fitted or evaluated yet, and nothing is returned.

    Parameters
    ----------
    latitude, longitude : float
        Location of the station, forwarded to ``get_analysis_data``.
    weather_station : str
        STATION id, forwarded to ``get_analysis_data``.
    end_year : int, default 2022
        Last loop year; its test set is the data of ``end_year + 1``.
    lookback_years : int, default 4
        Number of years before the loop year that are included in the
        training set (the loop year itself is included as well).
    '''

    merged_data_final = get_analysis_data(latitude, longitude, weather_station)

    # Without any feature rows there is no window to build
    if merged_data_final.empty:
        return

    # Start from the earliest year in the data. min() is used instead of the
    # first unique value because unique() follows row order, which is the
    # earliest year only when the rows happen to be sorted by date.
    # Example: data starting in 2015 gives a first loop year of 2019.
    first_year = int(merged_data_final['YEAR'].min())
    start_year = first_year + lookback_years

    # Loop through each time window
    for year in range(start_year, end_year + 1):
        # Training set: the loop year and the lookback_years before it
        # (between() is inclusive on both ends). Test set: the following year.
        train_df = merged_data_final[merged_data_final['YEAR'].between(year - lookback_years, year)]
        test_df = merged_data_final[merged_data_final['YEAR'] == year + 1]

        # Remove rows with missing values
        train_df = train_df.dropna()
        test_df = test_df.dropna()
        
###following is the code for model analysis

### How the data is split into training and test sets
The training set (train_df) contains the data for the current loop year and the four years before it, i.e. five years in total. For example, if the current loop year is 2019, the training set covers 2015 through 2019 (both 2015 and 2019 included).

The test set (test_df), on the other hand, contains data from the year immediately following the current loop year. Continuing with the example above, if the current loop year is 2019, then the test set will contain data from 2020.