# KNN model

## Loading data

Step before loading date: Using excel first combined all the data in one sheet, and the sorting by the "DATE".

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('/Users/yunlei/Desktop/MGMT 478/Combined dataset.csv')
data.head(20)

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,SNOW,TAVG,TMAX,TMIN
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-01,8.5,1.01,,23.5,29.5,17.5
1,USW00014848,"SOUTH BEND AIRPORT, IN US",41.70722,-86.31628,235.4,2010-01,9.4,1.22,27.4,23.0,29.0,16.9
2,USC00120784,"BLOOMINGTON INDIANA UNIVERSITY, IN US",39.17399,-86.52076,253.0,2010-01,,1.89,7.0,24.9,31.5,18.2
3,USW00014827,"FORT WAYNE INTERNATIONAL AIRPORT, IN US",40.97248,-85.20636,243.0,2010-01,10.7,0.64,7.7,23.0,28.9,17.2
4,USW00093819,"INDIANAPOLIS INTERNATIONAL AIRPORT, IN US",39.72515,-86.2816,241.3,2010-01,10.1,1.22,7.8,24.6,31.0,18.2
5,USW00053866,"SHELBYVILLE MUNICIPAL AIRPORT, IN US",39.58545,-85.79982,244.4,2010-01,9.4,1.3,,25.0,31.4,18.7
6,USW00093810,"CARBONDALE SOUTHERN ILLINOIS AIRPORT, IL US",37.78329,-89.24533,122.1,2010-01,7.6,1.33,,29.1,37.5,20.8
7,USW00094892,"CHICAGO WEST CHICAGO DUPAGE AIRPORT, IL US",41.89641,-88.25119,228.3,2010-01,9.4,0.69,,19.9,26.5,13.3
8,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.96017,-87.93164,204.8,2010-01,9.8,1.13,9.1,21.9,27.4,16.4
9,USC00114355,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,2010-01,,2.0,7.6,,23.9,


In [3]:
# Check for missing values
data.isnull().sum()

STATION         0
NAME            0
LATITUDE        0
LONGITUDE       0
ELEVATION       0
DATE            0
AWND          700
PRCP            8
SNOW         1713
TAVG           27
TMAX           10
TMIN           25
dtype: int64

## Cleaning data

In [4]:
from sklearn.impute import SimpleImputer

In [5]:
# Drop rows where 'PRCP' is missing
data_cleaned = data.dropna(subset=['PRCP'])

In [6]:
# Drop the 'SNOW' column
data_cleaned = data_cleaned.drop(['SNOW'], axis=1)

In [7]:
data_cleaned.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,TAVG,TMAX,TMIN
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-01,8.5,1.01,23.5,29.5,17.5
1,USW00014848,"SOUTH BEND AIRPORT, IN US",41.70722,-86.31628,235.4,2010-01,9.4,1.22,23.0,29.0,16.9
2,USC00120784,"BLOOMINGTON INDIANA UNIVERSITY, IN US",39.17399,-86.52076,253.0,2010-01,,1.89,24.9,31.5,18.2
3,USW00014827,"FORT WAYNE INTERNATIONAL AIRPORT, IN US",40.97248,-85.20636,243.0,2010-01,10.7,0.64,23.0,28.9,17.2
4,USW00093819,"INDIANAPOLIS INTERNATIONAL AIRPORT, IN US",39.72515,-86.2816,241.3,2010-01,10.1,1.22,24.6,31.0,18.2


In [8]:
# Convert non-numeric to numeric
for column in ['AWND', 'TAVG', 'TMAX', 'TMIN']:
    data_cleaned[column] = pd.to_numeric(data_cleaned[column], errors='coerce')

In [9]:
imputer = SimpleImputer(strategy='median')
data_cleaned[['AWND', 'TAVG', 'TMAX', 'TMIN']] = imputer.fit_transform(data_cleaned[['AWND', 'TAVG', 'TMAX', 'TMIN']])

In [10]:
data_cleaned.head(20)

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,TAVG,TMAX,TMIN
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-01,8.5,1.01,23.5,29.5,17.5
1,USW00014848,"SOUTH BEND AIRPORT, IN US",41.70722,-86.31628,235.4,2010-01,9.4,1.22,23.0,29.0,16.9
2,USC00120784,"BLOOMINGTON INDIANA UNIVERSITY, IN US",39.17399,-86.52076,253.0,2010-01,7.8,1.89,24.9,31.5,18.2
3,USW00014827,"FORT WAYNE INTERNATIONAL AIRPORT, IN US",40.97248,-85.20636,243.0,2010-01,10.7,0.64,23.0,28.9,17.2
4,USW00093819,"INDIANAPOLIS INTERNATIONAL AIRPORT, IN US",39.72515,-86.2816,241.3,2010-01,10.1,1.22,24.6,31.0,18.2
5,USW00053866,"SHELBYVILLE MUNICIPAL AIRPORT, IN US",39.58545,-85.79982,244.4,2010-01,9.4,1.3,25.0,31.4,18.7
6,USW00093810,"CARBONDALE SOUTHERN ILLINOIS AIRPORT, IL US",37.78329,-89.24533,122.1,2010-01,7.6,1.33,29.1,37.5,20.8
7,USW00094892,"CHICAGO WEST CHICAGO DUPAGE AIRPORT, IL US",41.89641,-88.25119,228.3,2010-01,9.4,0.69,19.9,26.5,13.3
8,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",41.96017,-87.93164,204.8,2010-01,9.8,1.13,21.9,27.4,16.4
9,USC00114355,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,2010-01,7.8,2.0,57.5,23.9,47.55


In [11]:
# Convert 'DATE' to datetime and extract year and month
data_cleaned['DATE'] = pd.to_datetime(data_cleaned['DATE'])
data_cleaned['YEAR'] = data_cleaned['DATE'].dt.year
data_cleaned['MONTH'] = data_cleaned['DATE'].dt.month

In [12]:
# Drop the original 'DATE', 'STATION', and 'NAME' columns as they are not needed for prediction
data_final = data_cleaned.drop(['DATE', 'STATION', 'NAME'], axis=1)

In [13]:
data_final.head()

Unnamed: 0,LATITUDE,LONGITUDE,ELEVATION,AWND,PRCP,TAVG,TMAX,TMIN,YEAR,MONTH
0,40.41236,-86.94739,181.7,8.5,1.01,23.5,29.5,17.5,2010,1
1,41.70722,-86.31628,235.4,9.4,1.22,23.0,29.0,16.9,2010,1
2,39.17399,-86.52076,253.0,7.8,1.89,24.9,31.5,18.2,2010,1
3,40.97248,-85.20636,243.0,10.7,0.64,23.0,28.9,17.2,2010,1
4,39.72515,-86.2816,241.3,10.1,1.22,24.6,31.0,18.2,2010,1


## Data Partition

In [14]:
# Determine the last year in the dataset
last_year = data_final['YEAR'].max()
last_year

2024

In [15]:
train_data = data_final[data_final['YEAR'] < last_year]
test_data = data_final[data_final['YEAR'] == last_year]

In [16]:
# Separate features and target variable
X_train = train_data.drop('PRCP', axis=1)
y_train = train_data['PRCP']
X_test = test_data.drop('PRCP', axis=1)
y_test = test_data['PRCP']

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [18]:
# Initialize a scaler
scaler = StandardScaler()

In [19]:
# Fit on training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Initialize KNN regressor with default k=5
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
y_train_pred = knn.predict(X_train_scaled)
y_test_pred = knn.predict(X_test_scaled)

In [21]:
mse_train = mean_squared_error(y_train, y_train_pred)
mse_train

1.8918587281733745

In [22]:
mse_test = mean_squared_error(y_test, y_test_pred)
mse_test

2.9243693333333334

In [23]:
prcp_min = data_final['PRCP'].min()
prcp_max = data_final['PRCP'].max()
prcp_range = prcp_max - prcp_min
prcp_min, prcp_max, prcp_range

(0.0, 13.5, 13.5)

The average squared difference between the predicted and actual precipitation values of the model is 2.924369.

The problem for this model is that it needs input data to predict.

## KNN model ( Using Lag precipitation)

In [24]:
data = pd.read_csv('/Users/yunlei/Desktop/MGMT 478/Combined dataset_nonsort.csv')
data.head(20)

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,SNOW,TAVG,TMAX,TMIN
0,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-01,8.5,1.01,,23.5,29.5,17.5
1,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-02,7.6,0.61,,26.0,32.8,19.1
2,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-03,7.2,3.22,,44.8,55.1,34.6
3,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-04,8.1,2.49,,58.1,70.4,45.8
4,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-05,6.7,5.55,,64.7,75.2,54.2
5,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-06,5.6,9.61,,74.6,84.3,64.8
6,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-07,4.3,4.04,,76.8,86.8,66.9
7,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-08,3.8,2.23,,76.8,87.7,66.0
8,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-09,6.0,1.81,,67.5,80.0,55.0
9,USW00014835,"LAFAYETTE PURDUE UNIVERSITY AIRPORT, IN US",40.41236,-86.94739,181.7,2010-10,6.3,0.97,,56.4,70.4,42.4


In [25]:
data['DATE'] = pd.to_datetime(data['DATE'])
data.sort_values(['STATION', 'DATE'], inplace=True)

In [26]:
data.set_index(['STATION', 'DATE'], inplace=True)

In [27]:
for lag in [1, 3, 7]:
    data[f'PRCP_Lag{lag}'] = data.groupby(level='STATION')['PRCP'].shift(lag)

In [28]:
data.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,NAME,LATITUDE,LONGITUDE,ELEVATION,AWND,PRCP,SNOW,TAVG,TMAX,TMIN,PRCP_Lag1,PRCP_Lag3,PRCP_Lag7
STATION,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
USC00114355,2010-01-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,2.0,7.6,,23.9,,,,
USC00114355,2010-02-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,0.8,12.8,,31.1,,2.0,,
USC00114355,2010-03-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,2.5,1.4,,52.8,,0.8,,
USC00114355,2010-04-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,3.34,0.0,,71.4,,2.5,2.0,
USC00114355,2010-05-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,6.63,0.0,,73.0,,3.34,0.8,
USC00114355,2010-06-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,9.96,0.0,,83.6,,6.63,2.5,
USC00114355,2010-08-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,5.5,0.0,,86.0,,9.96,3.34,
USC00114355,2010-09-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,4.76,0.0,,76.6,,5.5,6.63,2.0
USC00114355,2010-10-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,1.67,0.0,,69.9,,4.76,9.96,0.8
USC00114355,2010-11-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,1.55,0.0,,52.8,,1.67,5.5,2.5


In [46]:
data.dropna(subset=['PRCP'] + [f'PRCP_Lag{lag}' for lag in [1, 3, 7]], inplace=True)

In [47]:
data.head(20)

Unnamed: 0,STATION,DATE,NAME,LATITUDE,LONGITUDE,ELEVATION,AWND,PRCP,SNOW,TAVG,TMAX,TMIN,PRCP_Lag1,PRCP_Lag3,PRCP_Lag7
0,USC00114355,2010-09-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,4.76,0.0,,76.6,,5.5,6.63,2.0
1,USC00114355,2010-10-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,1.67,0.0,,69.9,,4.76,9.96,0.8
2,USC00114355,2010-11-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,1.55,0.0,,52.8,,1.67,5.5,2.5
3,USC00114355,2010-12-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,1.55,11.0,,30.4,,1.55,4.76,3.34
4,USC00114355,2011-01-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,0.89,,,26.6,,1.55,1.67,6.63
5,USC00114355,2011-02-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,2.67,,,35.4,,0.89,1.55,9.96
6,USC00114355,2011-03-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,2.07,,,48.4,,2.67,1.55,5.5
7,USC00114355,2011-04-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,3.64,,,58.7,,2.07,0.89,4.76
8,USC00114355,2011-05-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,3.51,,,73.4,,3.64,2.67,1.67
9,USC00114355,2011-06-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,4.71,,71.7,80.9,62.6,3.51,2.07,1.55


In [48]:
data.reset_index(inplace=True)

In [49]:
data.head()

Unnamed: 0,index,STATION,DATE,NAME,LATITUDE,LONGITUDE,ELEVATION,AWND,PRCP,SNOW,TAVG,TMAX,TMIN,PRCP_Lag1,PRCP_Lag3,PRCP_Lag7
0,0,USC00114355,2010-09-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,4.76,0.0,,76.6,,5.5,6.63,2.0
1,1,USC00114355,2010-10-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,1.67,0.0,,69.9,,4.76,9.96,0.8
2,2,USC00114355,2010-11-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,1.55,0.0,,52.8,,1.67,5.5,2.5
3,3,USC00114355,2010-12-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,1.55,11.0,,30.4,,1.55,4.76,3.34
4,4,USC00114355,2011-01-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,,0.89,,,26.6,,1.55,1.67,6.63


In [50]:
# Check for missing values
data.isnull().sum()

index           0
STATION         0
DATE            0
NAME            0
LATITUDE        0
LONGITUDE       0
ELEVATION       0
AWND          649
PRCP            0
SNOW         1628
TAVG           16
TMAX            6
TMIN           14
PRCP_Lag1       0
PRCP_Lag3       0
PRCP_Lag7       0
dtype: int64

In [51]:
# Drop the 'SNOW' column
data_cleaned = data.drop(['SNOW'], axis=1)

In [52]:
# Convert non-numeric to numeric
for column in ['AWND', 'TAVG', 'TMAX', 'TMIN']:
    data_cleaned[column] = pd.to_numeric(data_cleaned[column], errors='coerce')

In [53]:
imputer = SimpleImputer(strategy='median')
data_cleaned[['AWND', 'TAVG', 'TMAX', 'TMIN']] = imputer.fit_transform(data_cleaned[['AWND', 'TAVG', 'TMAX', 'TMIN']])

In [54]:
data_cleaned.head(20)

Unnamed: 0,index,STATION,DATE,NAME,LATITUDE,LONGITUDE,ELEVATION,AWND,PRCP,TAVG,TMAX,TMIN,PRCP_Lag1,PRCP_Lag3,PRCP_Lag7
0,0,USC00114355,2010-09-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,7.8,4.76,57.5,76.6,47.6,5.5,6.63,2.0
1,1,USC00114355,2010-10-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,7.8,1.67,57.5,69.9,47.6,4.76,9.96,0.8
2,2,USC00114355,2010-11-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,7.8,1.55,57.5,52.8,47.6,1.67,5.5,2.5
3,3,USC00114355,2010-12-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,7.8,1.55,57.5,30.4,47.6,1.55,4.76,3.34
4,4,USC00114355,2011-01-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,7.8,0.89,57.5,26.6,47.6,1.55,1.67,6.63
5,5,USC00114355,2011-02-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,7.8,2.67,57.5,35.4,47.6,0.89,1.55,9.96
6,6,USC00114355,2011-03-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,7.8,2.07,57.5,48.4,47.6,2.67,1.55,5.5
7,7,USC00114355,2011-04-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,7.8,3.64,57.5,58.7,47.6,2.07,0.89,4.76
8,8,USC00114355,2011-05-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,7.8,3.51,57.5,73.4,47.6,3.64,2.67,1.67
9,9,USC00114355,2011-06-01,"ILLINOIS CITY DAM 16, IL US",41.4255,-91.0094,167.6,7.8,4.71,71.7,80.9,62.6,3.51,2.07,1.55


In [55]:
data_cleaned['YEAR'] = data_cleaned['DATE'].dt.year
train_data = data_cleaned[data_cleaned['YEAR'] < data_cleaned['YEAR'].max()]
test_data = data_cleaned[data_cleaned['YEAR'] == data_cleaned['YEAR'].max()]

In [56]:
X_train = train_data[[f'PRCP_Lag{lag}' for lag in [1, 3, 7]]]
X_train.head()

Unnamed: 0,PRCP_Lag1,PRCP_Lag3,PRCP_Lag7
0,5.5,6.63,2.0
1,4.76,9.96,0.8
2,1.67,5.5,2.5
3,1.55,4.76,3.34
4,1.55,1.67,6.63


In [57]:
y_train = train_data['PRCP']
X_test = test_data[[f'PRCP_Lag{lag}' for lag in [1, 3, 7]]]
y_test = test_data['PRCP']

In [58]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) 

In [59]:
y_train.isnull().any().any()

False

In [64]:
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
y_test_pred = knn.predict(X_test_scaled)
y_test_pred[:5]
y_train_pred = knn.predict(X_train_scaled)

In [65]:
mse_train = mean_squared_error(y_train, y_train_pred)
mse_train

2.845744840731071

In [63]:
mse_test = mean_squared_error(y_test, y_test_pred)
mse_test 

2.1179003333333335