## Missing data imputation 

In [1]:
import numpy as np
import pandas as pd 
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer

### Load the processed data set (which still contains missing values) and keep only records from 1950 onward

In [2]:
# Read the processed data set, then keep only the rows whose year is 1950 or later.
data = pd.read_csv("../data/processed/processed_data.csv")
from_1950_onward = data["year"] >= 1950
filtered_data = data[from_1950_onward]
filtered_data

Unnamed: 0,id,location,country,city,lat,long,alt,year,bloom_date,bloom_doy,tmax,tmin,prcp,agdd_winter,tmax_winter,tmin_winter,prcp_winter,co2_percapita,co2_emission
0,JAM00047918,Japan/Ishigakijima,Japan,Ishigakijima,24.336667,124.164444,5.7,1962,1962-02-20,51,,,,,,,,3.0680,292864430.0
1,JAM00047918,Japan/Ishigakijima,Japan,Ishigakijima,24.336667,124.164444,5.7,1963,1963-02-15,46,,,,,,,,3.3700,324835463.0
2,JAM00047918,Japan/Ishigakijima,Japan,Ishigakijima,24.336667,124.164444,5.7,1967,1967-02-13,44,,,,,,,,4.8532,489307430.0
3,JAM00047918,Japan/Ishigakijima,Japan,Ishigakijima,24.336667,124.164444,5.7,1971,1971-02-10,41,,,,,,,,7.4844,796544280.0
4,JAM00047918,Japan/Ishigakijima,Japan,Ishigakijima,24.336667,124.164444,5.7,1972,1972-02-20,51,,,,,,,,7.8922,852170216.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7023,GME00127786,Switzerland/Liestal,Switzerland,Liestal,47.481400,7.730519,350.0,2017,2017-03-26,85,16.618082,6.177260,2.740822,235.55,5.344240,-1.678610,15.421659,4.5155,38182513.0
7024,GME00127786,Switzerland/Liestal,Switzerland,Liestal,47.481400,7.730519,350.0,2018,2018-04-08,98,17.635616,7.103836,2.196438,337.85,6.214555,0.563710,27.304147,4.3243,36867631.0
7025,GME00127786,Switzerland/Liestal,Switzerland,Liestal,47.481400,7.730519,350.0,2019,2019-03-27,86,17.057534,6.315890,2.327945,374.60,7.906413,0.278994,20.287634,4.2766,36741865.0
7026,GME00127786,Switzerland/Liestal,Switzerland,Liestal,47.481400,7.730519,350.0,2020,2020-03-17,77,17.612842,6.630055,2.223224,475.65,9.360289,1.146607,21.002225,3.7319,32298333.0


### Set up the `KNNImputer` and select the columns used for imputation

In [3]:
# Distance-weighted 10-nearest-neighbour imputer. The "nan_euclidean" metric
# allows distances to be computed between rows that contain missing values.
imputer = KNNImputer(
    metric="nan_euclidean",
    weights="distance",
    n_neighbors=10,
)

In [4]:
# Numeric columns passed to the imputer: site coordinates, weather
# measurements, and per-capita CO2.
for_imputation = [
    "lat", "long", "alt",
    "tmax", "tmin", "prcp",
    "agdd_winter", "tmax_winter", "tmin_winter", "prcp_winter",
    "co2_percapita",
]

# Columns carried over unchanged and re-attached after imputation.
others = [
    "country",
    "city",
    "year",
    "bloom_doy",
]

In [5]:
# Restrict the filtered frame to the columns that go through the imputer.
for_imputation_data = filtered_data.loc[:, for_imputation]
for_imputation_data

Unnamed: 0,lat,long,alt,tmax,tmin,prcp,agdd_winter,tmax_winter,tmin_winter,prcp_winter,co2_percapita
0,24.336667,124.164444,5.7,,,,,,,,3.0680
1,24.336667,124.164444,5.7,,,,,,,,3.3700
2,24.336667,124.164444,5.7,,,,,,,,4.8532
3,24.336667,124.164444,5.7,,,,,,,,7.4844
4,24.336667,124.164444,5.7,,,,,,,,7.8922
...,...,...,...,...,...,...,...,...,...,...,...
7023,47.481400,7.730519,350.0,16.618082,6.177260,2.740822,235.55,5.344240,-1.678610,15.421659,4.5155
7024,47.481400,7.730519,350.0,17.635616,7.103836,2.196438,337.85,6.214555,0.563710,27.304147,4.3243
7025,47.481400,7.730519,350.0,17.057534,6.315890,2.327945,374.60,7.906413,0.278994,20.287634,4.2766
7026,47.481400,7.730519,350.0,17.612842,6.630055,2.223224,475.65,9.360289,1.146607,21.002225,3.7319


In [6]:
# Fit the imputer on the selected columns and fill in their missing values.
# fit_transform returns a bare array, so the column names are re-attached here.
# NOTE(review): the columns are on very different scales and StandardScaler is
# imported but not applied before the distance computation — confirm that
# unscaled neighbour distances are intended.
imputed_values = imputer.fit_transform(for_imputation_data)
impute_df = pd.DataFrame(imputed_values, columns=for_imputation_data.columns)
# reset_index() adds an "index" column holding the new 0..n-1 row positions.
impute_df = impute_df.reset_index()
impute_df

Unnamed: 0,index,lat,long,alt,tmax,tmin,prcp,agdd_winter,tmax_winter,tmin_winter,prcp_winter,co2_percapita
0,0,24.336667,124.164444,5.7,23.800000,21.050400,21.970455,22.35,17.800000,15.018750,98.400000,3.068000
1,1,24.336667,124.164444,5.7,23.800000,21.050400,21.970455,22.35,17.800000,15.018750,98.400000,3.370000
2,2,24.336667,124.164444,5.7,23.800000,21.050400,21.970455,22.35,17.800000,15.018750,98.400000,4.853200
3,3,24.336667,124.164444,5.7,23.800000,21.050400,21.970455,22.35,17.800000,15.018750,98.400000,7.484400
4,4,24.336667,124.164444,5.7,23.800000,21.050400,21.970455,22.35,17.800000,15.018750,98.400000,7.892200
...,...,...,...,...,...,...,...,...,...,...,...,...
6177,6177,47.481400,7.730519,350.0,16.618082,6.177260,2.740822,235.55,5.344240,-1.678610,15.421659,4.515500
6178,6178,47.481400,7.730519,350.0,17.635616,7.103836,2.196438,337.85,6.214555,0.563710,27.304147,4.324300
6179,6179,47.481400,7.730519,350.0,17.057534,6.315890,2.327945,374.60,7.906413,0.278994,20.287634,4.276600
6180,6180,47.481400,7.730519,350.0,17.612842,6.630055,2.223224,475.65,9.360289,1.146607,21.002225,3.731900


In [7]:
# Pull out the pass-through columns. reset_index() renumbers the rows 0..n-1
# and keeps the original row labels in a new "index" column.
others_df = filtered_data.loc[:, others].reset_index()
others_df

Unnamed: 0,index,country,city,year,bloom_doy
0,0,Japan,Ishigakijima,1962,51
1,1,Japan,Ishigakijima,1963,46
2,2,Japan,Ishigakijima,1967,44
3,3,Japan,Ishigakijima,1971,41
4,4,Japan,Ishigakijima,1972,51
...,...,...,...,...,...
6177,7023,Switzerland,Liestal,2017,85
6178,7024,Switzerland,Liestal,2018,98
6179,7025,Switzerland,Liestal,2019,86
6180,7026,Switzerland,Liestal,2020,77


### Put together the final data set and export it to CSV

In [8]:
# Place the pass-through columns and the imputed columns side by side, then
# discard the helper "index" columns that reset_index() left in both frames.
combined = pd.concat([others_df, impute_df], axis=1)
final_df = combined.drop(columns=["index"])

In [9]:
# Display the assembled frame: country/city/year/bloom_doy followed by the
# imputed feature columns.
final_df

Unnamed: 0,country,city,year,bloom_doy,lat,long,alt,tmax,tmin,prcp,agdd_winter,tmax_winter,tmin_winter,prcp_winter,co2_percapita
0,Japan,Ishigakijima,1962,51,24.336667,124.164444,5.7,23.800000,21.050400,21.970455,22.35,17.800000,15.018750,98.400000,3.068000
1,Japan,Ishigakijima,1963,46,24.336667,124.164444,5.7,23.800000,21.050400,21.970455,22.35,17.800000,15.018750,98.400000,3.370000
2,Japan,Ishigakijima,1967,44,24.336667,124.164444,5.7,23.800000,21.050400,21.970455,22.35,17.800000,15.018750,98.400000,4.853200
3,Japan,Ishigakijima,1971,41,24.336667,124.164444,5.7,23.800000,21.050400,21.970455,22.35,17.800000,15.018750,98.400000,7.484400
4,Japan,Ishigakijima,1972,51,24.336667,124.164444,5.7,23.800000,21.050400,21.970455,22.35,17.800000,15.018750,98.400000,7.892200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6177,Switzerland,Liestal,2017,85,47.481400,7.730519,350.0,16.618082,6.177260,2.740822,235.55,5.344240,-1.678610,15.421659,4.515500
6178,Switzerland,Liestal,2018,98,47.481400,7.730519,350.0,17.635616,7.103836,2.196438,337.85,6.214555,0.563710,27.304147,4.324300
6179,Switzerland,Liestal,2019,86,47.481400,7.730519,350.0,17.057534,6.315890,2.327945,374.60,7.906413,0.278994,20.287634,4.276600
6180,Switzerland,Liestal,2020,77,47.481400,7.730519,350.0,17.612842,6.630055,2.223224,475.65,9.360289,1.146607,21.002225,3.731900


In [10]:
# Write the imputed data set next to the input file it was derived from.
# The input is read from "../data/processed/", so the output uses the same
# directory rather than a "./processed/" folder relative to the notebook,
# which only resolves to the same place when the notebook runs from data/.
# index=False keeps the positional row index out of the CSV.
final_df.to_csv("../data/processed/clean_data.csv", index=False)

### Sanity Check

In [11]:
# Count the remaining missing values in each column of the final frame.
missing_per_column = final_df.isna().sum()
missing_per_column

country          0
city             0
year             0
bloom_doy        0
lat              0
long             0
alt              0
tmax             0
tmin             0
prcp             0
agdd_winter      0
tmax_winter      0
tmin_winter      0
prcp_winter      0
co2_percapita    0
dtype: int64