In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Ignore every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. about
# chained assignment) — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [80]:
# Read the three CSV files (train features, train labels, test features)
# in the same order as before and bind them to their respective names.
train_input, train_output, test_input = (
    pd.read_csv(csv_name)
    for csv_name in (
        "dengue_features_train.csv",
        "dengue_labels_train.csv",
        "dengue_features_test.csv",
    )
)

In [81]:
# Inner-join the training features with the training labels on the shared
# (city, year, weekofyear) key.
df_train = train_input.merge(
    train_output,
    how="inner",
    on=["city", "year", "weekofyear"],
)

In [82]:
# Column names, in the order used to select/reorder df_train columns.
column_order = [
    # identifier and time columns
    "city",
    "year",
    "weekofyear",
    "week_start_date",
    # station_* columns
    "station_max_temp_c",
    "station_min_temp_c",
    "station_avg_temp_c",
    "station_precip_mm",
    "station_diur_temp_rng_c",
    # precipitation and reanalysis_* columns
    "precipitation_amt_mm",
    "reanalysis_sat_precip_amt_mm",
    "reanalysis_dew_point_temp_k",
    "reanalysis_air_temp_k",
    "reanalysis_relative_humidity_percent",
    "reanalysis_specific_humidity_g_per_kg",
    "reanalysis_precip_amt_kg_per_m2",
    "reanalysis_max_air_temp_k",
    "reanalysis_min_air_temp_k",
    "reanalysis_avg_temp_k",
    "reanalysis_tdtr_k",
    # ndvi_* columns
    "ndvi_se",
    "ndvi_sw",
    "ndvi_ne",
    "ndvi_nw",
]

In [83]:
# Build the working frame in one non-mutating chain (no inplace=True and no
# column assignment on a frame derived from a selection):
#   1. keep only the columns listed in column_order, in that order;
#   2. drop four of those columns;
#   3. replace week_start_date by the month number parsed from it;
#   4. rename that column to "month" (its position is unchanged).
# NOTE(review): step 1 keeps only names in column_order, so any other column
# carried by df_train (e.g. from the labels merge) is discarded here —
# confirm that is intended.
df_train = (
    df_train[column_order]
    .drop(
        columns=[
            "station_avg_temp_c",
            "reanalysis_avg_temp_k",
            "reanalysis_air_temp_k",
            "reanalysis_sat_precip_amt_mm",
        ]
    )
    .assign(
        week_start_date=lambda frame: pd.to_datetime(frame["week_start_date"]).dt.month
    )
    .rename(columns={"week_start_date": "month"})
)

In [84]:
def deal_with_null_values(df_train, cols, group_cols=("year", "month")):
    """Fill missing values in ``cols`` with the rounded mean of their group.

    For every column in ``cols`` that contains at least one null, each null
    is replaced by the mean of the non-null values sharing the same
    ``group_cols`` key, rounded to one decimal place. Columns without nulls
    are left untouched. If a group has no non-null value, its nulls stay null.

    The fill is written with a single ``.loc`` assignment rather than chained
    indexing (``df[col][mask] = ...``), so it takes effect regardless of the
    pandas copy/view mode.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame to impute. It is modified in place and also returned.
    cols : iterable of str
        Names of the columns to impute.
    group_cols : str or sequence of str, default ("year", "month")
        Column(s) defining the groups whose mean is used as the fill value.
        The default groups by year and month only, so rows from different
        cities in the same year/month share one mean; pass
        ``("city", "year", "month")`` to impute per city.

    Returns
    -------
    pandas.DataFrame
        The same ``df_train`` object, with the requested columns imputed.
    """
    group_keys = [group_cols] if isinstance(group_cols, str) else list(group_cols)

    for col in cols:
        missing = df_train[col].isna()
        if not missing.any():
            # Nothing to fill; also avoids computing a mean on columns
            # (e.g. non-numeric ones) that never needed imputation.
            continue

        # Per-row mean of the row's group (NaNs are skipped by the mean),
        # rounded to one decimal.
        group_mean = df_train.groupby(group_keys)[col].transform("mean").round(1)

        # Positional (NumPy) right-hand side: no index alignment involved.
        df_train.loc[missing, col] = group_mean.to_numpy()[missing.to_numpy()]

    return df_train

In [85]:
# Per-column null counts as a two-column frame: "index" holds the column
# name, 0 holds the number of nulls.
null_columns_df = df_train.isna().sum().reset_index()

# Names of the columns that contain at least one null.
has_nulls = null_columns_df[0] > 0
null_columns_all = null_columns_df.loc[has_nulls, "index"].values

# Keep only the names that do not start with "ndvi".
is_not_ndvi = [not name.startswith("ndvi") for name in null_columns_all]
null_columns = null_columns_all[is_not_ndvi]

In [89]:
# Display the names of the null-containing columns (those not starting with
# "ndvi").
# NOTE(review): this cell's execution count (89) is higher than that of the
# cells below it (86, 88), so the notebook was not last run top to bottom.
null_columns

array(['station_max_temp_c', 'station_min_temp_c', 'station_precip_mm',
       'station_diur_temp_rng_c', 'precipitation_amt_mm',
       'reanalysis_dew_point_temp_k',
       'reanalysis_relative_humidity_percent',
       'reanalysis_specific_humidity_g_per_kg',
       'reanalysis_precip_amt_kg_per_m2', 'reanalysis_max_air_temp_k',
       'reanalysis_min_air_temp_k', 'reanalysis_tdtr_k'], dtype=object)

In [86]:
# Impute the selected columns and rebind df_train to the returned frame.
df_train = deal_with_null_values(df_train=df_train, cols=null_columns)

In [88]:
# Display df_train; the notebook renders a truncated preview (first and last
# rows) of the frame.
df_train

Unnamed: 0,city,year,weekofyear,month,station_max_temp_c,station_min_temp_c,station_precip_mm,station_diur_temp_rng_c,precipitation_amt_mm,reanalysis_dew_point_temp_k,reanalysis_relative_humidity_percent,reanalysis_specific_humidity_g_per_kg,reanalysis_precip_amt_kg_per_m2,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_tdtr_k,ndvi_se,ndvi_sw,ndvi_ne,ndvi_nw
0,sj,1990,18,4,29.4,20.0,16.0,6.900000,12.42,292.414286,73.365714,14.012857,32.00,299.8,295.9,2.628571,0.198483,0.177617,0.122600,0.103725
1,sj,1990,19,5,31.7,22.2,8.6,6.371429,22.82,293.951429,77.368571,15.372857,17.94,300.9,296.4,2.371429,0.162357,0.155486,0.169900,0.142175
2,sj,1990,20,5,32.2,22.8,41.4,6.485714,34.54,295.434286,82.052857,16.848571,26.10,300.5,297.3,2.300000,0.157200,0.170843,0.032250,0.172967
3,sj,1990,21,5,33.3,23.3,4.0,6.771429,15.36,295.310000,80.337143,16.672857,13.90,301.4,297.0,2.428571,0.227557,0.235886,0.128633,0.245067
4,sj,1990,22,5,35.0,23.9,5.8,9.371429,7.52,295.821429,80.460000,17.210000,12.20,301.9,297.5,3.014286,0.251200,0.247340,0.196200,0.262200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,iq,2010,21,5,35.4,22.4,27.0,11.933333,55.30,296.825714,88.765714,18.485714,45.00,309.7,294.5,9.800000,0.256343,0.292514,0.342750,0.318900
1452,iq,2010,22,6,34.7,21.7,36.6,10.500000,86.47,296.452857,91.600000,18.070000,207.10,308.5,291.9,7.471429,0.136043,0.225657,0.160157,0.160371
1453,iq,2010,23,6,32.2,19.2,7.4,6.900000,58.94,295.501429,94.280000,17.008571,50.60,305.5,292.4,7.500000,0.250357,0.233714,0.247057,0.146057
1454,iq,2010,24,6,31.2,21.0,16.0,8.733333,59.67,295.324286,94.660000,16.815714,62.33,306.1,291.9,7.871429,0.278886,0.325486,0.333914,0.245771
