In [197]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.preprocessing import *
from sklearn.svm import *
from sklearn.metrics import *
from sklearn.naive_bayes import *
from sklearn.linear_model import *

warnings.filterwarnings("ignore")

In [198]:
train_input = pd.read_csv("dengue_features_train.csv")
train_output = pd.read_csv("dengue_labels_train.csv")
test_input = pd.read_csv("dengue_features_test.csv")

In [199]:
df_train = pd.merge(left=train_input,right=train_output,how="inner",on=["city","year","weekofyear"])
df_test = test_input.copy()

In [200]:
column_order = ["city","year","weekofyear","week_start_date","station_max_temp_c","station_min_temp_c","station_avg_temp_c","station_precip_mm","station_diur_temp_rng_c","precipitation_amt_mm","reanalysis_sat_precip_amt_mm","reanalysis_dew_point_temp_k","reanalysis_air_temp_k","reanalysis_relative_humidity_percent","reanalysis_specific_humidity_g_per_kg","reanalysis_precip_amt_kg_per_m2","reanalysis_max_air_temp_k","reanalysis_min_air_temp_k","reanalysis_avg_temp_k","reanalysis_tdtr_k","ndvi_se","ndvi_sw","ndvi_ne","ndvi_nw","total_cases"]

In [201]:
df_train_1 = df_train[column_order]
df_train_1.drop(columns=["station_avg_temp_c","reanalysis_avg_temp_k","reanalysis_air_temp_k","reanalysis_sat_precip_amt_mm"],inplace=True)
df_train_1["week_start_date"] = pd.to_datetime(df_train["week_start_date"]).dt.month
df_train_1.rename(columns={"week_start_date":"month"},inplace=True)

df_test_1 = df_test[column_order[:len(column_order)-1]]
df_test_1.drop(columns=["station_avg_temp_c","reanalysis_avg_temp_k","reanalysis_air_temp_k","reanalysis_sat_precip_amt_mm"],inplace=True)
df_test_1["week_start_date"] = pd.to_datetime(df_test["week_start_date"]).dt.month
df_test_1.rename(columns={"week_start_date":"month"},inplace=True)

In [202]:
def deal_with_null_values(df,cols):
    for col in cols:
        null_data = df[df[col].isna()][["year","month"]].value_counts().reset_index()[["year","month"]].sort_values(by=["year","month"]).reset_index(drop=True)
        for i in null_data.index.values:
            year = null_data.loc[i,"year"]
            month = null_data.loc[i,"month"]
            rep = df[col][df["year"]==year][df["month"]==month].mean()
            df[col][(df["year"]==year) & (df["month"]==month)] = df[col][(df["year"]==year) & (df["month"]==month)].fillna(rep)
    return df

In [203]:
null_columns_df = df_train_1.isna().sum().reset_index()
null_columns_all_train = null_columns_df[null_columns_df[0]>0]["index"].values
null_columns_train = null_columns_all_train[[i[0:4]!="ndvi" for i in null_columns_all_train]]

null_columns_df = df_test_1.isna().sum().reset_index()
null_columns_all_test = null_columns_df[null_columns_df[0]>0]["index"].values
null_columns_test = null_columns_all_test[[i[0:4]!="ndvi" for i in null_columns_all_train]]

In [204]:
df_train_2 = deal_with_null_values(df_train_1,null_columns_train)
df_test_2 = deal_with_null_values(df_test_1,null_columns_test)

In [205]:
rem_columns_train = null_columns_all_train[[i not in null_columns_train for i in null_columns_all_train]]
rem_columns_test = null_columns_all_test[[i not in null_columns_test for i in null_columns_all_test]]

In [206]:
df_train_3 = df_train_2.copy()
for i in rem_columns:
    df_train_3[i] = df_train_3[i].fillna(df_train_3[i].mean())

df_test_3 = df_test_2.copy()
for i in rem_columns:
    df_test_3[i] = df_test_3[i].fillna(df_test_3[i].mean())

In [207]:
df_train_3

Unnamed: 0,city,year,weekofyear,month,station_max_temp_c,station_min_temp_c,station_precip_mm,station_diur_temp_rng_c,precipitation_amt_mm,reanalysis_dew_point_temp_k,...,reanalysis_specific_humidity_g_per_kg,reanalysis_precip_amt_kg_per_m2,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_tdtr_k,ndvi_se,ndvi_sw,ndvi_ne,ndvi_nw,total_cases
0,sj,1990,18,4,29.4,20.0,16.0,6.900000,12.42,292.414286,...,14.012857,32.00,299.8,295.9,2.628571,0.198483,0.177617,0.122600,0.103725,4
1,sj,1990,19,5,31.7,22.2,8.6,6.371429,22.82,293.951429,...,15.372857,17.94,300.9,296.4,2.371429,0.162357,0.155486,0.169900,0.142175,5
2,sj,1990,20,5,32.2,22.8,41.4,6.485714,34.54,295.434286,...,16.848571,26.10,300.5,297.3,2.300000,0.157200,0.170843,0.032250,0.172967,4
3,sj,1990,21,5,33.3,23.3,4.0,6.771429,15.36,295.310000,...,16.672857,13.90,301.4,297.0,2.428571,0.227557,0.235886,0.128633,0.245067,3
4,sj,1990,22,5,35.0,23.9,5.8,9.371429,7.52,295.821429,...,17.210000,12.20,301.9,297.5,3.014286,0.251200,0.247340,0.196200,0.262200,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,iq,2010,21,5,35.4,22.4,27.0,11.933333,55.30,296.825714,...,18.485714,45.00,309.7,294.5,9.800000,0.256343,0.292514,0.342750,0.318900,5
1452,iq,2010,22,6,34.7,21.7,36.6,10.500000,86.47,296.452857,...,18.070000,207.10,308.5,291.9,7.471429,0.136043,0.225657,0.160157,0.160371,8
1453,iq,2010,23,6,32.2,19.2,7.4,6.900000,58.94,295.501429,...,17.008571,50.60,305.5,292.4,7.500000,0.250357,0.233714,0.247057,0.146057,1
1454,iq,2010,24,6,31.2,21.0,16.0,8.733333,59.67,295.324286,...,16.815714,62.33,306.1,291.9,7.871429,0.278886,0.325486,0.333914,0.245771,1


In [208]:
df_test_3

Unnamed: 0,city,year,weekofyear,month,station_max_temp_c,station_min_temp_c,station_precip_mm,station_diur_temp_rng_c,precipitation_amt_mm,reanalysis_dew_point_temp_k,reanalysis_relative_humidity_percent,reanalysis_specific_humidity_g_per_kg,reanalysis_precip_amt_kg_per_m2,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_tdtr_k,ndvi_se,ndvi_sw,ndvi_ne,ndvi_nw
0,sj,2008,18,4,33.3,21.7,75.2,7.057143,78.60,294.527143,78.781429,15.918571,25.37,301.1,296.4,3.128571,0.102729,0.091200,-0.018900,-0.018900
1,sj,2008,19,5,30.0,22.2,34.3,5.557143,12.56,294.395714,78.230000,15.791429,21.83,300.8,296.7,2.571429,0.082043,0.072314,-0.018000,-0.012400
2,sj,2008,20,5,32.8,22.8,3.0,7.785714,3.66,295.308571,78.270000,16.674286,4.12,302.2,296.4,4.428571,0.151083,0.091529,-0.001500,0.126803
3,sj,2008,21,5,33.3,24.4,0.3,6.271429,0.00,294.402857,73.015714,15.775714,2.20,303.0,296.9,4.342857,0.124329,0.125686,0.126050,-0.019867
4,sj,2008,22,5,33.3,23.3,84.1,7.085714,0.76,294.760000,74.084286,16.137143,4.36,302.3,297.3,3.542857,0.062267,0.075914,0.056800,0.039833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,iq,2013,22,5,32.6,21.8,33.0,9.050000,41.12,295.638571,89.990000,17.185714,67.60,305.5,292.7,10.100000,0.280629,0.383186,0.301471,0.380029
412,iq,2013,23,6,33.8,21.4,68.0,10.720000,71.52,295.845714,93.891429,17.448571,45.70,306.3,291.6,9.657143,0.285371,0.350357,0.247600,0.296343
413,iq,2013,24,6,32.6,21.6,93.2,10.075000,78.96,294.894286,94.967143,16.410000,45.22,304.6,290.7,7.385714,0.252586,0.249771,0.238729,0.251029
414,iq,2013,25,6,32.2,21.8,34.1,8.480000,39.54,293.648571,89.057143,15.137143,4.70,305.9,292.5,8.228571,0.406614,0.403943,0.310429,0.302700


In [209]:
X_train = df_train_3.drop(columns=["city","year","weekofyear","month","total_cases"])
y_train = df_train_3["total_cases"]
X_test = df_test_3.drop(columns=["city","year","weekofyear","month"])

In [210]:
cols_train_2 = rem_columns_train.copy()
cols_test_2 = rem_columns_test.copy()

cols_train_1 = X_train.columns.values[[i not in cols_train_2 for i in X_train.columns.values]]
cols_test_1 = X_test.columns.values[[i not in cols_test_2 for i in X_test.columns.values]]

In [211]:
mms = MinMaxScaler()

X_train[cols_train_1] = mms.fit_transform(X_train[cols_train_1])
X_test[cols_test_1] = mms.fit_transform(X_test[cols_test_1])

In [212]:
ss = StandardScaler()

X_train[cols_train_2] = ss.fit_transform(X_train[cols_train_2])
X_test[cols_test_2] = ss.fit_transform(X_test[cols_test_2])

In [217]:
naive_bayes = GaussianNB()
naive_bayes.fit(X_train,y_train)
y_pred = naive_bayes.predict(X_test)

In [218]:
df_final = pd.DataFrame(columns=["city","year","weekofyear","total_cases"])
df_final["city"] = test_input["city"].copy()
df_final["year"] = test_input["year"].copy()
df_final["weekofyear"] = test_input["weekofyear"].copy()
df_final["total_cases"] = y_pred

In [219]:
df_final

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,24
1,sj,2008,19,75
2,sj,2008,20,17
3,sj,2008,21,13
4,sj,2008,22,10
...,...,...,...,...
411,iq,2013,22,0
412,iq,2013,23,1
413,iq,2013,24,1
414,iq,2013,25,0


In [220]:
df_final.to_csv("amith_submission.csv",index=False,header=True)