In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.preprocessing import *
from sklearn.naive_bayes import *

warnings.filterwarnings("ignore")

In [2]:
train_input = pd.read_csv("dengue_features_train.csv")
train_target = pd.read_csv("dengue_labels_train.csv")
test_input = pd.read_csv("dengue_features_test.csv")
submission_format = pd.read_csv("submission_format.csv")

In [3]:
proper_order = ["city","year","weekofyear","week_start_date","station_max_temp_c","station_min_temp_c","station_avg_temp_c","station_precip_mm","station_diur_temp_rng_c","precipitation_amt_mm","reanalysis_sat_precip_amt_mm","reanalysis_dew_point_temp_k","reanalysis_air_temp_k","reanalysis_relative_humidity_percent","reanalysis_specific_humidity_g_per_kg","reanalysis_precip_amt_kg_per_m2","reanalysis_max_air_temp_k","reanalysis_min_air_temp_k","reanalysis_avg_temp_k","reanalysis_tdtr_k","ndvi_se","ndvi_sw","ndvi_ne","ndvi_nw"]

In [4]:
train_input = train_input[proper_order]
test_input = test_input[proper_order]

In [5]:
train_input

Unnamed: 0,city,year,weekofyear,week_start_date,station_max_temp_c,station_min_temp_c,station_avg_temp_c,station_precip_mm,station_diur_temp_rng_c,precipitation_amt_mm,...,reanalysis_specific_humidity_g_per_kg,reanalysis_precip_amt_kg_per_m2,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_avg_temp_k,reanalysis_tdtr_k,ndvi_se,ndvi_sw,ndvi_ne,ndvi_nw
0,sj,1990,18,1990-04-30,29.4,20.0,25.442857,16.0,6.900000,12.42,...,14.012857,32.00,299.8,295.9,297.742857,2.628571,0.198483,0.177617,0.122600,0.103725
1,sj,1990,19,1990-05-07,31.7,22.2,26.714286,8.6,6.371429,22.82,...,15.372857,17.94,300.9,296.4,298.442857,2.371429,0.162357,0.155486,0.169900,0.142175
2,sj,1990,20,1990-05-14,32.2,22.8,26.714286,41.4,6.485714,34.54,...,16.848571,26.10,300.5,297.3,298.878571,2.300000,0.157200,0.170843,0.032250,0.172967
3,sj,1990,21,1990-05-21,33.3,23.3,27.471429,4.0,6.771429,15.36,...,16.672857,13.90,301.4,297.0,299.228571,2.428571,0.227557,0.235886,0.128633,0.245067
4,sj,1990,22,1990-05-28,35.0,23.9,28.942857,5.8,9.371429,7.52,...,17.210000,12.20,301.9,297.5,299.664286,3.014286,0.251200,0.247340,0.196200,0.262200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,iq,2010,21,2010-05-28,35.4,22.4,28.633333,27.0,11.933333,55.30,...,18.485714,45.00,309.7,294.5,300.771429,9.800000,0.256343,0.292514,0.342750,0.318900
1452,iq,2010,22,2010-06-04,34.7,21.7,27.433333,36.6,10.500000,86.47,...,18.070000,207.10,308.5,291.9,299.392857,7.471429,0.136043,0.225657,0.160157,0.160371
1453,iq,2010,23,2010-06-11,32.2,19.2,24.400000,7.4,6.900000,58.94,...,17.008571,50.60,305.5,292.4,297.592857,7.500000,0.250357,0.233714,0.247057,0.146057
1454,iq,2010,24,2010-06-18,31.2,21.0,25.433333,16.0,8.733333,59.67,...,16.815714,62.33,306.1,291.9,297.521429,7.871429,0.278886,0.325486,0.333914,0.245771


In [6]:
train_input_1 = train_input.copy()
train_input_1["month"] = pd.to_datetime(train_input_1["week_start_date"]).dt.month

test_input_1 = test_input.copy()
test_input_1["month"] = pd.to_datetime(test_input_1["week_start_date"]).dt.month

In [7]:
def replace_weather_parameters_with_null_values(df,cols):
    for col in cols:
        null_months = df[["year","month"]][df[col].isna()].value_counts().reset_index()[["year","month"]].sort_values(by=["year","month"]).reset_index(drop=True)
        for i in null_months.index.values:
            year = null_months.loc[i,"year"]
            month = null_months.loc[i,"month"]
            cond1 = df["year"] == year
            cond2 = df["month"] == month
            mean_value = df[col][cond1 & cond2].mean()
            df[col][cond1 & cond2] = df[col][cond1 & cond2].fillna(mean_value)
    return df

In [8]:
co = train_input.isna().sum().reset_index()
weather_features = co["index"][co[0]>0].reset_index(drop=True).values[:-4]
ndvi_features = co["index"][co[0]>0].reset_index(drop=True).values[-4:]

In [9]:
train_input_2 = replace_weather_parameters_with_null_values(train_input_1,weather_features)

test_input_2 = replace_weather_parameters_with_null_values(test_input_1,weather_features)

In [10]:
ndvi_features

array(['ndvi_se', 'ndvi_sw', 'ndvi_ne', 'ndvi_nw'], dtype=object)

In [11]:
for i in ndvi_features:
    train_input_2[i] = train_input_2[i].fillna(train_input_2[i].mean())
    test_input_2[i] = test_input_2[i].fillna(test_input_2[i].mean())

In [19]:
X_train = train_input_2.drop(columns=["city","year","weekofyear","week_start_date","month"])
X_test = test_input_2.drop(columns=["city","year","weekofyear","week_start_date","month"])
y_train = train_target["total_cases"].copy()

mms = MinMaxScaler()
X_train[weather_features] = mms.fit_transform(X_train[weather_features])
X_test[weather_features] = mms.fit_transform(X_test[weather_features])

ss = StandardScaler()
X_train[ndvi_features] = ss.fit_transform(X_train[ndvi_features])
X_test[ndvi_features] = ss.fit_transform(X_test[ndvi_features])

In [21]:
y_train

0       4
1       5
2       4
3       3
4       6
       ..
1451    5
1452    8
1453    1
1454    1
1455    4
Name: total_cases, Length: 1456, dtype: int64

In [24]:
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)

In [26]:
y_pred = naive_bayes.predict(X_test)

In [28]:
df_final = pd.DataFrame(columns=submission_format.columns)
df_final["city"] = test_input["city"].copy()
df_final["year"] = test_input["year"].copy()
df_final["weekofyear"] = test_input["weekofyear"].copy()
df_final["total_cases"] = y_pred

In [29]:
df_final

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,24
1,sj,2008,19,17
2,sj,2008,20,17
3,sj,2008,21,13
4,sj,2008,22,10
...,...,...,...,...
411,iq,2013,22,0
412,iq,2013,23,1
413,iq,2013,24,1
414,iq,2013,25,0


In [30]:
df_final.to_csv("amith_submission.csv",index=False,header=True)