In [20]:
import pandas as pd
import numpy as np

In [2]:
def get_missing_data(df):
    """Report the share of missing values in every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    dict
        Maps each column label to the percentage (0-100) of that column's
        entries that are null.
    """
    row_count = df.shape[0]
    null_counts = df.isnull().sum()
    percent_missing = null_counts / row_count * 100
    return dict(percent_missing)

In [3]:
def fill_missed_values(df):
    """Impute missing values column by column, modifying ``df`` in place.

    Every column that contains at least one null is filled with its mean
    when a mean can be computed; otherwise (e.g. text columns) the nulls are
    replaced with the placeholder string ``'missed'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Its columns are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    # Only columns that actually contain nulls need any work.
    incomplete_columns = [col for col in df.columns if df[col].isnull().any()]
    for col in incomplete_columns:
        try:
            fill_value = df[col].mean()
        except (TypeError, ValueError):
            # The column has no meaningful mean (non-numeric data). Catching
            # only these errors keeps KeyboardInterrupt and genuine bugs
            # visible instead of silently turning them into 'missed'.
            fill_value = 'missed'
        df[col] = df[col].fillna(fill_value)
    return df

In [4]:
def drop_rain_missed(df):
    """Return ``df`` without the rows flagged ``'missed'`` in a rain column.

    A row is flagged when its ``'RainToday'`` or ``'RainTomorrow'`` value
    equals the string ``'missed'``. Flagged rows are removed by index label,
    and the input frame itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``'RainToday'`` and ``'RainTomorrow'`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the flagged index labels dropped.
    """
    today_missed = df['RainToday'] == 'missed'
    tomorrow_missed = df['RainTomorrow'] == 'missed'
    flagged_labels = df[today_missed | tomorrow_missed].index
    return df.drop(flagged_labels)

In [5]:
def encoding_data(df):
    """Label-encode the text columns of ``df`` in place.

    Columns with an object or string dtype are replaced by integer codes
    produced by the module-level encoder ``le``; all other columns are left
    unchanged. Because one encoder is reused, after the call ``le`` remains
    fitted on the last column that was encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. Its text columns are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for column in df.columns:
        values = df[column]
        # Decide from the column dtype. The previous check,
        # `type(df[column][2] == str)`, took the type of a boolean (always
        # truthy), so every column was encoded, and it also required an index
        # label equal to 2 to exist.
        is_text = (
            pd.api.types.is_object_dtype(values)
            or pd.api.types.is_string_dtype(values)
        )
        if is_text:
            df[column] = le.fit_transform(values)
    return df

In [6]:
# Load the training and test tables from the CSV files under input/.
train_ds = pd.read_csv('input/train.csv')
test_ds = pd.read_csv('input/test.csv')

In [7]:
# Re-index both tables by their 'Id' column (the column moves into the index).
train_ds = train_ds.set_index('Id')
test_ds = test_ds.set_index('Id')

In [8]:
# Impute each table independently: fill_missed_values replaces nulls with the
# column mean where one can be computed and with the string 'missed' otherwise,
# so the train and test tables are each filled from their own column means.
train_ds = fill_missed_values(train_ds)
test_ds = fill_missed_values(test_ds)

In [9]:
# Stack the train and test rows into one frame so the following cells
# process both together.
data = pd.concat([train_ds, test_ds])
# Row count of the combined frame.
data.shape[0]

142193

In [10]:
# Discard the 'Date' and 'Location' columns, then preview the first three rows.
data = data.drop(columns=['Date', 'Location'])
data.head(3)

Unnamed: 0_level_0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,7.4,25.1,0.0,5.464482,7.621446,WNW,44.0,NNW,WSW,4.0,...,44.0,25.0,1010.6,1007.8,4.425827,4.501143,17.2,24.3,No,No
3,12.9,25.7,0.0,5.464482,7.621446,WSW,46.0,W,WSW,19.0,...,38.0,30.0,1007.6,1008.7,4.425827,2.0,21.0,23.2,No,No
4,9.2,28.0,0.0,5.464482,7.621446,NE,24.0,SE,E,11.0,...,45.0,16.0,1017.6,1012.8,4.425827,4.501143,18.1,26.5,No,No


In [11]:
# Sanity check: per-column percentage of nulls left in the combined frame.
print(get_missing_data(data))

{'MinTemp': 0.0, 'MaxTemp': 0.0, 'Rainfall': 0.0, 'Evaporation': 0.0, 'Sunshine': 0.0, 'WindGustDir': 0.0, 'WindGustSpeed': 0.0, 'WindDir9am': 0.0, 'WindDir3pm': 0.0, 'WindSpeed9am': 0.0, 'WindSpeed3pm': 0.0, 'Humidity9am': 0.0, 'Humidity3pm': 0.0, 'Pressure9am': 0.0, 'Pressure3pm': 0.0, 'Cloud9am': 0.0, 'Cloud3pm': 0.0, 'Temp9am': 0.0, 'Temp3pm': 0.0, 'RainToday': 0.0, 'RainTomorrow': 0.0}


In [12]:
from sklearn.preprocessing import LabelEncoder
# Module-level encoder: encoding_data() looks up the name `le` as a global, so
# this cell has to run before encoding_data() is called.
le = LabelEncoder()

In [13]:
# Work on an independent copy so later changes to data_le can never write
# through to `data`; plain assignment would only bind a second name to the
# same DataFrame object.
data_le = data.copy()
data_le.shape[0]

142193

In [14]:
# Remove the rows whose RainToday or RainTomorrow value is the string 'missed'.
data_le = drop_rain_missed(data_le)
# Row count after the removal.
data_le.shape[0]

140787

In [15]:
# Label-encode data_le via encoding_data, which uses the module-level encoder
# `le` and modifies and returns the frame it is given.
data_le = encoding_data(data_le)

In [17]:
# Separate the target column from the features: y is 'RainTomorrow', X is
# every other column.
y = data_le['RainTomorrow']
X = data_le.drop(columns='RainTomorrow')

In [19]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

# Number of cross-validation folds.
folds = 3

# Stratified splitter that shuffles with a fixed random_state.
kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1234)

# XGBoost classifier configured with the binary logistic objective.
model = xgb.XGBClassifier(objective='binary:logistic')

In [21]:
from sklearn.preprocessing import MinMaxScaler

# Seed NumPy's global random number generator.
np.random.seed(1234)

scaler = MinMaxScaler()

# Fit the scaler on all of X and overwrite every column with its scaled values.
# NOTE(review): the scaler is fitted before any fold split is applied — confirm
# this is intended.
X[X.columns] = scaler.fit_transform(X[X.columns])