In [4]:
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
#import plotly.express as px

from pandas_profiling import ProfileReport
from xgboost import XGBClassifier
from xgboost import plot_importance

# Scikit-learn packages
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

## Import helper functions
#from ipynb.fs.defs.utils import *

In [5]:
KAGGLE_EVAL_METRIC = 'logloss' # string name for loss function in xgboost

TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"

TARGET = ["rain_tomorrow"]

# Load Data

In [6]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

In [7]:
train

Unnamed: 0,id,date,location,min_temp,max_temp,rainfall,evaporation,sunshine,wind_gust_dir,wind_gust_speed,...,humidity9am,humidity3pm,pressure9am,pressure3pm,cloud9am,cloud3pm,temp9am,temp3pm,rain_today,rain_tomorrow
0,6364,2010-11-11,BadgerysCreek,16.2,28.9,11.4,,,SW,37.0,...,78.0,52.0,1017.2,1011.7,,,20.3,27.9,1.0,0
1,7985,2011-03-09,Sale,14.6,28.3,0.0,,,SSE,35.0,...,81.0,65.0,1012.9,1010.4,7.0,7.0,19.0,24.4,0.0,1
2,5021,2014-06-06,Nhil,2.6,16.3,0.0,,,S,31.0,...,93.0,61.0,1033.1,1031.0,,,5.9,14.6,0.0,0
3,28546,2017-06-07,Townsville,14.0,25.9,0.0,,,WNW,35.0,...,66.0,37.0,1020.4,1016.9,,,20.9,24.7,0.0,0
4,42222,2016-12-13,Uluru,16.7,41.7,0.0,,,SW,61.0,...,17.0,7.0,1006.1,1002.6,,,33.2,41.1,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34186,32001,2010-12-12,Brisbane,20.4,27.2,80.4,,False,W,15.0,...,81.0,86.0,1008.8,1006.9,8.0,8.0,23.9,23.7,1.0,1
34187,11289,2013-07-05,MountGambier,5.9,13.4,27.6,,,W,76.0,...,88.0,51.0,1009.2,1010.1,7.0,5.0,7.5,12.5,1.0,1
34188,17825,2013-12-12,Richmond,14.8,28.0,0.0,,,ENE,35.0,...,65.0,39.0,1011.8,1008.0,,,22.2,26.9,0.0,0
34189,16033,2013-07-20,MelbourneAirport,5.2,8.4,4.2,,False,NNW,28.0,...,85.0,88.0,1003.6,1002.9,7.0,7.0,7.7,6.2,1.0,1


In [8]:
## Creata a combined data set to ensure train + test get same pre-proccessing
train['label'] = 'train'
test['label'] = 'test'
combined = train.append(test)

In [9]:
combined[TARGET].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48844 entries, 0 to 14652
Data columns (total 1 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   rain_tomorrow  34191 non-null  float64
dtypes: float64(1)
memory usage: 763.2 KB


In [10]:
combined[TARGET]

Unnamed: 0,rain_tomorrow
0,0.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14648,
14649,
14650,
14651,


# Pre-Processing

In [11]:
# Dates
combined['date'] = pd.to_datetime(train['date'])
combined['year'] = combined['date'].dt.year
combined['day'] = combined['date'].dt.day
combined['month'] = combined['date'].dt.month
combined['dayofyear'] = combined['date'].dt.dayofyear

In [12]:
# location
temp_count = combined[['location','rain_today']].groupby(['location'], as_index=False).count()
temp_sum = combined[['location','rain_today']].groupby(['location'], as_index=False).sum()
rain_location = temp_count.merge(temp_sum, on='location')
rain_location['prob'] = rain_location['rain_today_y'] / rain_location['rain_today_x']
rain_location[['location','prob']].merge(combined, on='location')
combined = rain_location[['location','prob']].merge(combined, on='location')

In [13]:
combined['temp_diff'] = combined['min_temp'] - combined['temp3pm']
combined['humidity_diff'] = combined['humidity9am'] - combined['humidity3pm']

## Define Features and Target
Define categorical and numeric features manually because sometimes cols of type float/int should be categorical and vice versa

In [14]:
combined.columns

Index(['location', 'prob', 'id', 'date', 'min_temp', 'max_temp', 'rainfall',
       'evaporation', 'sunshine', 'wind_gust_dir', 'wind_gust_speed',
       'wind_dir9am', 'wind_dir3pm', 'wind_speed9am', 'wind_speed3pm',
       'humidity9am', 'humidity3pm', 'pressure9am', 'pressure3pm', 'cloud9am',
       'cloud3pm', 'temp9am', 'temp3pm', 'rain_today', 'rain_tomorrow',
       'label', 'year', 'day', 'month', 'dayofyear', 'temp_diff',
       'humidity_diff'],
      dtype='object')

In [121]:
## helper to build feature list
all_categorical_features = ['location','rain_today','evaporation','sunshine',
                            'wind_gust_dir','wind_dir9am','wind_dir3pm','month',
                            'year','day','dayofyear','rain_tomorrow']

all_numeric_features = ['id', 'min_temp', 'max_temp', 'rainfall',
       'wind_gust_speed','wind_speed9am', 'wind_speed3pm',
       'humidity9am', 'humidity3pm', 'pressure9am', 'pressure3pm', 'cloud9am',
       'cloud3pm', 'temp9am', 'temp3pm',
       'label','prob','temp_diff','humidity_diff']

ts_features = ['date']

to_drop = ['location', 'prob', 'id', 'date', 'min_temp', 'max_temp', 'rainfall',
       'evaporation', 'sunshine', 'wind_gust_dir',
       'wind_dir9am', 'wind_dir3pm', 'wind_speed9am', 'wind_speed3pm',
        'pressure9am', 'pressure3pm',
        'temp9am','rain_tomorrow',
       'label', 'year', 'day', 'month', 'dayofyear',
       ]

to_keep = ['humidity9am', 'humidity3pm', 'cloud9am', 'cloud3pm', 'wind_gust_speed','temp3pm', 'humidity_diff', 'temp_diff','rain_today']


In [122]:
features,categorical_features,numeric_features = feature_selection(to_drop,all_categorical_features,all_numeric_features)

In [94]:
combined[categorical_features] = combined[categorical_features].astype(str)
combined[numeric_features] = combined[numeric_features].astype(float)

In [95]:
X = combined[combined['label'] == 'train'].drop(to_drop, axis=1)
y = combined[combined['label'] == 'train'][TARGET]
test = combined[combined['label'] == 'test'].drop(to_drop, axis=1)

# Create Pipeline

In [108]:
def split(data, to_drop, submission=False):
    if submission == True:
        print("Submission")
        X_train = combined[combined['label'] == 'train'].drop(to_drop, axis=1)
        y_train = combined[combined['label'] == 'train'][TARGET]
        y_train['rain_tomorrow'] = y_train['rain_tomorrow'].astype(float)
        y_train['rain_tomorrow'] = y_train['rain_tomorrow'].astype(int)
        X_test = combined[combined['label'] == 'test'].drop(to_drop, axis=1)
        y_test = [0,0,0]
    else:
        X = combined[combined['label'] == 'train'].drop(to_drop, axis=1)
        y = combined[combined['label'] == 'train'][TARGET]
        y['rain_tomorrow'] = y['rain_tomorrow'].astype(float)
        y['rain_tomorrow'] = y['rain_tomorrow'].astype(int)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
    
    numeric_transformer = Pipeline(steps=[
                            ('imputer', SimpleImputer(strategy='mean')),
                            ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
                                ('imputer', SimpleImputer(strategy='constant', fill_value=9)),
                                ('onehot', OneHotEncoder(handle_unknown='ignore'))
                                #('ordinal', OrdinalEncoder(unknown_value=np.nan, handle_unknown='use_encoded_value'))
                                ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])


    classifier = XGBClassifier(
                    learning_rate = 0.1,
                     n_estimators= 1000,
                     max_depth= 4,
                     min_child_weight= 1,
                     #gamma=0.9,                        
                     #subsample=0.8,
                     #colsample_bytree=0.8,
                     #objective= 'binary:logistic',
                     #nthread= -1,
                     use_label_encoder=False,
                     #scale_pos_weight=1
                    )
    
    grid = GridSearchCV(classifier,
                param_grid={
                   'XGBClassifier__max_depth': [1,2,4,10],
                    'XGBClassifier__min_child_weight' : [1,5,10],
                    'XGBClassifier__n_estimators' : [100,500,1000,2000] ,
                     'XGBClassifier__learning_rate' : [.1,.01] ,
                            },
                 cv=2,
               refit=True)

    pipe = Pipeline(steps=[('preprocessor', preprocessor),       
                      ('classifier',classifier)])

    pipe.fit(
        X_train,
        y_train,
        classifier__eval_metric="logloss",
       )
    
    y_pred = pipe.predict(X_test)
    probs = pipe.predict_proba(X_test)
    
    if submission == False: 
        print(metrics.log_loss(y_test, y_pred))
        print(metrics.accuracy_score(y_test, y_pred))
    
    return y_pred

In [109]:
y_pred = split(combined,to_drop,submission=False)

5.550277166554575
0.8393039918116684


In [110]:
## BEST

In [111]:
y_pred = split(combined,to_drop,submission=False)

5.550277166554575
0.8393039918116684


# Submissions

In [112]:
y_pred = split(combined,to_drop,submission=True)

Submission


In [116]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [117]:
test = pd.read_csv(TEST_PATH)
test['rain_tomorrow'] = y_pred

In [118]:
submit = test[['id','rain_tomorrow']]

In [119]:
submit.to_csv('submission.csv', index=False)

In [120]:
! kaggle competitions submit -c sliced-s01e04-knyna9 -f submission.csv -m "no date, no feature engineer"

100%|█████████████████████████████████████████| 111k/111k [00:00<00:00, 127kB/s]
Successfully submitted to SLICED s01e04