In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from xgboost import XGBClassifier

In [3]:
# Read the training table and discard its `id` column in one chained step,
# then preview the first rows.
train_X = pd.read_csv('train.csv').drop(columns='id')
train_X.head()

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


In [4]:
# Read the test table and discard its `id` column in one chained step,
# then preview the first rows.
test_X = pd.read_csv('test.csv').drop(columns='id')
test_X.head()

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
0,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
1,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
3,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
4,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4


In [5]:
# Re-express `pressure` as an offset from a fixed reference value, applying
# the same shift to the training and test frames.
# NOTE(review): 1013.25 is presumably standard sea-level pressure in hPa — confirm units.
normal_pressure = 1013.25
for frame in (train_X, test_X):
    frame['pressure'] = frame['pressure'] - normal_pressure

In [6]:
# Replace the misspelled `temparature` header with the shorter `temp`
# in both frames, using one shared mapping.
column_fixes = {'temparature': 'temp'}
train_X = train_X.rename(columns=column_fixes)
test_X = test_X.rename(columns=column_fixes)

In [7]:
# Add `dew_excess`, the difference `temp - dewpoint`, to both frames.
for frame in (train_X, test_X):
    frame['dew_excess'] = frame['temp'].sub(frame['dewpoint'])

In [8]:
# Tally how many training rows fall into each `rainfall` class.
class_counts = train_X['rainfall'].value_counts()
class_counts

rainfall
1    1650
0     540
Name: count, dtype: int64

In [9]:
# Oversample the minority class: rows with rainfall == 0 are appended two
# extra times, so each such row ends up appearing three times in total.
# `nulls` holds the rainfall == 0 rows (it has nothing to do with missing values).
# Caution: re-running this cell compounds the oversampling, because it
# rebinds `train_X` to its own enlarged result.
nulls = train_X[train_X['rainfall'] == 0]
# ignore_index=True rebuilds a unique 0..n-1 index; without it the appended
# copies keep their original labels and the frame carries duplicated index
# labels, which makes later label-based access ambiguous.
train_X = pd.concat([train_X] + 2 * [nulls], ignore_index=True)

In [10]:
# Preview the first ten rows of the training frame in its current state.
train_X.head(n=10)

Unnamed: 0,day,pressure,maxtemp,temp,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,dew_excess
0,1,4.15,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1,1.2
1,2,6.25,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1,1.5
2,3,10.85,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1,6.8
3,4,0.15,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1,1.0
4,5,8.55,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0,8.8
5,6,9.45,20.6,18.6,16.5,12.5,79.0,81.0,0.0,20.0,15.7,1,6.1
6,7,9.55,19.5,18.4,15.3,11.3,56.0,46.0,7.6,20.0,28.4,0,7.1
7,8,6.45,15.8,13.6,12.7,11.8,96.0,100.0,0.0,50.0,52.8,1,1.8
8,9,4.15,17.6,16.5,15.6,12.5,86.0,100.0,0.0,50.0,37.5,1,4.0
9,10,12.15,16.5,14.4,12.0,8.6,77.0,84.0,1.0,50.0,38.3,0,5.8


In [11]:
# Persist both frames as CSV (without the index). At this point the training
# frame still contains the `rainfall` column.
preprocessed_dir = Path('preprocessed')
# to_csv raises OSError when the target folder does not exist, so create it
# up front; exist_ok keeps the cell safe to re-run.
preprocessed_dir.mkdir(parents=True, exist_ok=True)
train_X.to_csv(preprocessed_dir / 'train1.csv', index=False)
test_X.to_csv(preprocessed_dir / 'test1.csv', index=False)

In [12]:
# Detach the target: pop() removes `rainfall` from the feature frame in place
# and returns it as a Series.
train_y = train_X.pop('rainfall')

In [13]:
# Hyper-parameters for the first XGBoost run, gathered in one mapping and
# unpacked into the classifier constructor.
xgb_params = {
    'learning_rate': 0.01,
    'max_depth': 7,
    'objective': 'binary:logistic',
}
model = XGBClassifier(**xgb_params)

In [14]:
# Train the classifier on the prepared features and their labels.
model.fit(X=train_X, y=train_y)

In [15]:
# predict_proba returns one column per class; with 0/1 labels, column 1 is the
# predicted probability of rainfall == 1. Slice that column directly instead
# of mapping a Python lambda over every row.
pred = model.predict_proba(test_X)[:, 1]

In [16]:
# Load the sample submission file to use as the template for predictions.
sample_path = 'sample_submission.csv'
sub = pd.read_csv(sample_path)

In [17]:
# Put the predicted probabilities into the template's `rainfall` column and
# write the first submission file (index omitted).
sub['rainfall'] = pred
subs_dir = Path('subs')
# to_csv raises OSError when the target folder does not exist, so create it
# up front; exist_ok keeps the cell safe to re-run.
subs_dir.mkdir(parents=True, exist_ok=True)
sub.to_csv(subs_dir / 'xgbsub1.csv', index=False)

In [18]:
# Second run: same pipeline as the first model, with a higher learning rate
# and deeper trees. Rebinds `model` and `pred` and overwrites sub['rainfall'].
model = XGBClassifier(
    learning_rate=0.02,
    max_depth=13,
    objective='binary:logistic'
)
model.fit(train_X, train_y)
# Column 1 of predict_proba is the probability of rainfall == 1; slice it
# directly instead of mapping a Python lambda over every row.
pred = model.predict_proba(test_X)[:, 1]
sub['rainfall'] = pred
subs_dir = Path('subs')
# to_csv raises OSError when the target folder does not exist, so create it
# up front; exist_ok keeps the cell safe to re-run.
subs_dir.mkdir(parents=True, exist_ok=True)
sub.to_csv(subs_dir / 'xgbsub2.csv', index=False)