In [1]:
import numpy as np
import pandas as pd

In [2]:
train_data = pd.read_csv('train.csv')
train_data.head()

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


In [3]:
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
0,2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
1,2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2,2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
3,2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
4,2194,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4


In [4]:
train_data = train_data.drop(columns=['id', 'day'])
test_data = test_data.drop(columns=['id', 'day'])

train_data['dewdiff'] = train_data['temparature'] - train_data['dewpoint']
test_data['dewdiff'] = test_data['temparature'] - test_data['dewpoint']

In [5]:
train_data.isna().sum()

pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    0
windspeed        0
rainfall         0
dewdiff          0
dtype: int64

In [6]:
rainfall = train_data['rainfall']
train_data = train_data.drop(columns='rainfall')

In [7]:
test_data.isna().sum()

pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    1
windspeed        0
dewdiff          0
dtype: int64

In [8]:
test_winddir_mode = test_data['winddirection'].mode()[0]
test_data['winddirection'] = test_data['winddirection'].fillna(test_winddir_mode)

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
model = RandomForestClassifier(n_estimators=1_000, max_depth=7)
model.fit(train_data, rainfall)
pred = model.predict_proba(test_data)
pred

array([[0.01876978, 0.98123022],
       [0.0170199 , 0.9829801 ],
       [0.1641155 , 0.8358845 ],
       ...,
       [0.04119581, 0.95880419],
       [0.01706696, 0.98293304],
       [0.09125945, 0.90874055]], shape=(730, 2))

In [11]:
sub = pd.read_csv('sample_submission.csv')
sub.head()

Unnamed: 0,id,rainfall
0,2190,0
1,2191,0
2,2192,0
3,2193,0
4,2194,0


In [12]:
sub['rainfall'] = list(map(lambda t: t[1], pred))

In [13]:
sub.to_csv('subs/randforestsub.csv', index=False)

In [14]:
winddirs = pd.get_dummies(train_data['winddirection'], prefix='_', prefix_sep='')
train_data = train_data.drop(columns='winddirection')
train_data = pd.concat([train_data, winddirs], axis=1)
train_data

Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,windspeed,dewdiff,...,_220.0,_230.0,_240.0,_250.0,_250.3,_260.0,_270.0,_280.0,_290.0,_300.0
0,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,17.2,1.2,...,False,False,False,False,False,False,False,False,False,False
1,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,21.9,1.5,...,False,False,False,False,False,False,False,False,False,False
2,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,18.1,6.8,...,False,False,False,False,False,False,False,False,False,False
3,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,35.6,1.0,...,False,False,False,False,False,False,False,False,False,False
4,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,24.8,8.8,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,1014.6,23.2,20.6,19.1,19.9,97.0,88.0,0.1,22.1,0.7,...,False,False,False,False,False,False,False,False,False,False
2186,1012.4,17.2,17.3,16.3,15.3,91.0,88.0,0.0,35.3,2.0,...,False,False,False,False,False,False,False,False,False,False
2187,1013.3,19.0,16.3,14.3,12.6,79.0,79.0,5.0,32.9,3.7,...,False,False,False,False,False,False,False,False,False,False
2188,1022.3,16.4,15.2,13.8,14.7,92.0,93.0,0.1,18.0,0.5,...,False,False,False,False,False,False,False,False,False,False


In [15]:
test_data = pd.concat([test_data, winddirs.iloc[:test_data.shape[0]]], axis=1)
test_data

Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,...,_220.0,_230.0,_240.0,_250.0,_250.3,_260.0,_270.0,_280.0,_290.0,_300.0
0,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3,...,False,False,False,False,False,False,False,False,False,False
1,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3,...,False,False,False,False,False,False,False,False,False,False
2,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9,...,False,False,False,False,False,False,False,False,False,False
3,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6,...,False,False,False,False,False,False,False,False,False,False
4,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,1020.8,18.2,17.6,16.1,13.7,96.0,95.0,0.0,20.0,34.3,...,False,False,False,False,False,False,False,False,False,False
726,1011.7,23.2,18.1,16.0,16.0,78.0,80.0,1.6,40.0,25.2,...,False,False,False,False,False,False,False,False,False,False
727,1022.7,21.0,18.5,17.0,15.5,92.0,96.0,0.0,50.0,21.9,...,False,False,False,False,False,False,False,False,False,False
728,1014.4,21.0,20.0,19.7,19.8,94.0,93.0,0.0,50.0,39.5,...,False,False,False,False,False,False,False,False,False,False


In [16]:
winddirs

Unnamed: 0,_10.0,_15.0,_20.0,_25.0,_30.0,_40.0,_50.0,_60.0,_65.0,_70.0,...,_220.0,_230.0,_240.0,_250.0,_250.3,_260.0,_270.0,_280.0,_290.0,_300.0
0,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2186,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2187,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2188,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [17]:
for i in range(test_data.shape[0]):
    test_data.loc[i, '_10.0':] = False
    winddir = test_data.loc[i, 'winddirection']
    test_data.loc[i, '_' + str(winddir)] = True
test_data = test_data.drop(columns='winddirection')

In [18]:
model = RandomForestClassifier(n_estimators=1_000, max_depth=10)
model.fit(train_data, rainfall)
pred = model.predict_proba(test_data)
sub['rainfall'] = list(map(lambda t: t[1], pred))
sub.to_csv('subs/randforestsub1.csv', index=False)

In [19]:
from xgboost import XGBClassifier

In [20]:
model = XGBClassifier(
    n_estimators=1_000,
    learning_rate=0.01,
    max_depth=7,
    objective='binary:logistic'
)
model.fit(train_data, rainfall)

In [21]:
pred = model.predict_proba(test_data)
pred

array([[0.00262225, 0.99737775],
       [0.00141716, 0.99858284],
       [0.01609635, 0.98390365],
       ...,
       [0.00429082, 0.9957092 ],
       [0.00775427, 0.99224573],
       [0.03500003, 0.965     ]], shape=(730, 2), dtype=float32)

In [22]:
sub['rainfall'] = list(map(lambda t: t[1], pred))
sub.to_csv('subs/xgbsub3.csv', index=False)

In [23]:
model = RandomForestClassifier(n_estimators=10_000, max_depth=10)
model.fit(train_data, rainfall)
pred = model.predict_proba(test_data)
pred

array([[0.01799101, 0.98200899],
       [0.02000488, 0.97999512],
       [0.12362472, 0.87637528],
       ...,
       [0.04370106, 0.95629894],
       [0.01995285, 0.98004715],
       [0.12072481, 0.87927519]], shape=(730, 2))

In [24]:
sub['rainfall'] = list(map(lambda t: t[1], pred))
sub.to_csv('subs/randforestsub2.csv', index=False)

In [25]:
from sklearn.svm import SVC

In [26]:
model = SVC(probability=True)
model.fit(train_data, rainfall)
pred = model.predict_proba(test_data)
pred

array([[0.01534652, 0.98465348],
       [0.01370727, 0.98629273],
       [0.03507051, 0.96492949],
       ...,
       [0.0229436 , 0.9770564 ],
       [0.02422863, 0.97577137],
       [0.06369414, 0.93630586]], shape=(730, 2))

In [27]:
sub['rainfall'] = list(map(lambda t: t[1], pred))
sub.to_csv('subs/svmsub.csv', index=False)

In [28]:
model = SVC(gamma='auto', probability=True)
model.fit(train_data, rainfall)
pred = model.predict_proba(test_data)
sub['rainfall'] = list(map(lambda t: t[1], pred))
sub.to_csv('subs/svmsub1.csv', index=False)

In [29]:
model = SVC(gamma=0.1, probability=True)
model.fit(train_data, rainfall)
pred = model.predict_proba(test_data)
sub['rainfall'] = list(map(lambda t: t[1], pred))
sub.to_csv('subs/svmsub2.csv', index=False)

In [30]:
model = SVC(gamma=0.69, probability=True)
model.fit(train_data, rainfall)
pred = model.predict_proba(test_data)
sub['rainfall'] = list(map(lambda t: t[1], pred))
sub.to_csv('subs/svmsub3.csv', index=False)