In [31]:
import pickle

import numpy as np
import pandas as pd

from sklearn import set_config
from sklearn.model_selection import train_test_split,StratifiedKFold,KFold,cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

In [32]:
# Load the raw observations; the path is relative to the notebook's working directory.
DATA_PATH = "weatherAUS.csv"
df = pd.read_csv(DATA_PATH)


In [33]:
# RainTomorrow is the prediction target; it and Date are excluded from the features.
TARGET = "RainTomorrow"
Y = df[TARGET]
X = df.drop(columns=[TARGET, "Date"])

In [34]:
# Hold out 20% of the rows with a fixed seed, then unwrap each piece to its
# underlying array. Column names are lost here, which is why the preprocessing
# step addresses columns by integer position.
X_train, X_test, y_train, y_test = (
    part.values
    for part in train_test_split(X, Y, test_size=0.2, random_state=0)
)

In [35]:
# Partition the feature columns into numeric ("counting") and categorical groups,
# recording both the column names and their integer positions within X.
# Positions are needed because the train/test splits are plain arrays without
# column labels.
categorical_cols = []
categorical_inds = []
counting_cols = []
counting_inds = []
for position, (column, dtype) in enumerate(X.dtypes.items()):
    # Any real numeric dtype (float32, int64, ...) is scaled, not just float64.
    # Booleans count as "numeric" in pandas, so they are excluded explicitly and
    # stay on the categorical path, as before.
    is_numeric = (
        pd.api.types.is_numeric_dtype(dtype)
        and not pd.api.types.is_bool_dtype(dtype)
    )
    if is_numeric:
        counting_cols.append(column)
        counting_inds.append(position)
    else:
        categorical_cols.append(column)
        categorical_inds.append(position)
print(categorical_cols)
print(counting_cols)

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']


In [36]:
# Numeric columns: fill missing values with the column median, then apply RobustScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group (addressed by integer position) to its transformer;
# any column not listed in either group is passed through unchanged.
column_routes = [
    ('num', numerical_transformer, counting_inds),
    ('cat', categorical_transformer, categorical_inds),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='passthrough')

In [37]:
# Encode the target labels as integers. The encoder is fitted on the training
# labels only and then reused for the test labels.
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)


In [38]:
# Hyper-parameters forwarded verbatim to the XGBoost classifier.
params = dict(
    objective='binary:logistic',
    max_depth=16,
    alpha=10,
    learning_rate=0.5,
    n_estimators=500,
)
classifier = XGBClassifier(**params)


In [39]:
# End-to-end model: column preprocessing followed by the classifier.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', classifier),
]
pipe = Pipeline(steps=pipeline_steps)

In [40]:
# Ask scikit-learn to render estimators as diagrams, then display the pipeline.
# (set_config is imported in the notebook's top import cell.)
set_config(display='diagram')
pipe

In [41]:
# Fit the whole pipeline on the training split and predict the held-out rows.
# fit() returns the pipeline itself, so the two calls can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)


In [44]:
# Persist the fitted pipeline to disk. The context manager guarantees the file
# is flushed and closed even if pickling raises.
# (pickle is imported in the notebook's top import cell.)
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [45]:
# Reload the pipeline from disk, closing the file handle once it has been read.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('pipe.pkl', 'rb') as model_file:
    pipe = pickle.load(model_file)

In [46]:
# Inspect the raw feature values of one example row (position 103).
sample_row = X.iloc[103]
sample_row.values

array(['Albury', 16.4, 27.0, 3.0, nan, nan, 'SW', 69.0, 'E', 'N', 9.0,
       22.0, 82.0, 74.0, 1012.7, 1008.0, nan, 4.0, 19.9, 19.7, 'Yes'],
      dtype=object)

In [47]:
# Actual label for the same example row (position 103).
Y.iat[103]

'Yes'

In [48]:
# Predict for the single example row. reshape(1, -1) builds the one-row 2-D input
# and infers the feature count, so nothing breaks if columns are added or removed.
pipe.predict(X.iloc[103, :].values.reshape(1, -1))

array([1], dtype=int64)