In [1]:
import pandas as pd
import numpy as np
import random

from IPython.core.interactiveshell import InteractiveShell  # print all outputs
InteractiveShell.ast_node_interactivity = "all"

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

# Classification methods
from sklearn.tree import DecisionTreeClassifier

# Evaluation metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Discretization 
from sklearn.preprocessing import KBinsDiscretizer

%matplotlib inline

In [2]:
# Single seed shared by every stochastic step in the notebook.
SEED = 1234

# Seed the stdlib RNG and NumPy's global RNG: scikit-learn objects created
# without an explicit random_state draw from NumPy's global RNG, so seeding
# only `random` would leave those steps non-reproducible.
random.seed(SEED)
np.random.seed(SEED)

#### Loading data after Preprocessing 

In [19]:
# Read the preprocessed dataset from disk into the working frame.
processed_data_path = "data/processed_data.csv"
df_train = pd.read_csv(processed_data_path)

In [20]:
# Preview the loaded frame (the rendered output is truncated to the first and last rows/columns).
df_train

Unnamed: 0,SITE_ID,DATE,N_TRANSPORTED_SITES,GEOGRAPHIC_CLUSTER,aircon_sum_wo_prev7d,aircon_sum_wo_prev14d,aircon_sum_target_next14d,mean_temperature_prev7d,max_temperature_prev7d,min_temperature_prev7d,...,skew_fire/smoke_alarms_prev14d,skew_ge_alarms_prev14d,skew_power_alarms_prev14d,skew_temperature_alarms_prev14d,kurt_equipment_alarms_prev14d,kurt_fire/smoke_alarms_prev14d,kurt_ge_alarms_prev14d,kurt_power_alarms_prev14d,kurt_temperature_alarms_prev14d,CELL_TYPE
0,146,2019-04-10,3.0,8,0.0,0.0,0,10.29,14.0,6.0,...,0.0,0.0,0.0,0.0,-1.212308,-1.212308,-1.212308,-1.212308,-1.212308,0
1,146,2019-04-11,3.0,8,0.0,0.0,0,11.71,16.0,9.0,...,0.0,0.0,0.0,0.0,-1.212308,-1.212308,-1.212308,-1.212308,-1.212308,0
2,146,2019-04-12,3.0,8,0.0,0.0,0,11.57,16.0,9.0,...,0.0,0.0,0.0,0.0,-1.212308,-1.212308,-1.212308,-1.212308,-1.212308,0
3,146,2019-04-13,3.0,8,0.0,0.0,0,11.29,16.0,8.0,...,0.0,0.0,0.0,0.0,-1.212308,-1.212308,-1.212308,-1.212308,-1.212308,0
4,146,2019-04-14,3.0,8,0.0,0.0,0,10.57,16.0,5.0,...,0.0,0.0,0.0,0.0,-1.212308,-1.212308,-1.212308,-1.212308,-1.212308,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
615870,1251,2020-01-30,7.0,9,0.0,0.0,0,4.00,7.0,1.0,...,0.0,0.0,0.0,0.0,-1.212308,-1.212308,-1.212308,-1.210000,-1.212308,0
615871,1251,2020-01-31,7.0,9,0.0,0.0,0,4.29,7.0,1.0,...,0.0,0.0,0.0,0.0,-1.212308,-1.212308,-1.212308,-1.210000,-1.212308,0
615872,1251,2020-02-01,7.0,9,0.0,0.0,0,4.71,7.0,3.0,...,0.0,0.0,0.0,0.0,-1.212308,-1.212308,-1.212308,-1.210000,-1.212308,0
615873,1251,2020-02-02,7.0,9,0.0,0.0,0,5.57,9.0,3.0,...,0.0,0.0,0.0,0.0,-1.212308,-1.212308,-1.212308,-1.210000,-1.212308,0


In [21]:
# Split the frame into the prediction target (y) and the model inputs (X).
target_variable = 'aircon_sum_target_next14d'

# Every column is an input except the target itself and the DATE / SITE_ID columns.
excluded_columns = [target_variable, 'DATE', 'SITE_ID']
input_variables = df_train.columns[~df_train.columns.isin(excluded_columns)]

y = df_train[target_variable]
X = df_train[input_variables]

In [22]:
# 10-fold stratified CV with shuffling. random_state is fixed so that every
# cross_validate() call that receives this splitter is scored on the same folds;
# without it each call reshuffles differently and the discretization strategies
# are not compared like-for-like (nor are the results reproducible across runs).
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

In [23]:
# Number of bins; passed as n_bins to every KBinsDiscretizer compared in this notebook.
no_intervals = 5

Only the weather attributes are discretized; every other feature keeps its original values.

In [24]:
# weather_to_discrete lists the names of all weather features to be discretized.
# Names follow <operator>_<weather>_prev<D>d for past windows and
# <operator>_<weather>_f_next<D>d for future windows.
weathers = ['temperature', 'rain_mm', 'humidity', 'wind_speed', 'pressure']
operators = ['mean', 'max', 'min']

# Past windows: 7 and 3 days
past_weather_features = [
    '{}_{}_prev{}d'.format(operator, weather, days)
    for weather in weathers
    for operator in operators
    for days in ('7', '3')
]

# Future windows: 7 and 14 days
future_weather_features = [
    '{}_{}_f_next{}d'.format(operator, weather, days)
    for weather in weathers
    for operator in operators
    for days in ('7', '14')
]

weather_to_discrete = past_weather_features + future_weather_features
            

In [25]:
# Candidate discretizers: one ordinal-encoded KBinsDiscretizer per binning
# strategy, all sharing the same number of bins (no_intervals).
binning_strategies = [
    ('Ordinal-Equal-Width', 'uniform'),
    ('Ordinal-Equal-Frequency', 'quantile'),
    ('Ordinal-Equal-kMeans', 'kmeans'),
]

discretizations = {
    label: KBinsDiscretizer(n_bins=no_intervals, encode='ordinal', strategy=strategy)
    for label, strategy in binning_strategies
}

### Check the best discretization type

In [26]:
def discretized_to_df(Xd, df_to_discretize, original_df):
    """Return a copy of ``original_df`` with its discretized columns swapped in.

    Parameters
    ----------
    Xd : array-like
        Discretized values, one column per column of ``df_to_discretize``
        (e.g. the output of a discretizer's ``fit_transform``).
    df_to_discretize : pandas.DataFrame
        The sub-frame that was discretized; supplies the index and column
        labels used to wrap ``Xd``.
    original_df : pandas.DataFrame
        The frame holding all features. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``original_df`` in which every column that also appears in
        ``df_to_discretize`` holds the discretized values. Columns of
        ``df_to_discretize`` that are absent from ``original_df`` are ignored.
    """
    # Re-attach the original labels so the values align on index when assigned.
    binned = pd.DataFrame(
        Xd,
        index=df_to_discretize.index,
        columns=df_to_discretize.columns,
    )

    result = original_df.copy()
    shared_columns = [name for name in binned.columns if name in result.columns]
    for name in shared_columns:
        result[name] = binned[name]
    return result

In [None]:
def _summarize_cv_scores(cv_scores):
    """Return (recall mean, recall std, F1 mean, F1 std) from a cross_validate result."""
    recall = cv_scores['test_recall']
    f1 = cv_scores['test_f1']
    return (np.average(recall), np.std(recall), np.average(f1), np.std(f1))


# Maps each configuration name to its (recall mean, recall std, F1 mean, F1 std).
performance = {}

# Baseline: decision tree on the raw (non-discretized) features.
p = cross_validate(DecisionTreeClassifier(random_state = SEED),X,y,cv=cv, scoring=['recall','f1'])
performance['No Discretization'] = _summarize_cv_scores(p)

# The sub-frame of weather features is the same for every strategy, so slice it once.
X_to_discretize = X[weather_to_discrete]

# NOTE(review): each discretizer is fit on all rows before the folds are split,
# so the bin edges are computed with the validation rows included. If an
# unbiased estimate is required, fit the discretizer inside each fold instead.
for discretization_name, discretizer in discretizations.items():
    Xd = discretizer.fit_transform(X_to_discretize)   # np-array with the discretized values
    df = discretized_to_df(Xd, X_to_discretize, X)    # all features, weather columns replaced by their bins

    p = cross_validate(DecisionTreeClassifier(random_state = SEED),df,y,cv=cv, scoring=['recall','f1'])
    performance[discretization_name] = _summarize_cv_scores(p)

In [11]:
# Report the cross-validation scores of every configuration, one line each.
report_format = "%12s\tRecall_Mean=%3.2f Recall_Std=%3.2f   F1_Mean=%3.2f F1_Std=%3.2f"
for method, (recall_mean, recall_std, f1_mean, f1_std) in performance.items():
    print(report_format % (method, recall_mean, recall_std, f1_mean, f1_std))

No Discretization	Recall_Mean=0.60 Recall_Std=0.03   F1_Mean=0.63 F1_Std=0.02
Ordinal-Equal-Width	Recall_Mean=0.66 Recall_Std=0.03   F1_Mean=0.67 F1_Std=0.02
Ordinal-Equal-Frequency	Recall_Mean=0.65 Recall_Std=0.02   F1_Mean=0.66 F1_Std=0.02
Ordinal-Equal-kMeans	Recall_Mean=0.67 Recall_Std=0.02   F1_Mean=0.67 F1_Std=0.02




|Discretization type | Recall_Mean | Recall_Std | F1_Mean | F1_Std
|:---|---|---|---|---|
|No Discretization | 0.60  | 0.02 | 0.63 | 0.02
|Ordinal-Equal-Width | 0.66  | 0.03  | 0.67 | 0.02
|Ordinal-Equal-Frequency | 0.65  | 0.02 | 0.66 | 0.01
|Ordinal-Equal-kMeans | 0.64  | 0.03 | 0.65 | 0.02



#### Ordinal-Equal-Width has the best recall and F1 in the table above ---> let's create the discretized dataframe

In [29]:
# Apply the selected strategy (equal-width bins) to the weather features and
# write the binned columns back into a copy of the full training frame.
selected_discretizer = discretizations['Ordinal-Equal-Width']

X_to_discretize = X[weather_to_discrete]               # sub-frame with only the features to discretize
Xd = selected_discretizer.fit_transform(X_to_discretize)  # np-array with the discretized values
final_df = discretized_to_df(Xd, X_to_discretize, df_train)

### Save the dataframe

In [31]:
# Write the discretized frame to disk, leaving out the row index.
discrete_data_path = 'data/processed_discrete_data.csv'
final_df.to_csv(discrete_data_path, index=False)