In [184]:
import numpy as np
import pandas as pd
import matplotlib 
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing, tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, roc_curve, auc
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

#from preprocessing import preprocess
# Default resolution (dots per inch) for matplotlib figures created after this point.
plt.rcParams["figure.dpi"] = 150

In [185]:
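# One-off download of the two input spreadsheets as CSV, kept commented out for reference.
# It writes "features.csv" and "target.csv", the same file names the next cell reads from disk.
# from collections import Counter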
# import requests

# with requests.get(
#     "https://docs.google.com/spreadsheets/d/1wduqo5WyYmCpaGnE81sLNGU0VSodIekMfpmEwU0fGqs/export?format=csv") as r, open("features.csv", "wb") as f:
#     for chunk in r.iter_content():
#         f.write(chunk)

# with requests.get(
#     "https://docs.google.com/spreadsheets/d/1gvZ03uAL6THwd04Y98GtIj6SeAHiKyQY5UisuuyFSUs/export?format=csv") as r, open("target.csv", "wb") as f:
#     for chunk in r.iter_content():
#         f.write(chunk)

In [186]:
# Read the feature table and the target table, then join them on the shared `id` column.
df_features = pd.read_csv("features.csv")
df_target = pd.read_csv("target.csv")
df = pd.merge(df_features, df_target, on='id')

# Cell output: how often each early-wind-direction category occurs.
df['direccion_viento_temprano'].value_counts()

  exec(code_obj, self.user_global_ns, self.user_ns)


Nornoreste       12784
Norte             9453
Sureste           7437
Este              7391
Sursureste        7244
Noroeste          7003
Sur               6948
suroeste          6785
Oeste             6742
Estenoreste       6244
Estesureste       6152
Noreste           6136
Sursuroeste       6073
Oestenoroeste     5964
Oestesuroeste     5609
Name: direccion_viento_temprano, dtype: int64

In [187]:
def basic_preprocessing(df: pd.DataFrame):
    """Apply the cleaning steps shared by every preprocessing variant.

    The work is done on a copy, so the frame passed in is left untouched and
    the function can be called repeatedly on the same raw data.

    Steps, in order:
      * coerce ``presion_atmosferica_tarde`` to numeric (unparseable values become NaN);
      * parse ``dia`` as a datetime;
      * drop rows where either rain label is missing;
      * drop ``id``, ``barrio`` and ``mm_evaporados_agua``;
      * add ``mes`` (month of ``dia``) and drop ``dia``;
      * keep only rows in which fewer than 40% of the values are missing;
      * replace both rain-label columns with integer codes assigned over their
        sorted unique values (0 for the smallest label, 1 for the next, ...).

    Args:
        df: merged features/target frame containing the columns named above.

    Returns:
        A new, cleaned ``DataFrame``; ``df`` itself is not modified.
    """
    today_col = 'llovieron_hamburguesas_hoy'
    target_col = 'llovieron_hamburguesas_al_dia_siguiente'

    # Copy first: the column assignments below must not leak into the caller's frame.
    cleaned = df.copy()
    cleaned['presion_atmosferica_tarde'] = pd.to_numeric(
        cleaned['presion_atmosferica_tarde'], errors='coerce'
    )
    cleaned['dia'] = pd.to_datetime(cleaned['dia'])

    # Rows without either label cannot be used for training or evaluation.
    cleaned = cleaned.dropna(subset=[target_col, today_col])
    cleaned = cleaned.drop(columns=['id', 'barrio', 'mm_evaporados_agua'])

    # Only the month of the date is kept as a feature.
    cleaned = cleaned.assign(mes=cleaned['dia'].dt.month).drop(columns=['dia'])

    # Discard rows where 40% or more of the remaining columns are missing.
    # The explicit copy keeps the assignments below from touching a slice.
    cleaned = cleaned[cleaned.isnull().mean(axis=1) < 0.4].copy()

    # sort=True gives codes over the sorted unique labels, which is the same
    # mapping a freshly fitted LabelEncoder would produce for each column.
    for label_col in (today_col, target_col):
        codes, _ = pd.factorize(cleaned[label_col], sort=True)
        cleaned[label_col] = codes

    return cleaned

In [188]:
def preprocessing1(df: pd.DataFrame):
    """Preprocessing variant 1: keep every feature and one-hot encode wind directions.

    Runs ``basic_preprocessing``, expands the three wind-direction columns into
    dummy columns (first level dropped, NaN given its own column), then imputes
    the numeric columns. Imputation statistics are computed on the frame passed
    in: the column mean everywhere except ``mm_lluvia_dia``, which uses the
    median. ``nubosidad_tarde`` is cast to ``int`` after filling.

    Args:
        df: merged features/target frame.

    Returns:
        The preprocessed ``DataFrame``.
    """
    wind_direction_cols = [
        'direccion_viento_temprano',
        'rafaga_viento_max_direccion',
        'direccion_viento_tarde',
    ]
    mean_imputed_cols = [
        'horas_de_sol',
        'nubosidad_tarde',
        'nubosidad_temprano',
        'presion_atmosferica_temprano',
        'presion_atmosferica_tarde',
        'rafaga_viento_max_velocidad',
        'humedad_tarde',
        'temperatura_tarde',
        'velocidad_viendo_tarde',
        'humedad_temprano',
        'velocidad_viendo_temprano',
        'temperatura_temprano',
        'temp_min',
        'temp_max',
    ]

    frame = pd.get_dummies(
        basic_preprocessing(df),
        columns=wind_direction_cols,
        drop_first=True,
        dummy_na=True,
    )

    for col in mean_imputed_cols:
        frame[col] = frame[col].fillna(frame[col].mean())

    # Afternoon cloudiness is stored as an integer in this variant (the cast truncates).
    frame['nubosidad_tarde'] = frame['nubosidad_tarde'].astype('int')

    # Rainfall is the one column imputed with the median instead of the mean.
    frame['mm_lluvia_dia'] = frame['mm_lluvia_dia'].fillna(frame['mm_lluvia_dia'].median())

    return frame


In [189]:
def preprocessing2(df: pd.DataFrame):
    """Preprocessing variant 2: numeric features only.

    Runs ``basic_preprocessing``, then drops the three wind-direction columns,
    ``mes`` and ``llovieron_hamburguesas_hoy``. The remaining numeric columns are
    imputed with statistics computed on the frame passed in: the column mean
    everywhere except ``mm_lluvia_dia``, which uses the median.

    Args:
        df: merged features/target frame.

    Returns:
        The preprocessed ``DataFrame``.
    """
    dropped_cols = [
        'direccion_viento_temprano',
        'rafaga_viento_max_direccion',
        'direccion_viento_tarde',
        'mes',
        'llovieron_hamburguesas_hoy',
    ]
    mean_imputed_cols = [
        'horas_de_sol',
        'nubosidad_tarde',
        'nubosidad_temprano',
        'presion_atmosferica_temprano',
        'presion_atmosferica_tarde',
        'rafaga_viento_max_velocidad',
        'humedad_tarde',
        'temperatura_tarde',
        'velocidad_viendo_tarde',
        'humedad_temprano',
        'velocidad_viendo_temprano',
        'temperatura_temprano',
        'temp_min',
        'temp_max',
    ]

    frame = basic_preprocessing(df).drop(columns=dropped_cols)

    for col in mean_imputed_cols:
        frame[col] = frame[col].fillna(frame[col].mean())

    # Rainfall is the one column imputed with the median instead of the mean.
    frame['mm_lluvia_dia'] = frame['mm_lluvia_dia'].fillna(frame['mm_lluvia_dia'].median())

    return frame

In [190]:
def preprocessing3(df: pd.DataFrame):
    """Preprocessing variant 3: one-hot wind directions, no sunshine/cloudiness features.

    Runs ``basic_preprocessing``, expands the three wind-direction columns into
    dummy columns (first level dropped, NaN given its own column) and drops
    ``horas_de_sol``, ``nubosidad_tarde`` and ``nubosidad_temprano``. The
    remaining numeric columns are imputed with statistics computed on the frame
    passed in: the column mean everywhere except ``mm_lluvia_dia``, which uses
    the median.

    Args:
        df: merged features/target frame.

    Returns:
        The preprocessed ``DataFrame``.
    """
    wind_direction_cols = [
        'direccion_viento_temprano',
        'rafaga_viento_max_direccion',
        'direccion_viento_tarde',
    ]
    mean_imputed_cols = [
        'presion_atmosferica_temprano',
        'presion_atmosferica_tarde',
        'rafaga_viento_max_velocidad',
        'humedad_tarde',
        'temperatura_tarde',
        'velocidad_viendo_tarde',
        'humedad_temprano',
        'velocidad_viendo_temprano',
        'temperatura_temprano',
        'temp_min',
        'temp_max',
    ]

    frame = pd.get_dummies(
        basic_preprocessing(df),
        columns=wind_direction_cols,
        drop_first=True,
        dummy_na=True,
    )
    frame = frame.drop(columns=['horas_de_sol', 'nubosidad_tarde', 'nubosidad_temprano'])

    for col in mean_imputed_cols:
        frame[col] = frame[col].fillna(frame[col].mean())

    # Rainfall is the one column imputed with the median instead of the mean.
    frame['mm_lluvia_dia'] = frame['mm_lluvia_dia'].fillna(frame['mm_lluvia_dia'].median())

    return frame

In [191]:
def preprocessing4(df: pd.DataFrame):
    """Preprocessing variant 4: a reduced set of mostly afternoon features.

    Runs ``basic_preprocessing`` and drops the wind-direction columns, ``mes``,
    the sunshine/cloudiness columns, the early-day temperature, pressure and
    wind-speed columns, and ``temp_min`` / ``temp_max``. The remaining numeric
    columns are imputed with statistics computed on the frame passed in: the
    column mean everywhere except ``mm_lluvia_dia``, which uses the median.

    Args:
        df: merged features/target frame.

    Returns:
        The preprocessed ``DataFrame``.
    """
    dropped_cols = [
        'direccion_viento_temprano',
        'rafaga_viento_max_direccion',
        'direccion_viento_tarde',
        'mes',
        'horas_de_sol',
        'nubosidad_tarde',
        'nubosidad_temprano',
        'temperatura_temprano',
        'presion_atmosferica_temprano',
        'velocidad_viendo_temprano',
        'temp_min',
        'temp_max',
    ]
    mean_imputed_cols = [
        'presion_atmosferica_tarde',
        'rafaga_viento_max_velocidad',
        'humedad_tarde',
        'temperatura_tarde',
        'velocidad_viendo_tarde',
        'humedad_temprano',
    ]

    frame = basic_preprocessing(df).drop(columns=dropped_cols)

    for col in mean_imputed_cols:
        frame[col] = frame[col].fillna(frame[col].mean())

    # Rainfall is the one column imputed with the median instead of the mean.
    frame['mm_lluvia_dia'] = frame['mm_lluvia_dia'].fillna(frame['mm_lluvia_dia'].median())

    return frame

In [192]:
def split(df):
    """Separate the target column and split into train and test sets.

    The target is ``llovieron_hamburguesas_al_dia_siguiente``. 10% of the rows
    go to the test set, the split is stratified on the target and seeded with
    ``random_state=117``.

    Args:
        df: preprocessed frame that still contains the target column.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    target_col = 'llovieron_hamburguesas_al_dia_siguiente'
    labels = df[target_col]
    features = df.drop(columns=[target_col])

    parts = train_test_split(
        features, labels, test_size=0.1, stratify=labels, random_state=117
    )
    X_train, X_test, y_train, y_test = parts
    return X_train, X_test, y_train, y_test




In [193]:
def plot_roc(_fpr, _tpr, x):
    """Draw a ROC curve with its AUC in the legend and show the figure.

    Args:
        _fpr: false-positive rates, used as the horizontal coordinates.
        _tpr: true-positive rates, used as the vertical coordinates of the curve.
        x: values scattered against ``_fpr`` on the same axes (the notebook
            passes the thresholds returned by ``roc_curve``).
    """
    area = auc(_fpr, _tpr)

    _, ax = plt.subplots(figsize=(15, 10))
    ax.plot(_fpr, _tpr, color='darkorange', lw=2, label=f'ROC curve (area = {area:.2f})')
    ax.scatter(_fpr, x)
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic')
    ax.legend(loc="lower right")
    plt.show()



In [194]:
# Randomized hyper-parameter search for a decision tree on the `preprocessing1` features.
df1 = preprocessing1(df)
X_train, X_test, y_train, y_test = split(df1)
params = {
    'max_depth': np.arange(1, 15),
    'min_samples_leaf': np.arange(1, 300),
    "criterion": ["gini", "entropy"],
}

clf = tree.DecisionTreeClassifier(random_state=117)

# Seed the candidate sampler too: without `random_state` every run draws a
# different set of 180 parameter combinations, so the reported best score and
# best params could not be reproduced.
rgscv = RandomizedSearchCV(
    clf,
    params,
    n_iter=180,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    return_train_score=True,
    random_state=117,
).fit(X_train, y_train)

print(f"Best score: {rgscv.best_score_}")
print(f"Best params {rgscv.best_params_}")

KeyboardInterrupt: 

In [None]:
# Randomized hyper-parameter search for a decision tree on the `preprocessing2` features.
df2 = preprocessing2(df)
X_train, X_test, y_train, y_test = split(df2)
params = {
    'max_depth': np.arange(1, 15),
    'min_samples_leaf': np.arange(1, 300),
    "criterion": ["gini", "entropy"],
}

clf = tree.DecisionTreeClassifier(random_state=117)

# Seed the candidate sampler too: without `random_state` every run draws a
# different set of 180 parameter combinations, so the reported best score and
# best params could not be reproduced.
rgscv = RandomizedSearchCV(
    clf,
    params,
    n_iter=180,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    return_train_score=True,
    random_state=117,
).fit(X_train, y_train)

print(f"Best score: {rgscv.best_score_}")
print(f"Best params {rgscv.best_params_}")

In [None]:
# Randomized hyper-parameter search for a decision tree on the `preprocessing3` features.
df3 = preprocessing3(df)
X_train, X_test, y_train, y_test = split(df3)
params = {
    'max_depth': np.arange(1, 15),
    'min_samples_leaf': np.arange(1, 300),
    "criterion": ["gini", "entropy"],
}

clf = tree.DecisionTreeClassifier(random_state=117)

# Seed the candidate sampler too: without `random_state` every run draws a
# different set of 180 parameter combinations, so the reported best score and
# best params could not be reproduced.
rgscv = RandomizedSearchCV(
    clf,
    params,
    n_iter=180,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    return_train_score=True,
    random_state=117,
).fit(X_train, y_train)

print(f"Best score: {rgscv.best_score_}")
print(f"Best params {rgscv.best_params_}")

In [None]:
# Randomized hyper-parameter search for a decision tree on the `preprocessing4` features.
df4 = preprocessing4(df)
X_train, X_test, y_train, y_test = split(df4)
params = {
    'max_depth': np.arange(1, 15),
    'min_samples_leaf': np.arange(1, 300),
    "criterion": ["gini", "entropy"],
}

clf = tree.DecisionTreeClassifier(random_state=117)

# Seed the candidate sampler too: without `random_state` every run draws a
# different set of 180 parameter combinations, so the reported best score and
# best params could not be reproduced.
rgscv = RandomizedSearchCV(
    clf,
    params,
    n_iter=180,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    return_train_score=True,
    random_state=117,
).fit(X_train, y_train)

print(f"Best score: {rgscv.best_score_}")
print(f"Best params {rgscv.best_params_}")

In [None]:
# Fit a single tree on the `preprocessing1` features and score it on the held-out split.
# NOTE(review): max_depth=11 / min_samples_leaf=143 are presumably the best
# parameters found by the search above; confirm against a completed search run.
df1 = preprocessing1(df)
X_train, X_test, y_train, y_test = split(df1)

clf = tree.DecisionTreeClassifier(max_depth=11, min_samples_leaf=143, random_state=117)
clf.fit(X_train, y_train)

# Hard class predictions feed accuracy/precision/recall; ROC AUC uses the
# probability of the positive class (column 1 of predict_proba).
y_pred = clf.predict(X_test)
positive_proba = clf.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, positive_proba)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Auc Roc: {roc_auc}")


In [None]:
# ROC curve of the fitted tree on the held-out split, from its positive-class probabilities.
test_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
plot_roc(fpr, tpr, thresholds)

In [None]:
import dtreeviz.trees as dtreeviz
import graphviz
import ipywidgets as widgets
from IPython.display import SVG, display
from ipywidgets import Button, IntSlider, interactive

# Export the fitted tree to Graphviz DOT text and render it inline as SVG.
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument as a list and reject a pandas Index.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=list(X_train.columns),
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
display(SVG(graph.pipe(format='svg')))

In [None]:
# Feature importances of the fitted tree, most important first.
# Uses X_train: `X` only exists inside split(), so referencing it here fails on
# a fresh kernel. X_train has the columns the classifier was fitted on.
sorted(
    zip(X_train.columns, clf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)