In [2]:
import numpy as np
import pandas as pd
import matplotlib 
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing, tree
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, roc_curve, auc
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

#from preprocessing import preprocess
plt.rcParams["figure.dpi"] = 150

In [3]:
from collections import Counter
import requests

with requests.get(
    "https://docs.google.com/spreadsheets/d/1wduqo5WyYmCpaGnE81sLNGU0VSodIekMfpmEwU0fGqs/export?format=csv") as r, open("features.csv", "wb") as f:
    for chunk in r.iter_content():
        f.write(chunk)

with requests.get(
    "https://docs.google.com/spreadsheets/d/1gvZ03uAL6THwd04Y98GtIj6SeAHiKyQY5UisuuyFSUs/export?format=csv") as r, open("target.csv", "wb") as f:
    for chunk in r.iter_content():
        f.write(chunk)

In [4]:
df_features = pd.read_csv("features.csv")
df_target = pd.read_csv("target.csv")
df = df_features.merge(df_target, left_on='id', right_on='id')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
def basic_preprocessing(df: pd.DataFrame, test_size=0.1):
    df['presion_atmosferica_tarde'] = pd.to_numeric(df['presion_atmosferica_tarde'],errors='coerce')    
    df['dia'] = pd.to_datetime(df['dia'])
    df = df.dropna(subset=['llovieron_hamburguesas_al_dia_siguiente']) 
    df = df.dropna(subset=['llovieron_hamburguesas_hoy'])
    df = df.drop(columns = ['id'])
    df['mes'] = df['dia'].dt.month
    df = df.drop(columns = ['dia']) 
    df = df[df.isnull().mean(1) < 0.4]
    
    label_encoder = preprocessing.LabelEncoder()
    
    label_encoder.fit(df['llovieron_hamburguesas_hoy'])
    df['llovieron_hamburguesas_hoy'] = label_encoder.transform(df['llovieron_hamburguesas_hoy'])

    label_encoder.fit(df['llovieron_hamburguesas_al_dia_siguiente'])
    df['llovieron_hamburguesas_al_dia_siguiente'] = label_encoder.transform(df['llovieron_hamburguesas_al_dia_siguiente'])
    
    
    X = df.drop(columns=['llovieron_hamburguesas_al_dia_siguiente'])
    y = df['llovieron_hamburguesas_al_dia_siguiente']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=117, test_size=test_size, stratify=y.astype(str))
    
    numerical_features = ['horas_de_sol', 'nubosidad_tarde', 'nubosidad_temprano', 'presion_atmosferica_temprano', 'presion_atmosferica_tarde',
                       'rafaga_viento_max_velocidad', 'humedad_tarde', 'temperatura_tarde', 'mm_lluvia_dia',
                       'velocidad_viendo_tarde', 'humedad_temprano', 'velocidad_viendo_temprano', 'temperatura_temprano',
                       'temp_min', 'temp_max', 'mm_evaporados_agua']
    
    for feature in numerical_features:
        if feature == 'mm_lluvia_dia' or feature == 'mm_evaporados_agua':
            X_train[feature] = X_train[feature].fillna(X_train[feature].median())
            X_test[feature] = X_test[feature].fillna(X_test[feature].median())
        else:
            X_train[feature] = X_train[feature].fillna(X_train[feature].mean())
            X_test[feature] = X_test[feature].fillna(X_test[feature].mean())
            
    
    return X_train, X_test, y_train, y_test

In [31]:
def preprocessing1(df: pd.DataFrame):
    X_train, X_test, y_train, y_test = basic_preprocessing(df)

    X_train = X_train.drop(columns = ['direccion_viento_temprano', 'rafaga_viento_max_direccion', 'direccion_viento_tarde', 'mes', 'barrio', 'llovieron_hamburguesas_hoy', 'velocidad_viendo_temprano', 'temperatura_temprano', 'mm_evaporados_agua'])  
    X_test = X_test.drop(columns = ['direccion_viento_temprano', 'rafaga_viento_max_direccion', 'direccion_viento_tarde', 'mes', 'barrio', 'llovieron_hamburguesas_hoy', 'velocidad_viendo_temprano', 'temperatura_temprano', 'mm_evaporados_agua'])  
    
    features = ['horas_de_sol', 'nubosidad_tarde', 'nubosidad_temprano', 'presion_atmosferica_temprano', 'presion_atmosferica_tarde', 'rafaga_viento_max_velocidad',
                                        'humedad_tarde', 'temperatura_tarde', 'mm_lluvia_dia', 'velocidad_viendo_tarde', 'humedad_temprano',
                                        'temp_min', 'temp_max']
    
    scaler = preprocessing.StandardScaler()

    X_train[features] = scaler.fit_transform(X_train[features])
    X_test[features] = scaler.fit_transform(X_test[features])
        
    return X_train, X_test, y_train, y_test

In [23]:
def preprocessing2(df: pd.DataFrame):
    X_train, X_test, y_train, y_test = basic_preprocessing(df)

    X_train = X_train.drop(columns = ['direccion_viento_temprano', 'rafaga_viento_max_direccion', 'direccion_viento_tarde', 'mes', 'barrio', 'llovieron_hamburguesas_hoy', 'velocidad_viendo_temprano', 'temperatura_temprano', 'mm_evaporados_agua'])  
    X_test = X_test.drop(columns = ['direccion_viento_temprano', 'rafaga_viento_max_direccion', 'direccion_viento_tarde', 'mes', 'barrio', 'llovieron_hamburguesas_hoy', 'velocidad_viendo_temprano', 'temperatura_temprano', 'mm_evaporados_agua'])  
    
    features = ['horas_de_sol', 'nubosidad_tarde', 'nubosidad_temprano', 'presion_atmosferica_temprano', 'presion_atmosferica_tarde', 'rafaga_viento_max_velocidad',
                                        'humedad_tarde', 'temperatura_tarde', 'mm_lluvia_dia', 'velocidad_viendo_tarde', 'humedad_temprano',
                                        'temp_min', 'temp_max']
    
    scaler = preprocessing.MinMaxScaler()

    X_train[features] = scaler.fit_transform(X_train[features])
    X_test[features] = scaler.fit_transform(X_test[features])
        
    return X_train, X_test, y_train, y_test

In [32]:
def preprocessing3(df: pd.DataFrame):
    X_train, X_test, y_train, y_test = basic_preprocessing(df)

    X_train = X_train.drop(columns = ['direccion_viento_temprano', 'rafaga_viento_max_direccion', 'direccion_viento_tarde', 'mes', 'barrio', 'llovieron_hamburguesas_hoy', 'velocidad_viendo_temprano', 'temperatura_temprano', 'mm_evaporados_agua'])  
    X_test = X_test.drop(columns = ['direccion_viento_temprano', 'rafaga_viento_max_direccion', 'direccion_viento_tarde', 'mes', 'barrio', 'llovieron_hamburguesas_hoy', 'velocidad_viendo_temprano', 'temperatura_temprano', 'mm_evaporados_agua'])  
    
    features = ['horas_de_sol', 'nubosidad_tarde', 'nubosidad_temprano', 'presion_atmosferica_temprano', 'presion_atmosferica_tarde', 'rafaga_viento_max_velocidad',
                                        'humedad_tarde', 'temperatura_tarde', 'mm_lluvia_dia', 'velocidad_viendo_tarde', 'humedad_temprano',
                                        'temp_min', 'temp_max']
    
    scaler = preprocessing.Normalizer()

    X_train[features] = scaler.fit_transform(X_train[features])
    X_test[features] = scaler.fit_transform(X_test[features])
        
    return X_train, X_test, y_train, y_test

In [18]:
def plot_roc(_fpr, _tpr, x):

    roc_auc = auc(_fpr, _tpr)

    plt.figure(figsize=(15, 10))
    plt.plot(
        _fpr, _tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})'
    )
    plt.scatter(_fpr, x)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()



In [19]:
X_train1, X_test1, y_train1, y_test1 = preprocessing1(df)

In [20]:
params = {
    'n_neighbors': np.arange(10, 150, 10),
    'weights': ['uniform', 'distance'],
}

knn1 = KNeighborsClassifier(algorithm='kd_tree')

gscv1 = GridSearchCV(
    knn1, params, scoring='roc_auc', n_jobs=-1, cv=3, return_train_score=True
).fit(X_train1, y_train1)

print(f"Best score: {gscv1.best_score_}")
print(f"Best params {gscv1.best_params_}")

KeyboardInterrupt: 

In [25]:
X_train2, X_test2, y_train2, y_test2 = preprocessing2(df)

In [26]:
params = {
    'n_neighbors': np.arange(10, 150, 10),
    'weights': ['uniform', 'distance'],
}

knn2 = KNeighborsClassifier(algorithm='kd_tree')

gscv2 = GridSearchCV(
    knn2, params, scoring='roc_auc', n_jobs=-1, cv=3, return_train_score=True
).fit(X_train2, y_train2)

print(f"Best score: {gscv2.best_score_}")
print(f"Best params {gscv2.best_params_}")

KeyboardInterrupt: 

In [29]:
X_train3, X_test3, y_train3, y_test3 = preprocessing3(df)

In [30]:
params = {
    'n_neighbors': np.arange(10, 150, 10),
    'weights': ['uniform', 'distance'],
}

knn3 = KNeighborsClassifier(algorithm='kd_tree')

gscv3 = GridSearchCV(
    knn3, params, scoring='roc_auc', n_jobs=-1, cv=3, return_train_score=True
).fit(X_train3, y_train3)

print(f"Best score: {gscv3.best_score_}")
print(f"Best params {gscv3.best_params_}")

KeyboardInterrupt: 

In [16]:
knn = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=100, weights='distance')
knn.fit(X_train1, y_train1)

y_pred = knn.predict(X_test1)
y_pred_proba = knn.predict_proba(X_test1)[:,1]

In [17]:
roc_auc_score(y_test1, y_pred_proba)

0.8761197379419964

In [18]:
accuracy_score(y_test1, y_pred)

0.8485448247942063

In [19]:
precision_score(y_test1, y_pred)

0.7702845100105374

In [20]:
recall_score(y_test1, y_pred)

0.44700366897676314

In [192]:
knn2 = KNeighborsClassifier(metric = 'euclidean', weights='uniform', n_neighbors=10)
knn2.fit(X_train2, y_train2)
#y_pred = knn2.predict(X_test2)
y_pred_proba = knn2.predict_proba(X_test2)[:,1]
roc_auc_score(y_test2, y_pred_proba)


0.8484758951697395

0.8276730691376906

0.8276730691376906

0.6902604756511891

0.49694251936404404

In [55]:
y_test2

93060     0
33296     0
58500     0
25709     0
25260     0
         ..
56188     1
93070     0
25250     0
56977     1
104397    0
Name: llovieron_hamburguesas_al_dia_siguiente, Length: 10618, dtype: int64

In [63]:
y_pred_proba

array([0.2, 0.2, 0.4, ..., 0.2, 0.4, 0.2])