In [1]:
%matplotlib notebook

In [2]:
import pyspark

sc = pyspark.sql.SparkSession.Builder().getOrCreate()

## Levanto los archivos

In [3]:
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, GroupKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from IPython.display import display, HTML
import math

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.autograd import Variable

from utils.emissions_normalizer import EmissionsNormalizer
from utils.neural_network_predictor import NeuralNetworkPredictor



In [5]:
points_recep = sc.read.json('datos/train-test-by-emission.jsonlines/').rdd

## Utils

In [6]:
def distance(p1, p2):
    return math.sqrt((p1[0]-p2[0])**2+(p1[1]-p2[1])**2)

def get_regressor_mae(predictions, real):
    mae = 0
    for i in range(len(predictions)):
        mae += distance(predictions[i], real[i])
    mae = mae/len(predictions)
    return mae


global dict_coordenadas
dict_coordenadas = points_recep.map(lambda x: (x['Punto'],(x['x'], x['y']))).collectAsMap()

def get_classifier_mae(predictions, real):
    sum_error = 0
    count = 0
    for i in range(len(predictions)):
        if predictions[i] not in dict_coordenadas:
            print('predicted point dont exist {}'.format(predictions[i]))
            continue
        pred_position = dict_coordenadas[predictions[i]]
        real_position = dict_coordenadas[real[i]]
        sum_error += distance(pred_position, real_position)
        count += 1
    return sum_error/count


In [7]:
normalizer = EmissionsNormalizer()
data = normalizer.normalize(points_recep)
regre_data, regre_target = normalizer.get_regression_dataframes(data)


## Redes neuronales para regresion

In [8]:
MODEL_PATH = 'datos/tmp/nn-model-2h'
def k_cross_validation(predictor, data, target, k=5, init_from_file = False, save_in_file = False):   
    kf = KFold(n_splits=k, shuffle=True)
    mae_list = []
    mae_list_train = []
    k_index = 0
        
    for train_index, test_index in kf.split(data):
        predictor.clear()
        if init_from_file:
            predictor.open_model(MODEL_PATH)

        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        
        predictor.fit(X_train, X_test)

        
        predicted = predictor.predict(X_test)
        mae = get_regressor_mae(predictions=predicted, real=y_test)
        print(mae)
             
        predicted_train = predictor.predict(X_train)
        mae_train = get_regressor_mae(predictions=predicted_train, real=y_train)
        print(mae_train)
        
        mae_list.append(mae)
        mae_list_train.append(mae_train)
        if save_in_file: predictor.save_model(MODEL_PATH)

    return mae_list, mae_list_train


In [9]:
k_cross_validation(NeuralNetworkPredictor(), regre_data, regre_target, k=5, init_from_file=False, save_in_file=False)


KeyError: "None of [Int64Index([   0,    1,    3,    4,    5,    6,    8,    9,   10,   11,\n            ...\n            5432, 5433, 5435, 5437, 5438, 5439, 5440, 5441, 5442, 5444],\n           dtype='int64', length=4356)] are in the [columns]"

## Redes neuronales para clasificacion

In [None]:
classi_target = pd.DataFrame(all_emissions.map(lambda x: x['point']).collect())
classi_data = pd.DataFrame(all_emissions.map(lambda x: x['data']).collect())


classi_data_np = np.array(classi_data.astype(float))
classi_target_np = np.array(classi_target.astype(int))


X_train, X_test, y_train, y_test = train_test_split(classi_data_np, classi_target_np, test_size=15)


In [None]:
input_dim = 4
output_dim = 543

model = LinearClassificationModel(input_dim,output_dim)


#criterion = nn.CrossEntropyLoss()
criterion = nn.NLLLoss()


l_rate = 0.01
optimiser = torch.optim.SGD(model.parameters(), lr = l_rate) #Stochastic Gradient Descent
#optimizer = torch.optim.Adam(model.parameters(),lr=l_rate,weight_decay=1e-4)


epochs = 50

In [None]:
for epoch in range(epochs):

    epoch +=1
    #increase the number of epochs by 1 every time
    
    inputs = Variable(torch.Tensor(X_train), requires_grad=False)
    labels = Variable(torch.Tensor(y_train).long(), requires_grad=False)
    

    #clear grads as discussed in prev post
    optimiser.zero_grad()
    #forward to get predicted values
    outputs = model.forward(inputs)
    loss = criterion(outputs, labels.view(-1))
    loss.backward()# back props
    optimiser.step()# update the parameters
    print('epoch {}, loss {}'.format(epoch,loss.item()))

In [None]:
test = Variable(torch.Tensor(X_test), requires_grad=False)
predicted_proba = model.forward(test)
predicted_proba = predicted_proba.exp().detach().data.numpy()

In [None]:
predicted = []
for probas in predicted_proba:
    point = np.argmax(probas)
    predicted.append(point)

get_classifier_mae(predictions=predicted, real=y_test.ravel())

In [None]:
classi_target_np[20]

In [None]:
np.where(np.unique(classi_target_np, axis=0) == classi_target_np[20])

In [None]:
def k_cross_validation_classi(model_builder, data, target, r_target_np, k=5):   
    kf = KFold(n_splits=k, shuffle=True)
    mae_list = []
    mae_list_train = []
    k_index = 0
    
    unique_target = np.unique(target, axis=0)
    groups = []
    for h in range(len(target)):
        i,j = np.where(unique_target == target[h])
        groups.append(i[0])
    
    for train_index, test_index in kf.split(data, groups=groups):
        regressor = KNeighborsRegressor()
        params = model_builder()
        model = params['model']
        l_rate = params['l_rate']
        optimiser = params['optimiser']
        criterion = params['criterion']
        epochs = params['epochs']

        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        
        for epoch in range(epochs):

            epoch +=1
            #increase the number of epochs by 1 every time

            inputs = Variable(torch.Tensor(X_train), requires_grad=False)
            labels = Variable(torch.Tensor(y_train).long(), requires_grad=False)


            #clear grads as discussed in prev post
            optimiser.zero_grad()
            #forward to get predicted values
            outputs = model.forward(inputs)
            loss = criterion(outputs, labels.view(-1))
            loss.backward()# back props
            optimiser.step()# update the parameters
            if k_index == 0 and epoch < 1000: print('epoch {}, loss {}'.format(epoch,loss.item()))
            if k_index == 0 and epoch > 1000 and random.randint(0, 100) == 1: print('epoch {}, loss {}'.format(epoch,loss.item()))

        print(loss.data)
        
        test = Variable(torch.Tensor(X_test), requires_grad=False)
        predicted_proba = model.forward(test)
        predicted_proba = predicted_proba.exp().detach().data.numpy()

        label_list = model.forward(Variable(torch.Tensor(X_train), requires_grad=False)).exp().detach().data.numpy()
        regressor.fit(label_list, r_target_np[train_index])

        c_predictions = model.forward(Variable(torch.Tensor(X_test), requires_grad=False)).exp().detach().data.numpy()
        r_predictions = regressor.predict(c_predictions)
        mae = get_regressor_mae(r_predictions, r_target_np[test_index])
        mae_list.append(mae)
        '''
        predicted = []
        for probas in predicted_proba:
            point = np.argmax(probas)
            predicted.append(point)
        KNeighborsRegressor()

        mae = get_classifier_mae(predictions=predicted, real=y_test.ravel())
        print(mae)
        
        train = Variable(torch.Tensor(X_train), requires_grad=False)
        predicted_proba_train = model.forward(train)
        predicted_proba_train = predicted_proba_train.exp().detach().data.numpy()
        
        predicted_train = []
        for probas in predicted_proba_train:
            point = np.argmax(probas)
            predicted_train.append(point)

        mae_train = get_classifier_mae(predictions=predicted_train, real=y_train.ravel())
        
        mae_list.append(mae)
        mae_list_train.append(mae_train)
        '''
        k_index += 1

    return mae_list, mae_list_train


In [None]:
def build_classi_nn_params():
    model = LinearClassificationModel(4, 543)
    #criterion = nn.CrossEntropyLoss()
    criterion = nn.NLLLoss()
    l_rate = 0.01
    optimiser = torch.optim.SGD(model.parameters(), lr = l_rate) #Stochastic Gradient Descent
    #optimizer = torch.optim.Adam(model.parameters(),lr=l_rate,weight_decay=1e-4)
    epochs = 1000
    
    return {
        'model': model,
        'l_rate': l_rate,
        'optimiser': optimiser,
        'criterion': criterion,
        'epochs': epochs
    }

In [None]:
k_cross_validation_classi(build_classi_nn_params, classi_data_np, classi_target_np, regre_target_np, k=5)