<a href="https://colab.research.google.com/github/alejandroariaszuluaga/sinfonia/blob/master/reto3_SinfonIA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Reto 3: Convocatoria SinfonÍA**

**Autor:** Alejandro Arias

**Código:** 201711999

**Correo de contacto:** a.ariasz@uniandes.edu.co

A continuación se implementan principalmente dos alternativas para resolver el siguiente problema de clasificación binaria:

> Desarrollar un modelo que reciba un vector de características y retorne la etiqueta, indicando si hay o no una persona.

A continuación se importan los datos necesarios, los cuales fueron obtenidos desde el repositorio: https://github.com/carolinahiguera/ConvocatoriaSinfonIAUniandes.git


In [19]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
cols = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x24', 'x25', 'x26', 'x27', 'x28']

xTrain_df = pd.read_csv('/content/drive/My Drive/ConvocatoriaSinfonIAUniandes/reto3/DataTrain.csv', names=cols)
yTrain_df = pd.read_csv('/content/drive/My Drive/ConvocatoriaSinfonIAUniandes/reto3/LabelsTrain.csv', names=['y'])

xTrain, xVal, yTrain, yVal = train_test_split(xTrain_df.values, yTrain_df.values, test_size = 0.2)
yTrain = np.ravel(yTrain)
yVal = np.ravel(yVal)

# Regresión Logística

En primer lugar, se realizó una prueba preliminar aplicando un modelo de regresión logística.

In [21]:
clf = LogisticRegression(class_weight='balanced')
clf.fit(xTrain, yTrain)
clf.score(xVal, yVal)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9030470914127424

# **Redes Neuronales**

Posteriormente, se probaron varias arquitecturas de redes neuronales, tomando un rango desde 2 hasta 200 neuronas por cada capa. Igualmente, se analizó un rango desde 2 hasta 5 capas escondidas.

Se utilizaron los parámetros:

*   Función de activación: ReLu
*   Optimizador: *Adam*


In [23]:
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

@ignore_warnings(category=ConvergenceWarning)

def red_neuronal(xTrain, yTrain, xVal, yVal, n, capas):
  
  clf = MLPClassifier(activation='relu', solver='sgd', hidden_layer_sizes=(n,capas))
  clf.fit(xTrain, yTrain)
  
  pred_val_Y = clf.predict(xVal)
  pred_train_Y = clf.predict(xTrain)
  
  train_ac = accuracy_score(yTrain, pred_train_Y)
  val_ac = accuracy_score(yVal, pred_val_Y)
  return val_ac, train_ac

**Red Neuronal 2 Capas**

In [None]:
Nmin = 2
Nmax = 200
capas = 2
n_max_ac = 0
max_ac = 0
vector_val_ac = []
vector_train_ac = []
for n in range(Nmin,Nmax):
  val_ac, train_ac = red_neuronal(xTrain, yTrain, xVal, yVal, n, capas)
  
  vector_train_ac.append(train_ac)
  vector_val_ac.append(val_ac)
  if val_ac>max_ac:
    max_ac = val_ac
    n_max_ac = n
    
print("Precisión a partir de un entrenamiento con:")
print("->",n_max_ac,"neuronas. Validación:", max_ac)

fig, ax = plt.subplots()
ax.plot(range(Nmin,Nmax), vector_val_ac, label='Validación')
ax.plot(range(Nmin,Nmax), vector_train_ac, label='Entrenamiento')
leg = ax.legend();

plt.ylim(0,1)
plt.xlabel('Número de neuronas')
plt.ylabel('Precisión con datos de validación')

**Red Neuronal 3 Capas**

In [None]:
Nmin = 2
Nmax = 200
capas = 3
n_max_ac = 0
max_ac = 0
vector_val_ac = []
vector_train_ac = []
for n in range(Nmin,Nmax):
  val_ac, train_ac = red_neuronal(xTrain, yTrain, xVal, yVal, n, capas)
  
  vector_train_ac.append(train_ac)
  vector_val_ac.append(val_ac)
  if val_ac>max_ac:
    max_ac = val_ac
    n_max_ac = n
    
print("Precisión a partir de un entrenamiento con:")
print("->",n_max_ac,"neuronas. Validación:", max_ac)

fig, ax = plt.subplots()
ax.plot(range(Nmin,Nmax), vector_val_ac, label='Validación')
ax.plot(range(Nmin,Nmax), vector_train_ac, label='Entrenamiento')
leg = ax.legend();

plt.ylim(0,1)
plt.xlabel('Número de neuronas')
plt.ylabel('Precisión con datos de validación')

**Red Neuronal 4 Capas**

In [None]:
Nmin = 2
Nmax = 200
capas = 4
n_max_ac = 0
max_ac = 0
vector_val_ac = []
vector_train_ac = []
for n in range(Nmin,Nmax):
  val_ac, train_ac = red_neuronal(xTrain, yTrain, xVal, yVal, n, capas)
  
  vector_train_ac.append(train_ac)
  vector_val_ac.append(val_ac)
  if val_ac>max_ac:
    max_ac = val_ac
    n_max_ac = n
    
print("Precisión a partir de un entrenamiento con:")
print("->",n_max_ac,"neuronas. Validación:", max_ac)

fig, ax = plt.subplots()
ax.plot(range(Nmin,Nmax), vector_val_ac, label='Validación')
ax.plot(range(Nmin,Nmax), vector_train_ac, label='Entrenamiento')
leg = ax.legend();

plt.ylim(0,1)
plt.xlabel('Número de neuronas')
plt.ylabel('Precisión con datos de validación')

**Red Neuronal 5 Capas**

In [None]:
Nmin = 2
Nmax = 200
capas = 5
n_max_ac = 0
max_ac = 0
vector_val_ac = []
vector_train_ac = []
for n in range(Nmin,Nmax):
  val_ac = red_neuronal(xTrain, yTrain, xVal, yVal, n, capas)
  
  #vector_train_ac.append(train_ac)
  vector_val_ac.append(val_ac)
  if val_ac>max_ac:
    max_ac = val_ac
    n_max_ac = n
    
print("Precisión a partir de un entrenamiento con:")
print("->",n_max_ac,"neuronas. Validación:", max_ac)

fig, ax = plt.subplots()
ax.plot(range(Nmin,Nmax), vector_val_ac, label='Validación')
ax.plot(range(Nmin,Nmax), vector_train_ac, label='Entrenamiento')
leg = ax.legend();

plt.ylim(0,1)
plt.xlabel('Número de neuronas')
plt.ylabel('Precisión con datos de validación')

# Support Vector Machine

Finalmente, se analizaron los resultados a partir de un clasificador tipo SVM. Se realizó una principal variación del parámetro de regularización (C), el cual se relaciona de forma inversa a la complejidad del modelo (previniendo *overfitting*).

In [None]:
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

@ignore_warnings(category=ConvergenceWarning)

def svm(xTrain, yTrain, xVal, yVal, exp_C):
  sv_clf = SVC(C = 10**(exp[exp_C]), kernel = 'rbf', gamma = 'auto')
  sv_clf.fit(xTrain, yTrain)

  pred_trainY = sv_clf.predict(xTrain)
  pred_valY = sv_clf.predict(validationX)
  
  train_ac = accuracy_score(yTrain, pred_train_Y)
  val_ac = accuracy_score(yVal, pred_val_Y)
  return val_ac, train_ac

In [None]:
from sklearn.svm import SVC

train_ac = []
val_ac = []
max_val_ac = 0
max_val_ac_C = 0
C_min = -3
C_max = 3

exp = np.linspace(C_min, C_max, 50)

for i in tqdm(range(len(exp))):
  prec_val, prec_train = svm(xTrain, yTrain, xVal, yVal, i)

  train_ac.append(prec_train)
  val_ac.append(prec_val)
  if prec_val > max_val_ac:
    max_val_ac = prec_val
    sv_best_clf = sv_clf
    max_val_ac_C = exp[i]

print("Precisión a partir de un entrenamiento con:")
print("->Regularización C:",max_val_ac_C,". Validación:", max_val_ac)

fig, ax = plt.subplots()
ax.plot(exp, train_ac, label='Entrenamiento')
ax.plot(exp, val_ac, label='Validación')
ax.legend()
plt.xlabel("Regularización, log_10(C)")
plt.ylabel("Precisión")
plt.title("Entrenamiento de modelo SVM con kernel gaussiano")