# Analítica con NumPy, Pandas y Scikit-Learn

In [None]:
# Project helper module: `Pokemon.catch(id)` is used below to fetch entries and
# `render(...)` is called on individual entries.
from pokeapi import Pokemon, render
# Plotly in offline mode: figures are shown with py.iplot, traces/layouts come from `go`.
import plotly.offline as py
from plotly import graph_objs as go
import numpy as np
import pandas as pd

# Initialise plotly's offline notebook mode before any py.iplot call.
py.init_notebook_mode()

In [None]:
# Fetch the entries with ids 1..500 (inclusive).
pokemons = list(map(Pokemon.catch, range(1, 501)))

# Flatten each entry's "moves" field: keep only the name of every move
# instead of the nested {"move": {"name": ...}} records.
for pokemon in pokemons:
    move_names = [entry["move"]["name"] for entry in pokemon["moves"]]
    pokemon["moves"] = move_names

## Numpy
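NumPy es una biblioteca de Python para cómputo numérico con arreglos multidimensionales (`ndarray`) y operaciones vectorizadas sobre ellos.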

In [None]:
import numpy as np

# One float32 value per entry of `pokemons`, in the same order, so that an
# index into `weights` is also an index into `pokemons`.
weights = np.asarray([pokemon["weight"] for pokemon in pokemons], dtype=np.float32)

# Heaviest entry: the largest weight and the record it belongs to.
max_weight = weights.max()
mas_pesado = pokemons[weights.argmax()]

print("Peso: {}".format(max_weight))
render(mas_pesado)


# Lightest entry: the smallest weight and the record it belongs to.
min_weight = weights.min()
mas_liviano = pokemons[weights.argmin()]

print("Peso: {}".format(min_weight))
render(mas_liviano)

In [None]:
# Mean
peso_promedio = np.average(weights)
print("Peso promedio: {}".format(peso_promedio))

# Squared distance of every weight to the mean; the position of its minimum
# identifies the entry whose weight is closest to the mean.
error_cuadratico_promedio = (weights - peso_promedio) ** 2
cercano_al_promedio = pokemons[np.argmin(error_cuadratico_promedio)]

print("Peso: {}".format(cercano_al_promedio["weight"]))
render(cercano_al_promedio)


# Median
# The printed label names the median explicitly: the previous "Peso medio"
# reads as "mean weight" in Spanish and was indistinguishable from the
# "Peso promedio" line above.
peso_medio = np.median(weights)
print("Mediana del peso: {}".format(peso_medio))

# Same closest-entry search, this time against the median.
error_cuadratico_medio = (weights - peso_medio) ** 2
cercano_al_medio = pokemons[np.argmin(error_cuadratico_medio)]

print("Peso: {}".format(cercano_al_medio["weight"]))
render(cercano_al_medio)


# Visualize the distribution of weights as a histogram.
histogram = go.Histogram(
    x = weights,
    name = "weights"
)

data = [histogram]
fig = go.Figure(data=data)

py.iplot(fig)

## Pandas

In [None]:
import pandas as pd

# Build a DataFrame from the list of records (one row per entry of `pokemons`)
# and show it as the cell output.
poke_df = pd.DataFrame(data=pokemons)
poke_df

In [None]:
# Same max/min lookup as the NumPy version, driven by the "weight" column.
weight_column = poke_df["weight"]

# Heaviest entry: the column maximum and the matching record in `pokemons`.
max_weight = weight_column.max()
mas_pesado = pokemons[weight_column.argmax()]

print("Peso: {}".format(max_weight))
render(mas_pesado)


# Lightest entry: the column minimum and the matching record in `pokemons`.
min_weight = weight_column.min()
mas_liviano = pokemons[weight_column.argmin()]

print("Peso: {}".format(min_weight))
render(mas_liviano)

In [None]:
# Mean
peso_promedio = poke_df.weight.mean()
print("Peso promedio: {}".format(peso_promedio))

# Squared distance of every weight to the mean; the position of its minimum
# identifies the entry whose weight is closest to the mean.
error_cuadratico_promedio = (poke_df.weight - peso_promedio) ** 2
cercano_al_promedio = pokemons[error_cuadratico_promedio.argmin()]

print("Peso: {}".format(cercano_al_promedio["weight"]))
render(cercano_al_promedio)


# Median
# The printed label names the median explicitly: the previous "Peso medio"
# reads as "mean weight" in Spanish and was indistinguishable from the
# "Peso promedio" line above.
peso_medio = poke_df.weight.median()
print("Mediana del peso: {}".format(peso_medio))

# Same closest-entry search, this time against the median.
error_cuadratico_medio = (poke_df.weight - peso_medio) ** 2
cercano_al_medio = pokemons[error_cuadratico_medio.argmin()]

print("Peso: {}".format(cercano_al_medio["weight"]))
render(cercano_al_medio)


# Visualize the distribution of the "weight" column as a histogram.
histogram = go.Histogram(
    x = poke_df.weight,
    name = "weights"
)

data = [histogram]
fig = go.Figure(data=data)

py.iplot(fig)

## Scikit Learn

### Regresión lineal

In [None]:
# Load the header-less CSV as a plain 2-D NumPy array.
# `DataFrame.as_matrix()` was removed in pandas 1.0; `.values` returns the same
# array on both old and current pandas. `pd.read_csv` is the public entry point
# for the parser previously reached through `pd.io.parsers`.
data = pd.read_csv("data/ex1data1.txt", header=None).values

# Column 0 on the x axis, column 1 on the y axis, one marker per row.
scatter_data = go.Scatter(
    x = data[:, 0],
    y = data[:, 1],
    mode = "markers",
    name = "data"
)

# This layout is reused by the regression-line plot in the next cell.
layout = go.Layout(
    title = 'Poblacion vs Ventas',
    yaxis = dict(
        zeroline = False,
        title = "Ventas"
    ),
    xaxis = dict(
        zeroline = False,
        title = "Poblacion"
    ),
)

fig = dict(data=[scatter_data], layout=layout)
py.iplot(fig)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit y = column 1 against the single feature in column 0.
# `data[:, :1]` keeps the feature 2-D (n_samples, 1), as the estimator expects.
model = LinearRegression().fit(data[:, :1], data[:, 1])

# Two x positions are enough to draw the fitted straight line.
line_x = np.asarray([[5], [23]])
line_y = model.predict(line_x)

linear_reg_line = go.Scatter(x=line_x[:, 0], y=line_y, mode="lines", name="model")

# Overlay the fitted line on the scatter plot built in the previous cell.
fig = dict(data=[scatter_data, linear_reg_line], layout=layout)
py.iplot(fig)

### Regresión lineal multivariada

In [None]:
# Header-less CSV: the columns are addressed by their integer positions 0, 1, 2.
data2 = pd.read_csv("data/ex1data2.txt", header=None)

# One 3-D marker per row, using the three columns as x, y and z.
scatter_data2 = go.Scatter3d(
    x=data2[0],
    y=data2[1],
    z=data2[2],
    mode="markers",
    name="data",
)

# NOTE(review): this layout repeats the title and axis labels of the
# single-variable example; confirm they describe this dataset, and that the
# 2-D xaxis/yaxis settings have the intended effect on a 3-D trace.
layout = go.Layout(
    title='Poblacion vs Ventas',
    xaxis=dict(zeroline=False, title="Poblacion"),
    yaxis=dict(zeroline=False, title="Ventas"),
)

fig = dict(data=[scatter_data2], layout=layout)
py.iplot(fig)

In [None]:
# Predict column 2 from columns 0 and 1.
model2 = LinearRegression().fit(data2[[0, 1]], data2[2])

# Regular grid over the two features on which the fitted plane is evaluated.
x2 = np.arange(500, 4500, 50)
y2 = np.arange(0, 5, 0.1)
xx, yy = np.meshgrid(x2, y2)

# Turn the grid into an (n_points, 2) feature matrix, predict, and fold the
# predictions back into the grid's shape for the surface trace.
features = np.hstack((xx.reshape(-1, 1), yy.reshape(-1, 1)))
zz = model2.predict(features).reshape(xx.shape)

linear_reg_line2 = go.Surface(x=xx, y=yy, z=zz, name="model")

# Data points together with the fitted surface.
fig = dict(data=[scatter_data2, linear_reg_line2])
py.iplot(fig)

### Regresión logística

In [None]:
# Header-less CSV with two exam scores and a 0/1 "passed" flag per row.
data3 = pd.read_csv("data/ex2data1.txt", header=None, names=["exam1", "exam2", "passed"])

# Split the rows once by outcome instead of re-filtering for every axis.
failed_rows = data3[data3.passed == 0]
passed_rows = data3[data3.passed == 1]

scatter_0 = go.Scatter(
    x = failed_rows.exam1,
    y = failed_rows.exam2,
    mode = "markers",
    name = "failed",
    marker = dict(
        color = "red"
    )
)

scatter_1 = go.Scatter(
    x = passed_rows.exam1,
    y = passed_rows.exam2,
    mode = "markers",
    name = "passed",
    marker = dict(
        color = "blue"
    )
)

# Title and axis labels now describe what is plotted (exam1 on x, exam2 on y).
# They previously carried the "Poblacion vs Ventas" labels copied from the
# linear-regression example.
layout = go.Layout(
    title = 'Examen 1 vs Examen 2',
    yaxis = dict(
        zeroline = False,
        title = "Examen 2"
    ),
    xaxis = dict(
        zeroline = False,
        title = "Examen 1"
    ),
)

fig = dict(data=[scatter_0, scatter_1], layout=layout)
py.iplot(fig)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression

# Inputs are the two exam scores; the target is the 0/1 "passed" flag.
labels = data3["passed"]
features = data3[["exam1", "exam2"]]

model3 = LogisticRegression().fit(features, labels)


# Evaluate the classifier on a square grid covering [30, 101) in steps of 0.5
# on both axes, then fold the predictions back into the grid's shape.
grid_axis = np.arange(30, 101, 0.5)
x3, y3 = np.meshgrid(grid_axis, grid_axis)
flat_mesh3 = np.hstack((x3.reshape(-1, 1), y3.reshape(-1, 1)))
z3 = model3.predict(flat_mesh3).reshape(x3.shape)

# Heatmap of the negated predicted class, with the data points drawn on top.
frontera3 = go.Heatmap(x=x3[0, :], y=y3[:, 0], z=-z3, name="model")

fig = dict(data=[frontera3, scatter_0, scatter_1])
py.iplot(fig)

### naive bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Inputs are the two exam scores; the target is the 0/1 "passed" flag.
labels = data3["passed"]
features = data3[["exam1", "exam2"]]

model3 = GaussianNB().fit(features, labels)

# Square evaluation grid over [30, 101) in steps of 0.5, flattened to an
# (n_points, 2) matrix for prediction and reshaped back afterwards.
grid_axis = np.arange(30, 101, 0.5)
x3, y3 = np.meshgrid(grid_axis, grid_axis)
flat_mesh3 = np.hstack((x3.reshape(-1, 1), y3.reshape(-1, 1)))
z3 = model3.predict(flat_mesh3).reshape(x3.shape)

# Heatmap of the negated predicted class, with the data points drawn on top.
frontera3 = go.Heatmap(x=x3[0, :], y=y3[:, 0], z=-z3, name="model")

fig = dict(data=[frontera3, scatter_0, scatter_1])
py.iplot(fig)

### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Inputs are the two exam scores; the target is the 0/1 "passed" flag.
features = data3[["exam1", "exam2"]]
labels = data3["passed"]

# A fixed random_state makes the forest, and therefore the plotted decision
# surface, identical on every run of the notebook.
model3 = RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=42)
model3.fit(features, labels)

# Square evaluation grid over [30, 101) in steps of 0.5, flattened to an
# (n_points, 2) matrix for prediction and reshaped back afterwards.
x3, y3 = np.meshgrid(np.arange(30, 101, 0.5), np.arange(30, 101, 0.5))
flat_mesh3 = np.hstack((np.expand_dims(x3.flatten(), 1), np.expand_dims(y3.flatten(), 1)))
z3 = model3.predict(flat_mesh3)
z3 = z3.reshape(x3.shape)

# Heatmap of the negated predicted class, with the data points drawn on top.
frontera3 = go.Heatmap(
    x = x3[0, :],
    y = y3[:, 0],
    z = -z3,
    name = "model"
)

fig = dict(data=[frontera3, scatter_0, scatter_1])
py.iplot(fig)

### support vector machine

In [None]:
from sklearn.svm import SVC

# Inputs are the two exam scores; the target is the 0/1 "passed" flag.
labels = data3["passed"]
features = data3[["exam1", "exam2"]]

model3 = SVC(C=100, gamma=0.005).fit(features, labels)

# Square evaluation grid over [30, 101) in steps of 0.5, flattened to an
# (n_points, 2) matrix for prediction and reshaped back afterwards.
grid_axis = np.arange(30, 101, 0.5)
x3, y3 = np.meshgrid(grid_axis, grid_axis)
flat_mesh3 = np.hstack((x3.reshape(-1, 1), y3.reshape(-1, 1)))
z3 = model3.predict(flat_mesh3).reshape(x3.shape)

# Heatmap of the negated predicted class, with the data points drawn on top.
frontera3 = go.Heatmap(x=x3[0, :], y=y3[:, 0], z=-z3, name="model")

fig = dict(data=[frontera3, scatter_0, scatter_1])
py.iplot(fig)

### Regresión logística polinómica

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression

# Inputs are the two exam scores; the target is the 0/1 "passed" flag.
features = data3[["exam1", "exam2"]]
labels = data3["passed"]

# Expand the two scores into degree-2 polynomial terms. The transformer is
# fitted once, on the training inputs, passed as a plain array so that the
# grid below (also a plain array) is transformed under the same conditions.
poly = PolynomialFeatures(degree=2)
features = poly.fit_transform(features.values)

model3 = LogisticRegression()
model3.fit(features, labels)


# Square evaluation grid over [30, 101) in steps of 0.5 on both axes.
x3, y3 = np.meshgrid(np.arange(30, 101, 0.5), np.arange(30, 101, 0.5))
flat_mesh3 = np.hstack((np.expand_dims(x3.flatten(), 1), np.expand_dims(y3.flatten(), 1)))
# Apply the already-fitted expansion; prediction data must only be
# transformed, never used to re-fit the preprocessing step.
flat_mesh3 = poly.transform(flat_mesh3)
z3 = model3.predict(flat_mesh3)
z3 = z3.reshape(x3.shape)

# Heatmap of the negated predicted class, with the data points drawn on top.
frontera3 = go.Heatmap(
    x = x3[0, :],
    y = y3[:, 0],
    z = -z3,
    name = "model"
)

fig = dict(data=[frontera3, scatter_0, scatter_1])
py.iplot(fig)

## Reducción de dimensiones

In [None]:
# Comma-separated numeric file; '@' marks the start of a comment, so text from
# that character to the end of the line is skipped.
houses = np.loadtxt('data/house16H.dat', delimiter=',', comments='@')

Esta base de datos fue diseñada sobre la base de los datos proporcionados por la Oficina del Censo de los Estados Unidos http://www.census.gov. Los datos se recolectaron como parte del censo de 1990 en los Estados Unidos. Estos son en su mayoría recuentos acumulados en diferentes niveles de la encuesta.

In [None]:
# Dimensions of the loaded array, shown as the cell output.
houses.shape

In [None]:
from sklearn.decomposition import PCA
import random

# Project every row onto its first three principal components.
houses_compact = PCA(n_components=3).fit_transform(houses)

# Plot only a random subset of rows to keep the figure light.
# - A dedicated, seeded generator makes the subset (and the figure) the same
#   on every run.
# - The size is capped at the number of rows, so a dataset with fewer than
#   1000 rows no longer raises ValueError.
total_rows = houses_compact.shape[0]
sample_size = min(1000, total_rows)
sample_rows = random.Random(42).sample(range(total_rows), sample_size)
houses_compact = houses_compact[sample_rows]

# Sampled points as small 3-D markers.
scatter_houses = dict(
    mode = "markers",
    type = "scatter3d",
    marker = dict( size=2 ),
    x = houses_compact[:, 0], y = houses_compact[:, 1], z = houses_compact[:, 2]
)
# Translucent mesh drawn over the same points.
cluster_houses = dict(
    alphahull = 7,
    opacity = 0.1,
    type = "mesh3d",
    x = houses_compact[:, 0], y = houses_compact[:, 1], z = houses_compact[:, 2]
)

layout = dict(
    title = '3d point clustering',
    scene = dict(
        xaxis = dict( zeroline=False ),
        yaxis = dict( zeroline=False ),
        zaxis = dict( zeroline=False ),
    ),
    height = 600, width = 800
)
fig = dict( data= [cluster_houses, scatter_houses], layout=layout )
# Use py.iplot() for IPython notebook.
# The file name now refers to the dataset plotted here; it previously read
# 'Iris Dataset', left over from another example.
py.iplot(fig, filename='house16H PCA')

In [None]:
# Show the sampled, PCA-projected rows as the cell output.
houses_compact