## Ejemplo principio a fin de trabajo con Pandas

* Profesor : [Daniel Jiménez](https://www.danieljimenezm.com/)
* Institución: [Universidad Nacional de Colombia](https://unal.edu.co/)

__Objetivo:__ En este notebook se desarrollará un ejercicio de principio a fin con pandas y matplotlib para ver el potencial que tienen estos frameworks.

In [None]:
## Librerias necesarias
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import warnings
warnings.filterwarnings("ignore")

# Load the Pokémon dataset from the public gist and immediately discard the
# '#' column.
pokemon = pd.read_csv(
    'https://gist.githubusercontent.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv'
).drop(columns=['#'])

## Validando la estructura de los datos

In [None]:
# Show the (rows, columns) shape of the dataset between two separator rules.
print(
    'Validando la cantidad de datos',
    '='*32,
    pokemon.shape,
    '='*32,
    sep='\n',
)

In [None]:
# Print the structural summary of the dataset (columns, non-null counts,
# dtypes) between two separator rules.
print('Validando la estructura de los datos')
print('='*32)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit an extra "None" line.
pokemon.info()
print('='*32)

In [None]:
# Show the summary statistics produced by describe() between two
# 64-character separator rules.
print(
    'Validando el comportamienton general de los datos',
    '='*64,
    pokemon.describe(),
    '='*64,
    sep='\n',
)

In [None]:
# Show the dtype of the 'Name' column between two separator rules.
print(
    'Validando la estructura de los datos ',
    '='*32,
    pokemon['Name'].dtype,
    '='*32,
    sep='\n',
)

## Se explorarán los datos faltantes de la base de datos de manera ordenada

In [None]:
# Number of missing values per column, sorted from most to fewest.
print(
    '='*32,
    'Para ver la cantidad de datos faltantes',
    '='*32,
    pokemon.isna().sum().sort_values(ascending=False),
    '='*32,
    sep='\n',
)

In [None]:
# Bar chart of the missing-value count per column, largest first.
pokemon.isna().sum().sort_values(ascending=False).plot.bar()

In [None]:
# Share of missing values per column, expressed as a percentage and sorted
# from highest to lowest.
print(
    '='*64,
    'Para ver la cantidad de datos faltantes a nivel %',
    '='*64,
    pokemon.isna().mean().mul(100).sort_values(ascending=False),
    '='*64,
    sep='\n',
)

## Concatenando variables

In [None]:
# Join 'Type 1' and 'Type 2' with a dash; a missing value on either side is
# written as the literal text 'nan'.
type_of_combined = pokemon['Type 1'].str.cat(
    others=pokemon['Type 2'],
    sep='-',
    na_rep='nan',
)
type_of_combined

## Creando una nueva variable 

In [None]:
# Store the combined type string as a new column and preview the first rows.
pokemon['Type_of_conc'] = type_of_combined
pokemon.head(5)

## ¿Cuál es la diferencia en el Attack dado el Type 1 del pokemon?

In [None]:
# Number of Pokémon per primary type, most frequent first (the default order).
pokemon['Type 1'].value_counts()

In [None]:
# Bar chart of the number of Pokémon per primary type, most frequent first.
pokemon['Type 1'].value_counts().plot.bar()

In [None]:
# Relative frequency (fraction of rows) of each primary type, largest first.
pokemon['Type 1'].value_counts(normalize=True)

In [None]:
# Bar chart of the share of each primary type, with the y axis formatted as
# percentages.
pct_axes = pokemon['Type 1'].value_counts(normalize=True).mul(100).plot.bar()
pct_axes.yaxis.set_major_formatter(mtick.PercentFormatter())


In [None]:
# Compare the Attack distribution across primary types, one box per type.
import seaborn as sns
plt.figure(figsize=(20, 10))
sns.boxplot(data=pokemon, x="Type 1", y="Attack")

In [None]:
# Same box plot, with the types ordered by median Attack (highest first).
import seaborn as sns
plt.figure(figsize=(20, 10))
my_order = (
    pokemon.groupby(['Type 1'])['Attack']
    .median()
    .iloc[::-1]
    .sort_values(ascending=False)
    .index
)
sns.boxplot(data=pokemon, x="Type 1", y="Attack", order=my_order)
plt.title('Distribución del attack dada el Type of del pokemon')
plt.ylabel('Nivel del Attack')
plt.xlabel('Type of Pokemon')

In [None]:
# Alternative layout: horizontal boxes (types on the y axis), still ordered by
# median Attack from highest to lowest.
plt.figure(figsize=(20, 10))
my_order = (
    pokemon.groupby(['Type 1'])['Attack']
    .median()
    .iloc[::-1]
    .sort_values(ascending=False)
    .index
)
sns.boxplot(data=pokemon, x="Attack", y="Type 1", order=my_order)
plt.title('Distribución del attack dada el Type of del pokemon')
plt.xlabel('Nivel del Attack')
plt.ylabel('Type of Pokemon')

In [None]:
# Overlaid Attack histograms, one colour per primary type.
plt.figure(figsize=(20, 10))
sns.histplot(x='Attack', hue='Type 1', data=pokemon)


In [None]:
# Scatter plot of Defense against Speed, coloured by primary type, with the
# marginal distributions on the sides.
# The figure size is passed as `height`: seaborn renamed the former `size`
# argument, and the old name is not a reliable way to size the figure in the
# releases that support `hue` here.
sns.jointplot(x='Defense',
              y='Speed',
              data=pokemon,
              color='green',
              kind='scatter',
              hue='Type 1',
              height=8.0)

In [None]:
# Hexbin density plot of Defense against Speed with marginal distributions.
# The figure size is passed as `height`, the current name of the argument
# formerly called `size`; leftover keyword arguments are forwarded to the
# joint plotting function, so the old name must not be used.
sns.jointplot(x='Defense',
              y='Speed',
              data=pokemon,
              color='orange',
              kind='hex',
              height=8.0)

In [None]:
# One Attack histogram per primary type, laid out three per row with shared
# axes. FacetGrid builds its own figure, so no plt.figure() call precedes it
# (that would only leave an empty figure behind).
g = sns.FacetGrid(pokemon, col='Type 1', col_wrap=3, sharex=True, sharey=True)
g.map(plt.hist, 'Attack')

In [None]:
# Count the secondary types that occur within each primary type.
pokemon.groupby('Type 1')['Type 2'].value_counts()

In [None]:
# Cross tabulation: number of Pokémon per primary type (rows) and
# generation (columns).
pd.crosstab(index=pokemon['Type 1'], columns=pokemon['Generation'])

In [None]:
# Keep the type-by-generation crosstab and draw it as grouped bars.
table = pd.crosstab(index=pokemon['Type 1'], columns=pokemon['Generation'])
table.plot.bar()
plt.show()

In [None]:
# Set the default figure size to 20x10 through rcParams, then draw the
# crosstab as stacked bars.
from matplotlib.pyplot import figure
plt.rcParams.update({"figure.figsize": (20, 10)})
table.plot.bar(stacked=True)

## Búsqueda de valores anómalos

In [None]:
import numpy as np
def outlier_check(data, n_std=2):
    """Plot the distribution of *data* and return the labels of its outliers.

    A value is treated as an outlier when it lies more than ``n_std``
    population standard deviations away from the mean. A new 5x5 inch figure
    is drawn with the density of *data*, dashed red lines at both thresholds
    and a light red shade over the two outlier regions.

    Parameters
    ----------
    data : pandas.Series
        Numeric series to inspect; its ``name`` is used in the plot title.
    n_std : float, default 2
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.Index
        Index labels of the entries strictly below the lower threshold or
        strictly above the upper threshold.
    """
    # Fixed top of the density axis; also the height of lines and shading.
    y_top = 0.025

    # Series methods skip NaN, unlike the builtin max(); ddof=0 keeps the
    # population standard deviation that np.std computes.
    x_max = data.max()
    mean, std = data.mean(), data.std(ddof=0)
    low, high = mean - n_std*std, mean + n_std*std

    fig, ax = plt.subplots()
    fig.set_figheight(5)
    fig.set_figwidth(5)
    ax.set_ylim([0, y_top])
    ax.set_xlim([0, x_max])
    ax.set_title('"{}" outlier detection'.format(data.name))

    # Upper threshold, shaded from the threshold up to the maximum value.
    ax.vlines(high, 0, y_top, color='red', linestyle='dashed')
    ax.fill_between(x=[high, x_max], y1=y_top, color='red', alpha=.05)

    # Lower threshold, shaded from zero up to the threshold.
    ax.vlines(low, 0, y_top, color='red', linestyle='dashed')
    ax.fill_between(x=[0, low], y1=y_top, color='red', alpha=.05)

    # Density histogram with a KDE overlay. histplot replaces the deprecated
    # sns.distplot; the extra arguments approximate distplot's appearance.
    sns.histplot(data, kde=True, stat='density', kde_kws={'cut': 3},
                 alpha=.4, edgecolor=(1, 1, 1, .4), ax=ax)

    return data[(data < low) | (data > high)].index

In [None]:
# Flag every Pokémon that is an outlier in at least one base stat.
# The stat columns are listed by name: a positional slice counted from the
# end of the frame shifts whenever a column is appended, which includes
# 'Outlier' itself when this cell is run a second time.
stats = pd.Index(['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed'])
# Float flag column: 0.0 = never flagged, 1.0 = flagged for at least one stat.
pokemon['Outlier'] = 0.0
for var in stats:
    # outlier_check draws one figure per stat and returns the labels of the
    # rows outside the accepted band for that stat.
    pokemon.loc[outlier_check(pokemon[var]), 'Outlier'] = 1