# Data wrangling

In [3]:
import pandas as pd
import numpy as np
import os

# Root folder that holds the course datasets. The DATASETS_DIR environment variable
# overrides it so the notebook can run on another machine; when the variable is
# unset, the original local path is used and the behaviour is unchanged.
mainpath = os.environ.get(
    "DATASETS_DIR",
    "/Users/pabloarranzropero/workspace-python/curso-ml-udemy/datasets",
)
# Location of the customer churn file, relative to the datasets folder.
filename = "customer-churn-model/Customer Churn Model.txt"

# Read the file with pandas' default CSV options.
data = pd.read_csv(os.path.join(mainpath, filename))

# Preview the first five rows of the loaded frame.
data.head()

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


## Crear subconjuntos de datos
Podemos extraer subconjuntos:
* Seleccionando una columna (devuelve un Series)

In [13]:
# Indexing with a single column label returns a pandas Series; preview its first five values.
data["Account Length"].head(n=5)

0    128
1    107
2    137
3     84
4     75
Name: Account Length, dtype: int64

* Seleccionando varias columnas

In [18]:
# Indexing with a list of column labels returns a DataFrame restricted to those columns,
# in the order given by the list.
selected_columns = ["Account Length", "Phone", "Eve Charge", "Day Calls"]
data[selected_columns].head()

Unnamed: 0,Account Length,Phone,Eve Charge,Day Calls
0,128,382-4657,16.78,110
1,107,371-7191,16.62,123
2,137,358-1921,10.3,114
3,84,375-9999,5.26,71
4,75,330-6626,12.61,113


* Eliminando las columnas que no queremos.
Según el caso:
* Si las columnas que queremos son menos que las que no queremos, las seleccionamos directamente.
* Si las columnas que no queremos son menos que las que queremos, calculamos el complementario:

In [24]:
# Columns to leave out of the subset.
not_wanted = ['Phone', 'State', 'Area Code']
# Every column label of the frame, as a plain Python list.
all_columns = data.columns.values.tolist()
# Complement of the unwanted columns, keeping the original column order.
wanted = list(filter(lambda column: column not in not_wanted, all_columns))
data[wanted].head()

Unnamed: 0,Account Length,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,128,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,107,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,137,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,84,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,75,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


* Cogiendo un subconjunto de filas

In [9]:
# Integer slices select rows by position; the stop index is not included in the subset.
# Only the last expression of the cell is rendered, so the displayed table is the final ten rows.
data.iloc[10:25]    # rows at positions 10 through 24
data.iloc[:10]      # first ten rows
data.iloc[-10:]     # last ten rows

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
3323,IN,117,415,362-5899,no,no,0,118.4,126,20.13,...,97,21.19,227.0,56,10.22,13.6,3,3.67,5,True.
3324,WV,159,415,377-1164,no,no,0,169.8,114,28.87,...,105,16.8,193.7,82,8.72,11.6,4,3.13,1,False.
3325,OH,78,408,368-8555,no,no,0,193.4,99,32.88,...,88,9.94,243.3,109,10.95,9.3,4,2.51,2,False.
3326,OH,96,415,347-6812,no,no,0,106.6,128,18.12,...,87,24.21,178.9,92,8.05,14.9,7,4.02,1,False.
3327,SC,79,415,348-3830,no,no,0,134.7,98,22.9,...,68,16.12,221.4,128,9.96,11.8,5,3.19,2,False.
3328,AZ,192,415,414-4276,no,yes,36,156.2,77,26.55,...,126,18.32,279.1,83,12.56,9.9,6,2.67,2,False.
3329,WV,68,415,370-3271,no,no,0,231.1,57,39.29,...,55,13.04,191.3,123,8.61,9.6,4,2.59,3,False.
3330,RI,28,510,328-8230,no,no,0,180.8,109,30.74,...,58,24.55,191.9,91,8.64,14.1,6,3.81,2,False.
3331,CT,184,510,364-6381,yes,no,0,213.8,105,36.35,...,84,13.57,139.2,137,6.26,5.0,10,1.35,2,False.
3332,TN,74,415,400-4344,no,yes,25,234.4,113,39.85,...,82,22.6,241.4,77,10.86,13.7,4,3.7,0,False.


* Cogiendo un subconjunto de filas que cumplen una condición

In [20]:
# Boolean masks, one value per row of the frame.
above_mean_day_mins = data["Day Mins"] > data["Day Mins"].mean()   # rows above the column mean
from_new_york = data["State"] == "NY"                              # rows whose state is NY

# Only the last expression of the cell is rendered.
data[above_mean_day_mins]
data[from_new_york]
# Combine conditions element-wise with & (each comparison builds its own mask).
data[from_new_york & above_mean_day_mins]

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
15,NY,161,415,351-7269,no,no,0,332.9,67,56.59,...,97,27.01,160.6,128,7.23,5.4,9,1.46,4,True.
551,NY,157,415,421-1189,no,no,0,224.5,111,38.17,...,99,17.06,116.6,118,5.25,11.5,2,3.11,2,False.
650,NY,140,415,333-8180,no,no,0,235.5,81,40.04,...,130,21.86,103.1,111,4.64,11.5,4,3.11,2,False.
678,NY,65,510,383-8878,no,no,0,195.4,110,33.22,...,109,15.4,178.5,105,8.03,8.9,4,2.4,0,False.
754,NY,41,415,393-9985,no,no,0,209.9,105,35.68,...,105,10.36,253.7,104,11.42,9.6,4,2.59,1,False.
964,NY,9,408,353-1941,no,yes,31,193.8,130,32.95,...,98,17.22,191.2,102,8.6,13.3,2,3.59,1,False.
985,NY,64,415,345-9140,yes,no,0,346.8,55,58.96,...,79,21.21,275.4,102,12.39,13.3,9,3.59,1,True.
994,NY,147,510,421-7205,no,yes,33,251.5,107,42.76,...,110,19.9,213.4,87,9.6,10.4,6,2.81,3,False.
1044,NY,94,510,417-3046,yes,no,0,243.2,109,41.34,...,88,12.5,94.9,99,4.27,7.2,4,1.94,4,False.
1065,NY,68,415,349-4762,no,yes,29,239.5,82,40.72,...,105,17.32,167.8,70,7.55,9.9,6,2.67,0,False.


* Seleccionando un subconjunto por filas y columnas a la vez

In [40]:
# Chaining a row slice with a column list works in either order (rows first or columns first).
data[:25][['Day Mins', 'Night Mins', 'Account Length']]

# DataFrame.ix was deprecated in pandas 0.20 and removed in pandas 1.0, where using it
# raises AttributeError, so the explicit indexers below are used instead.
data.iloc[:10, 3:6]                                      # by position: the slice end is excluded (rows 0-9)
# by label: the slice end is INCLUDED, so :9 gives the same ten rows as iloc[:10]
# (the frame has the default 0..n-1 index produced by read_csv).
data.loc[:9, ['Phone', 'Int\'l Plan', 'VMail Plan']]

Unnamed: 0,Phone,Int'l Plan,VMail Plan
0,382-4657,no,yes
1,371-7191,no,yes
2,358-1921,no,no
3,375-9999,yes,no
4,330-6626,yes,no
5,391-8027,yes,no
6,355-9993,no,yes
7,329-9001,yes,no
8,335-4719,no,no
9,330-8173,yes,yes


## Creación de nuevas columnas
Si queremos por ejemplo los minutos de todo el día sin separar en day, eve y night

In [44]:
# Add a column with the minutes of the whole day: day + night + evening, summed element-wise.
# Series.add without fill_value is the method form of the + operator.
data['Total Mins'] = data['Day Mins'].add(data['Night Mins']).add(data['Eve Mins'])
data['Total Mins'].head()

0    707.2
1    611.5
2    527.2
3    558.2
4    501.9
Name: Total Mins, dtype: float64

## Generación de números aleatorios
Podemos generar un número entero aleatorio en un rango (el límite superior no se incluye):

In [66]:
# Draw one random integer from the half-open range [1, 100): 1 can appear, 100 cannot.
np.random.randint(low=1, high=100)

3

Pero lo más habitual es sacar un número aleatorio entre 0 y 1:

In [74]:
# Draw a single random float from the half-open interval [0.0, 1.0);
# size=None (the default) returns one scalar instead of an array.
np.random.random(size=None)

0.7072821226308064

## Shuffling
Reordenar aleatoriamente un conjunto de datos

In [84]:
# Build the integers 0..99 and reorder them randomly.
lista = np.arange(0, 100)
# shuffle permutes the array in place and returns None, so the array itself
# is placed on the last line of the cell to display it.
np.random.shuffle(lista)
lista

array([24, 20, 82, 54,  2, 98, 92, 10, 45, 88, 67, 13,  4, 16, 78, 56, 17,
       44, 65, 90, 58, 52, 91, 93, 21, 37, 41, 39,  7, 89, 86, 73, 42, 30,
       72, 59, 55, 75, 48, 40, 76, 80, 47, 83, 95, 94, 18, 38, 36,  8, 64,
       69, 85, 61, 11, 14, 68, 87,  9, 60, 62, 77, 49, 57, 32, 66, 46, 79,
       50,  1,  3, 99, 35, 28, 15, 26,  0, 74, 19, 29, 53, 31, 81,  6, 25,
       97, 96, 43, 63, 71, 51, 84, 70, 34,  5, 27, 33, 23, 22, 12])

Para elegir algo aleatoriamente sin usar shuffle (en este caso una columna al azar)

In [101]:
# Pick one column label at random from the list of the frame's columns.
np.random.choice(data.columns.tolist())

'VMail Message'

#### Semilla de la generación aleatoria
Si fijamos la semilla podemos reproducir la "aleatoriedad": por muchas veces que ejecutemos la celda, siempre saldrán los mismos números.

In [123]:
# Seed NumPy's global generator so the five draws below are the same on every run.
np.random.seed(seed=2018)
# Draw five floats one after another and print each on its own line.
print(*(np.random.random() for _ in range(5)), sep="\n")

0.8823493117539459
0.10432773786047767
0.9070093335163405
0.3063988986063515
0.446408872427422
