# Data wrangling

In [3]:
import pandas as pd

In [4]:
data = pd.read_csv("python-ml-course-master/datasets/customer-churn-model/Customer Churn Model.txt")

In [5]:
data.head()

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


## Crear subconjuntos de datos

### Subconjunto de columnas

In [6]:
account_length = data["Account Length"]

In [7]:
account_length.head()

0    128
1    107
2    137
3     84
4     75
Name: Account Length, dtype: int64

In [8]:
type(account_length)

pandas.core.series.Series

In [9]:
subset = data[["Account Length", "Phone", "Eve Charge", "Day Calls"]]

In [10]:
subset.head()

Unnamed: 0,Account Length,Phone,Eve Charge,Day Calls
0,128,382-4657,16.78,110
1,107,371-7191,16.62,123
2,137,358-1921,10.3,114
3,84,375-9999,5.26,71
4,75,330-6626,12.61,113


In [11]:
type(subset)

pandas.core.frame.DataFrame

In [12]:
desired_columns = ["Account Length", "Phone", "Eve Charge", "Night Calls"]
subset = data[desired_columns]
subset.head()

Unnamed: 0,Account Length,Phone,Eve Charge,Night Calls
0,128,382-4657,16.78,91
1,107,371-7191,16.62,103
2,137,358-1921,10.3,104
3,84,375-9999,5.26,89
4,75,330-6626,12.61,121


In [13]:
# Keep every column that is NOT listed in desired_columns
# (in this cell the list names the columns to leave out).
desired_columns = ["Account Length", "VMail Message", "Night Calls"]
all_colum_list = data.columns.values.tolist()

columns_to_keep = [name for name in all_colum_list if name not in desired_columns]
sublist = data[columns_to_keep]
sublist.head()

Unnamed: 0,State,Area Code,Phone,Int'l Plan,VMail Plan,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,415,382-4657,no,yes,265.1,110,45.07,197.4,99,16.78,244.7,11.01,10.0,3,2.7,1,False.
1,OH,415,371-7191,no,yes,161.6,123,27.47,195.5,103,16.62,254.4,11.45,13.7,3,3.7,1,False.
2,NJ,415,358-1921,no,no,243.4,114,41.38,121.2,110,10.3,162.6,7.32,12.2,5,3.29,0,False.
3,OH,408,375-9999,yes,no,299.4,71,50.9,61.9,88,5.26,196.9,8.86,6.6,7,1.78,2,False.
4,OK,415,330-6626,yes,no,166.7,113,28.34,148.3,122,12.61,186.9,8.41,10.1,3,2.73,3,False.


### Subconjunto de filas

In [14]:
data[:3]

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.


#### Condiciones booleanas

In [21]:
# Customers whose "Day Mins" value is greater than 300
day_mins_over_300 = data["Day Mins"] > 300
data1 = data[day_mins_over_300]
data1.shape

(43, 21)

In [20]:
# Customers whose "State" value is "NY" (New York)
is_new_york = data["State"] == "NY"
data2 = data[is_new_york]
data2.shape

(83, 21)

In [23]:
# Day minutes, night minutes and account length for the first 50 rows
first_50_columns = ["Day Mins", "Night Mins", "Account Length"]
subset_first_50 = data[first_50_columns].head(50)
subset_first_50.head()

Unnamed: 0,Day Mins,Night Mins,Account Length
0,265.1,244.7,128
1,161.6,254.4,107
2,243.4,162.6,137
3,299.4,196.9,84
4,166.7,186.9,75


### Filtrado con ix -> iloc y loc
```
iloc -> selección por posición (índice entero)
loc -> selección por clave o etiqueta
```

In [29]:
# Positional selection with iloc: rows 1-9 and columns 3-5 (stop bounds are excluded).
# Legacy form kept for reference: data.ix[1:10, 3:6]
data.iloc[1:10, 3:6]

Unnamed: 0,Phone,Int'l Plan,VMail Plan
1,371-7191,no,yes
2,358-1921,no,no
3,375-9999,yes,no
4,330-6626,yes,no
5,391-8027,yes,no
6,355-9993,no,yes
7,329-9001,yes,no
8,335-4719,no,no
9,330-8173,yes,yes


In [32]:
# All rows, columns at positions 3, 4 and 5 (the stop index 6 is excluded)
data.iloc[:, 3:6].shape

(3333, 3)

In [33]:
# Rows at positions 1-9, columns chosen by an explicit list of positions (2, 5 and 7)
data.iloc[1:10, [2,5,7]].shape

(9, 3)

In [34]:
# Label-based selection with loc: rows labelled 1, 5, 9 and 16, and the two named columns
data.loc[[1,5,9,16], ["Area Code", "Day Mins"]]

Unnamed: 0,Area Code,Day Mins
1,415,161.6
5,510,223.4
9,415,258.6
16,408,196.4


### Insertar nuevas columnas en el dataframe

In [35]:
data["Total Mins"] = data["Day Mins"] + data["Night Mins"] + data["Eve Mins"]

In [38]:
data["Total Mins"].head()

0    707.2
1    611.5
2    527.2
3    558.2
4    501.9
Name: Total Mins, dtype: float64

In [40]:
data["Total Calls"] = data["Day Calls"] + data["Night Calls"] + data["Eve Calls"]

In [41]:
data.shape

(3333, 23)

## Generación de números aleatorios

In [42]:
import numpy as np

In [43]:
# Draw one random integer with np.random.randint(1, 100).
# NOTE(review): randint excludes its upper bound, so this yields 1..99; use
# randint(1, 101) if 100 must be a possible result.
np.random.randint(1,100)

67

In [45]:
# Draw one random float from the half-open interval [0.0, 1.0)
np.random.random()

0.5967153028164074

In [46]:
# Build a list of n random integers drawn from the closed interval [a, b].
def randint_list(n, a, b):
    """Return a list of ``n`` random integers, each in the closed interval [a, b].

    Parameters
    ----------
    n : int
        Number of values to generate; ``n <= 0`` yields an empty list.
    a : int
        Lower bound (inclusive).
    b : int
        Upper bound (inclusive).

    Returns
    -------
    list of int
        ``n`` independent draws obtained from ``np.random.randint``.

    Raises
    ------
    ValueError
        Raised by ``np.random.randint`` when ``a > b`` and ``n > 0``.
    """
    # np.random.randint excludes its upper bound, so pass b + 1 to make b reachable.
    return [np.random.randint(a, b + 1) for _ in range(n)]

In [47]:
randint_list(5, 1, 200)

[56, 112, 117, 64, 113]

In [50]:
# The standard-library random module offers comparable helpers.
# randrange(1, 100, 7) picks from 1, 8, 15, ..., 99 (start 1, step 7, stop 100 excluded).
import random
for i in range(10):
    print(random.randrange(1, 100, 7))

29
1
36
15
36
50
71
8
85
8


### Shuffling

In [55]:
a = np.arange(20)
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [56]:
np.random.shuffle(a)
a

array([ 5, 13, 16, 18, 11,  1, 17,  9, 14,  7, 12,  8,  3,  4, 10,  6, 19,
        2,  0, 15])

### Choice

In [58]:
data.shape

(3333, 23)

In [59]:
col_list = data.columns.values.tolist()
col_list

['State',
 'Account Length',
 'Area Code',
 'Phone',
 "Int'l Plan",
 'VMail Plan',
 'VMail Message',
 'Day Mins',
 'Day Calls',
 'Day Charge',
 'Eve Mins',
 'Eve Calls',
 'Eve Charge',
 'Night Mins',
 'Night Calls',
 'Night Charge',
 'Intl Mins',
 'Intl Calls',
 'Intl Charge',
 'CustServ Calls',
 'Churn?',
 'Total Mins',
 'Total Calls']

In [62]:
np.random.choice(col_list)

'Day Calls'

### Seed

In [69]:
# Seeding makes the pseudo-random sequence reproducible: the same seed
# always produces the same numbers.
np.random.seed(2018)
for i in range(5):
    print(np.random.random())

0.8823493117539459
0.10432773786047767
0.9070093335163405
0.3063988986063515
0.446408872427422
