## Resumen de los datos: dimensiones y estructuras

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Local location of the dataset: base directory and file name, joined into one relative path.
dspath = "../../datasets"
filename = "titanic/titanic3.csv"
titanic3 = os.path.join(dspath, filename)

# Remote location of titanic3.csv (raw GitHub URL).
titanic3_URL = "https://raw.githubusercontent.com/joanby/python-ml-course/master/datasets/titanic/titanic3.csv"

In [3]:
# Read the CSV from the remote URL (needs network access); the local path `titanic3` is not used here.
data = pd.read_csv(titanic3_URL)

In [4]:
# Dimensions as (rows, columns), then the column labels as an array.
print (data.shape)
print (data.columns.values)
# Bare last expression: the notebook renders the frame itself.
data

(1309, 14)
['pclass' 'survived' 'name' 'sex' 'age' 'sibsp' 'parch' 'ticket' 'fare'
 'cabin' 'embarked' 'boat' 'body' 'home.dest']


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


## Vamos a hacer un resumen de los estadísticos básicos

In [5]:
# Print the dtype of every column, then render summary statistics of the numeric columns.
print(data.dtypes)
data.describe()

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object


Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


## Missing Values

In [6]:
pd.isnull(data["cabin"])              # the inverse also exists: notnull

0       False
1       False
2       False
3       False
4       False
        ...  
1304     True
1305     True
1306     True
1307     True
1308     True
Name: cabin, Length: 1309, dtype: bool

**values** extrae de una *serie* (índice + datos) su *array* de NumPy (sólo datos) y **ravel** aplana ese array a una dimensión.
No siempre se necesita, pero si se pone evitamos errores.

In [7]:
# Each line counts the missing (NaN) entries of one column.
print (pd.isnull(data["body"]).values.ravel().sum())   # number of bodies not found
print (data["cabin"].isnull().values.ravel().sum())    # number of cabins not recorded
print (data["age"].isnull().values.ravel().sum())    # unknown age

1188
1014
263



Los valores que faltan en un dataset pueden venir por dos razones:
- extracción de los datos
- recolección de los datos

### Qué hacer con datos perdidos

#### Borrado de datos que faltan

**dropna**: Borra filas o columnas con NaN; axis=0 borra la *fila*, axis=1 borra la *columna*, de acuerdo al criterio declarado en *how* ("any" o "all").
#### Relleno de los datos que faltan
**fillna**: Sustituye los NaN con un *número* o una *palabra*. Admite ffill (rellena con el valor no nulo anterior) y bfill (rellena con el valor no nulo siguiente)

In [8]:
# NOTE: plain assignment, so data2 is a second name for the same DataFrame object, not a copy.
data2 = data
# Neither dropna result is assigned and inplace is not requested, so data2 keeps all its rows;
# only the value of the last expression is rendered by the notebook.
data2.dropna(axis=0, how = "all")      # drop a row if all of its columns are NaN
data2.dropna(axis=0, how = "any")      # drop a row if any column is NaN *no rows are left*

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest


In [9]:
# Replacement for the missing entries of each column: 0 for "body",
# the label "Unknown" for the text columns and the column mean for "age".
fill_values = {
    "body": 0,
    "home.dest": "Unknown",
    "cabin": "Unknown",
    "boat": "Unknown",
    "age": data["age"].mean(),
}
for column, value in fill_values.items():
    # Assign back column by column so the existing frame object is updated.
    data[column] = data[column].fillna(value)

In [10]:
# Render the frame to inspect the filled-in values.
data

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.000000,0,0,24160,211.3375,B5,S,2,0.0,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.916700,1,2,113781,151.5500,C22 C26,S,11,0.0,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.000000,1,2,113781,151.5500,C22 C26,S,Unknown,0.0,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.000000,1,2,113781,151.5500,C22 C26,S,Unknown,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.000000,1,2,113781,151.5500,C22 C26,S,Unknown,0.0,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.500000,1,0,2665,14.4542,Unknown,C,Unknown,328.0,Unknown
1305,3,0,"Zabour, Miss. Thamine",female,29.881135,1,0,2665,14.4542,Unknown,C,Unknown,0.0,Unknown
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.500000,0,0,2656,7.2250,Unknown,C,Unknown,304.0,Unknown
1307,3,0,"Zakarian, Mr. Ortin",male,27.000000,0,0,2670,7.2250,Unknown,C,Unknown,0.0,Unknown


## Variables Dummy 

In [11]:
# The categorical column as stored, before any encoding.
data["sex"]

0       female
1         male
2       female
3         male
4       female
         ...  
1304    female
1305    female
1306      male
1307      male
1308      male
Name: sex, Length: 1309, dtype: object

In [12]:
# One indicator column per category of "sex", each named with the "sex" prefix.
dummy_sex =pd.get_dummies(data["sex"], prefix="sex")
dummy_sex

Unnamed: 0,sex_female,sex_male
0,1,0
1,0,1
2,1,0
3,0,1
4,1,0
...,...,...
1304,1,0
1305,1,0
1306,0,1
1307,0,1


In [13]:
# Swap the original "sex" column for its indicator columns.
# Re-running this cell fails: after the first run "sex" is no longer in the frame.
data = data.drop(columns=["sex"])
data = pd.concat([data, dummy_sex], axis="columns")
data

Unnamed: 0,pclass,survived,name,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,sex_female,sex_male
0,1,1,"Allen, Miss. Elisabeth Walton",29.000000,0,0,24160,211.3375,B5,S,2,0.0,"St Louis, MO",1,0
1,1,1,"Allison, Master. Hudson Trevor",0.916700,1,2,113781,151.5500,C22 C26,S,11,0.0,"Montreal, PQ / Chesterville, ON",0,1
2,1,0,"Allison, Miss. Helen Loraine",2.000000,1,2,113781,151.5500,C22 C26,S,Unknown,0.0,"Montreal, PQ / Chesterville, ON",1,0
3,1,0,"Allison, Mr. Hudson Joshua Creighton",30.000000,1,2,113781,151.5500,C22 C26,S,Unknown,135.0,"Montreal, PQ / Chesterville, ON",0,1
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.000000,1,2,113781,151.5500,C22 C26,S,Unknown,0.0,"Montreal, PQ / Chesterville, ON",1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",14.500000,1,0,2665,14.4542,Unknown,C,Unknown,328.0,Unknown,1,0
1305,3,0,"Zabour, Miss. Thamine",29.881135,1,0,2665,14.4542,Unknown,C,Unknown,0.0,Unknown,1,0
1306,3,0,"Zakarian, Mr. Mapriededer",26.500000,0,0,2656,7.2250,Unknown,C,Unknown,304.0,Unknown,0,1
1307,3,0,"Zakarian, Mr. Ortin",27.000000,0,0,2670,7.2250,Unknown,C,Unknown,0.0,Unknown,0,1


In [14]:
def createDummies(dataframe, var_name):
    """Replace a categorical column with its dummy (indicator) columns.

    Args:
        dataframe: The pandas DataFrame holding the column to encode. It is
            not modified; a new frame is built and returned.
        var_name: Label of the column to encode. It is also used as the
            prefix of the generated indicator columns.

    Returns:
        A new DataFrame without ``var_name`` and with the indicator columns
        appended on the right. If ``var_name`` is not a column of
        ``dataframe``, a message is printed and ``dataframe`` is returned
        unchanged.
    """
    try:
        # Read the column from the frame that was passed in, not from a
        # notebook-level global, so the helper works on any DataFrame.
        dummy = pd.get_dummies(dataframe[var_name], prefix=var_name)
        dataframe = dataframe.drop([var_name], axis = 1)
        dataframe = pd.concat([dataframe, dummy], axis = 1)
    except KeyError as e:
        # Best effort: report the missing column and fall through to return the input as-is.
        print("Key {} not found in dataframe: KeyError""{}""".format(var_name, e))
    return dataframe
    
# Not applied to "sex": that column was already replaced by its dummies in an earlier cell.
# The helper returns a new frame, so the result is rebound to `data`.
data = createDummies (data, "embarked")


In [15]:
# Render the frame to inspect the appended "embarked_*" indicator columns.
data

Unnamed: 0,pclass,survived,name,age,sibsp,parch,ticket,fare,cabin,boat,body,home.dest,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,1,1,"Allen, Miss. Elisabeth Walton",29.000000,0,0,24160,211.3375,B5,2,0.0,"St Louis, MO",1,0,0,0,1
1,1,1,"Allison, Master. Hudson Trevor",0.916700,1,2,113781,151.5500,C22 C26,11,0.0,"Montreal, PQ / Chesterville, ON",0,1,0,0,1
2,1,0,"Allison, Miss. Helen Loraine",2.000000,1,2,113781,151.5500,C22 C26,Unknown,0.0,"Montreal, PQ / Chesterville, ON",1,0,0,0,1
3,1,0,"Allison, Mr. Hudson Joshua Creighton",30.000000,1,2,113781,151.5500,C22 C26,Unknown,135.0,"Montreal, PQ / Chesterville, ON",0,1,0,0,1
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.000000,1,2,113781,151.5500,C22 C26,Unknown,0.0,"Montreal, PQ / Chesterville, ON",1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",14.500000,1,0,2665,14.4542,Unknown,Unknown,328.0,Unknown,1,0,1,0,0
1305,3,0,"Zabour, Miss. Thamine",29.881135,1,0,2665,14.4542,Unknown,Unknown,0.0,Unknown,1,0,1,0,0
1306,3,0,"Zakarian, Mr. Mapriededer",26.500000,0,0,2656,7.2250,Unknown,Unknown,304.0,Unknown,0,1,1,0,0
1307,3,0,"Zakarian, Mr. Ortin",27.000000,0,0,2670,7.2250,Unknown,Unknown,0.0,Unknown,0,1,1,0,0
