## CheatSheet - Exploración de datos usando Pandas 


In [2]:
import pandas as pd

## 1. Lectura de los datos

In [3]:
# Load the Titanic training set into a DataFrame. The path is relative, so
# "titanic_train.csv" must be in the notebook's working directory.
data = pd.read_csv("titanic_train.csv")

## 2. Obtener una primera vista de los datos

In [4]:
# Size of the dataset as a (rows, columns) tuple.
data.shape

(891, 12)

In [5]:
# Column labels of the dataset.
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
# Data type of each column (text columns are reported as `object`).
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [7]:
# Show the first two rows.
data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [8]:
# Show the last two rows.
data.tail(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [9]:
# Show two rows picked at random.
# NOTE(review): no random_state is passed, so every run returns different
# rows; pass e.g. random_state=<seed> if the output must be reproducible.
data.sample(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
218,219,1,1,"Bazzani, Miss. Albina",female,32.0,0,0,11813,76.2917,D15,C
487,488,0,1,"Kent, Mr. Edward Austin",male,58.0,0,0,11771,29.7,B37,C


## 3. Renombrar una columna

In [10]:
# Rename the "PassengerId" column to "ID".
# rename() returns a new DataFrame, so the result is assigned back to `data`
# to keep the change. This replaces inplace=True, which pandas discourages and
# which prevents method chaining; re-running the cell is still harmless because
# labels that are not found are ignored.
data = data.rename(columns={"PassengerId": "ID"})

## 4. Seleccionar y filtrar

In [11]:
# Passing a list of column names inside the brackets selects that subset of
# columns; sample(5) then shows five of its rows picked at random.
# NOTE(review): unseeded sampling, so the rows shown change on every run.
data[['Name','Age']].sample(5)

Unnamed: 0,Name,Age
216,"Honkanen, Miss. Eliina",27.0
719,"Johnson, Mr. Malkolm Joackim",33.0
580,"Christy, Miss. Julie Rachel",25.0
475,"Clifford, Mr. George Quincy",
214,"Kiernan, Mr. Philip",


In [12]:
# Keep only the rows whose Fare is greater than 400 (boolean-mask filter).
data.loc[data["Fare"].gt(400)]

Unnamed: 0,ID,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
258,259,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,C
679,680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.3292,B51 B53 B55,C
737,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C


Pandas provee un método general muy versátil para seleccionar elementos en base a las etiquetas. 
Esto se hace a través del atributo .loc.

In [13]:
# Label-based selection with .loc: rows where Pclass equals 3, restricted to
# the Name and Fare columns.
boletos_3era_clase = data.loc[data['Pclass'].eq(3), ['Name', 'Fare']]

In [14]:
# Name and fare paid by the passengers who travelled in 3rd class.
boletos_3era_clase

Unnamed: 0,Name,Fare
0,"Braund, Mr. Owen Harris",7.2500
2,"Heikkinen, Miss. Laina",7.9250
4,"Allen, Mr. William Henry",8.0500
5,"Moran, Mr. James",8.4583
7,"Palsson, Master. Gosta Leonard",21.0750
...,...,...
882,"Dahlberg, Miss. Gerda Ulrika",10.5167
884,"Sutehall, Mr. Henry Jr",7.0500
885,"Rice, Mrs. William (Margaret Norton)",29.1250
888,"Johnston, Miss. Catherine Helen ""Carrie""",23.4500


A su vez, existe el indexador `.iloc` para acceder a los elementos según sus posiciones. En este caso los
números representan las posiciones (contadas desde 0) y no las etiquetas.

In [15]:
# Position-based access: the value at row position 125 and column position 4.
# Positions are zero-based, so column position 4 is the fifth column.
data.iloc[125,4]

'male'

## 5. Manejo de valores faltantes

Contar los valores faltantes de cada columna

In [16]:
# Number of missing values in each column (isna() flags them, sum() adds the
# flags up column by column).
print(data.isna().sum())

ID            0
Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64


Rellenar los valores faltantes de la columna edad con la media (redondeada al entero más cercano)

In [17]:
# Mean of the Age column, rounded to a whole number; mean() skips the missing
# values. The last line displays the result.
media_edad = round(data.loc[:, 'Age'].mean())
media_edad

30

In [18]:
# Replace the missing values of the Age column with `media_edad`.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on `data['Age']` works on an intermediate object,
# which raises a FutureWarning in pandas 2.x and leaves `data` unchanged once
# Copy-on-Write is enabled (the default in pandas 3).
data['Age'] = data['Age'].fillna(media_edad)

In [19]:
# Check that the Age column no longer contains missing values.
print(data.isna().sum())

ID            0
Survived      0
Pclass        0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64


## 6. Crear una nueva columna

In [20]:
# Add a derived column: each Fare multiplied by 1.25 (the fare plus 25%).
data['Tarifa_con_impuestos'] = data['Fare'].mul(1.25)

In [21]:
# Show the first three rows to check the newly created column.
data.head(3)

Unnamed: 0,ID,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Tarifa_con_impuestos
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,9.0625
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,89.104125
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,9.90625


## 7. Identificar valores únicos

In [22]:
# Distinct values found in the Sex column.
data['Sex'].unique()

array(['male', 'female'], dtype=object)

In [23]:
# Distinct values found in the Pclass column, in order of first appearance.
data['Pclass'].unique()

array([3, 1, 2], dtype=int64)

## 8. Realizar cálculos y agrupaciones

In [25]:
# Average passenger age for each class, rounded to zero decimals.
data.groupby('Pclass')[['Age']].mean().round()

Unnamed: 0_level_0,Age
Pclass,Unnamed: 1_level_1
1,37.0
2,30.0
3,26.0


In [26]:
# Number of passengers in each class (non-null ID values per Pclass group).
data.groupby('Pclass')[['ID']].count()

Unnamed: 0_level_0,ID
Pclass,Unnamed: 1_level_1
1,216
2,184
3,491


In [27]:
# Number of male passengers: summing the boolean mask counts its True entries.
data['Sex'].eq('male').sum()

577

In [30]:
# Number of passengers who did not survive (rows where Survived equals 0).
data['Survived'].eq(0).sum()

549

## 9. Estadísticos básicos

In [31]:
# Basic summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
data.describe()

Unnamed: 0,ID,Survived,Pclass,Age,SibSp,Parch,Fare,Tarifa_con_impuestos
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.758889,0.523008,0.381594,32.204208,40.25526
std,257.353842,0.486592,0.836071,13.00257,1.102743,0.806057,49.693429,62.116786
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104,9.888
50%,446.0,0.0,3.0,30.0,0.0,0.0,14.4542,18.06775
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0,38.75
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,640.4115


In [32]:
# Pairwise correlation between the numeric columns, excluding NA/null values.
# numeric_only=True is required because `data` also holds text columns
# (Name, Sex, Ticket, ...): since pandas 2.0 the default is numeric_only=False,
# so a bare corr() raises ValueError when it tries to convert those strings.
# The parameter is available from pandas 1.5 onwards.
data.corr(numeric_only=True)

Unnamed: 0,ID,Survived,Pclass,Age,SibSp,Parch,Fare,Tarifa_con_impuestos
ID,1.0,-0.005007,-0.035144,0.033019,-0.057527,-0.001652,0.012658,0.012658
Survived,-0.005007,1.0,-0.338481,-0.070657,-0.035322,0.081629,0.257307,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.329727,0.083081,0.018443,-0.5495,-0.5495
Age,0.033019,-0.070657,-0.329727,1.0,-0.23244,-0.18033,0.090632,0.090632
SibSp,-0.057527,-0.035322,0.083081,-0.23244,1.0,0.414838,0.159651,0.159651
Parch,-0.001652,0.081629,0.018443,-0.18033,0.414838,1.0,0.216225,0.216225
Fare,0.012658,0.257307,-0.5495,0.090632,0.159651,0.216225,1.0,1.0
Tarifa_con_impuestos,0.012658,0.257307,-0.5495,0.090632,0.159651,0.216225,1.0,1.0


## by Maria Belen Camandone 