# RUTINA DE PREPROCESAMIENTO DE DATOS

[Dataset Search](https://datasetsearch.research.google.com/)

In [12]:
import pandas as pd

pd.options.display.float_format = '{:_.1f}'.format

## Analizando un DF

In [13]:
df_meteorites = pd.read_csv('files/Meteorite_Landings.csv')
df_meteorites.sample(4)

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation
41126,Yamato 791934,27283,Valid,H4,40.0,Found,01/01/1979 12:00:00 AM,-71.5,35.7,"(-71.5, 35.66667)"
31144,Park,18104,Valid,L6,13_000.0,Found,01/01/1969 12:00:00 AM,39.1,-100.4,"(39.11, -100.36167)"
2453,Allan Hills A77014,1329,Valid,H5,308.8,Found,01/01/1977 12:00:00 AM,-76.7,159.7,"(-76.71667, 159.66667)"
5857,Dar al Gani 104,5654,Valid,H6,269.0,Found,01/01/1996 12:00:00 AM,27.1,16.1,"(27.13533, 16.052)"


In [14]:
df_meteorites.shape

(45716, 10)

In [15]:
df_meteorites.describe()

Unnamed: 0,id,mass (g),reclat,reclong
count,45_716.0,45_585.0,38_401.0,38_401.0
mean,26_889.7,13_278.1,-39.1,61.1
std,16_860.7,574_988.9,46.4,80.6
min,1.0,0.0,-87.4,-165.4
25%,12_688.8,7.2,-76.7,0.0
50%,24_261.5,32.6,-71.5,35.7
75%,40_656.8,202.6,0.0,157.2
max,57_458.0,60_000_000.0,81.2,354.5


In [16]:
df_meteorites.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         45716 non-null  object 
 1   id           45716 non-null  int64  
 2   nametype     45716 non-null  object 
 3   recclass     45716 non-null  object 
 4   mass (g)     45585 non-null  float64
 5   fall         45716 non-null  object 
 6   year         45425 non-null  object 
 7   reclat       38401 non-null  float64
 8   reclong      38401 non-null  float64
 9   GeoLocation  38401 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 3.5+ MB


Nueva funcionalidad que permite encontrar el formato más adecudo par las varaibles y convertirlas.

Int64 a diferencia de int64 (minuscula) puede entender los NaN

In [17]:
df_meteorites = df_meteorites.convert_dtypes()
df_meteorites.dtypes

name            string
id               Int64
nametype        string
recclass        string
mass (g)       Float64
fall            string
year            string
reclat         Float64
reclong        Float64
GeoLocation     string
dtype: object

## Categorización

conteo de valores distintos en cada columna

In [18]:
df_meteorites.nunique()

name           45716
id             45716
nametype           2
recclass         466
mass (g)       12576
fall               2
year             266
reclat         12738
reclong        14640
GeoLocation    17100
dtype: int64

*nametype* y *fall* son variables cetegóricas.

In [19]:
df_meteorites.nametype.unique()

<StringArray>
['Valid', 'Relict']
Length: 2, dtype: string

In [20]:
df_meteorites.fall.unique()

<StringArray>
['Fell', 'Found']
Length: 2, dtype: string

In [21]:
df_meteorites['fall'].unique()

<StringArray>
['Fell', 'Found']
Length: 2, dtype: string

In [36]:
df_meteorites['fall'].value_counts()

Found    44609
Fell      1107
Name: fall, dtype: int64

The function is a shortcut, as it is actually a groupby operation in combination with counting of the number of records within each group:

```
titanic.groupby("fall")["fall"].count()
```

Vamos a convertir el tipo de datos de estas columnas a tipo *category*.

In [23]:
df_meteorites[['nametype','fall']] = df_meteorites[['nametype','fall']].astype('category')
df_meteorites.dtypes

name             string
id                Int64
nametype       category
recclass         string
mass (g)        Float64
fall           category
year             string
reclat          Float64
reclong         Float64
GeoLocation      string
dtype: object

In [24]:
pd.get_dummies(df_meteorites['fall'])

Unnamed: 0,Fell,Found
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
45711,0,1
45712,0,1
45713,0,1
45714,0,1


Con esto estamos finalmente categorizando la columna *fall* en dos columnas (fell y found). Esto es el mapeo de variables categóricas que vimos en el curso de probabilidad.

In [25]:
df_meteorites[['fell','found']] = pd.get_dummies(df_meteorites['fall'])
df_meteorites.head(4)

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation,fell,found
0,Aachen,1,Valid,L5,21.0,Fell,01/01/1880 12:00:00 AM,50.8,6.1,"(50.775, 6.08333)",1,0
1,Aarhus,2,Valid,H6,720.0,Fell,01/01/1951 12:00:00 AM,56.2,10.2,"(56.18333, 10.23333)",1,0
2,Abee,6,Valid,EH4,107_000.0,Fell,01/01/1952 12:00:00 AM,54.2,-113.0,"(54.21667, -113.0)",1,0
3,Acapulco,10,Valid,Acapulcoite,1_914.0,Fell,01/01/1976 12:00:00 AM,16.9,-99.9,"(16.88333, -99.9)",1,0


Ahora veamos la categorización de la columna *year*.

In [26]:
df_meteorites['year']

0        01/01/1880 12:00:00 AM
1        01/01/1951 12:00:00 AM
2        01/01/1952 12:00:00 AM
3        01/01/1976 12:00:00 AM
4        01/01/1902 12:00:00 AM
                  ...          
45711    01/01/1990 12:00:00 AM
45712    01/01/1999 12:00:00 AM
45713    01/01/1939 12:00:00 AM
45714    01/01/2003 12:00:00 AM
45715    01/01/1976 12:00:00 AM
Name: year, Length: 45716, dtype: string

In [27]:
pd.to_datetime(
    df_meteorites['year'], 
    errors='coerce',
    format='%m/%d/%Y %H:%M:%S %p'
)

0       1880-01-01 12:00:00
1       1951-01-01 12:00:00
2       1952-01-01 12:00:00
3       1976-01-01 12:00:00
4       1902-01-01 12:00:00
                ...        
45711   1990-01-01 12:00:00
45712   1999-01-01 12:00:00
45713   1939-01-01 12:00:00
45714   2003-01-01 12:00:00
45715   1976-01-01 12:00:00
Name: year, Length: 45716, dtype: datetime64[ns]

In [28]:
df_meteorites['year'] = pd.to_datetime(
    df_meteorites['year'], 
    errors='coerce',
    format='%m/%d/%Y %H:%M:%S %p'
)
df_meteorites.dtypes

name                   string
id                      Int64
nametype             category
recclass               string
mass (g)              Float64
fall                 category
year           datetime64[ns]
reclat                Float64
reclong               Float64
GeoLocation            string
fell                    uint8
found                   uint8
dtype: object

La categorización reduce el uso de memoria RAM y de almacenamiento.

## Renombramiento de las columnas

In [29]:
df_meteorites.rename(columns={'mass (g)':'mass'}, inplace=True)
df_meteorites.head(4)

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation,fell,found
0,Aachen,1,Valid,L5,21.0,Fell,1880-01-01 12:00:00,50.8,6.1,"(50.775, 6.08333)",1,0
1,Aarhus,2,Valid,H6,720.0,Fell,1951-01-01 12:00:00,56.2,10.2,"(56.18333, 10.23333)",1,0
2,Abee,6,Valid,EH4,107_000.0,Fell,1952-01-01 12:00:00,54.2,-113.0,"(54.21667, -113.0)",1,0
3,Acapulco,10,Valid,Acapulcoite,1_914.0,Fell,1976-01-01 12:00:00,16.9,-99.9,"(16.88333, -99.9)",1,0


## Borrar columa 

In [30]:
df_meteorites['ones'] = 1
df_meteorites.head(1)

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation,fell,found,ones
0,Aachen,1,Valid,L5,21.0,Fell,1880-01-01 12:00:00,50.8,6.1,"(50.775, 6.08333)",1,0,1


In [31]:
# df_meteorites.drop(['ones'], axis=1, inplace=True)
df_meteorites.drop(columns='ones', inplace=True)
df_meteorites.head(1)

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation,fell,found
0,Aachen,1,Valid,L5,21.0,Fell,1880-01-01 12:00:00,50.8,6.1,"(50.775, 6.08333)",1,0


## Borrar fila

In [32]:
# df_meteorites.drop([0,2,4,6]).head(5)
# df_meteorites.drop(index=[0,2,4,6]).head(5)
df_meteorites.drop([i for i in range(0, len(df_meteorites.index), 2)]).tail(5)

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation,fell,found
45707,Zhaoping,54609,Valid,"Iron, IAB complex",2_000_000.0,Found,1983-01-01 12:00:00,24.2,111.2,"(24.23333, 111.18333)",0,1
45709,Zhongxiang,30406,Valid,Iron,100_000.0,Found,1981-01-01 12:00:00,31.2,112.5,"(31.2, 112.5)",0,1
45711,Zillah 002,31356,Valid,Eucrite,172.0,Found,1990-01-01 12:00:00,29.0,17.0,"(29.037, 17.0185)",0,1
45713,Zlin,30410,Valid,H4,3.3,Found,1939-01-01 12:00:00,49.2,17.7,"(49.25, 17.66667)",0,1
45715,Zulu Queen,30414,Valid,L3.7,200.0,Found,1976-01-01 12:00:00,34.0,-115.7,"(33.98333, -115.68333)",0,1


## Copia del DF

In [33]:
df_copy = df_meteorites.copy()