# Limpieza Events

En este notebook se realiza una limpieza de la información del archivo events.csv: buscamos que los tipos de las columnas sean correctos y ocupen el menor espacio posible, y analizamos qué datos son relevantes y cuáles no.

In [1]:
import pandas as pd
import numpy as np

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# which would also hide any pandas warning raised while loading the CSV — consider
# narrowing the filter to the specific category that is noisy.
warnings.filterwarnings('ignore')

In [2]:
# Load the events dump. Compression is given explicitly (the file ends in
# '.gzip' rather than '.gz'), narrow dtypes are requested up front for the
# columns listed below, and 'date' is parsed as a datetime column.
events = pd.read_csv(
    '../Data/events.csv.gzip',
    compression='gzip',
    dtype={
        'event_id': np.int8,
        'application_id': np.int32,
        'connection_type': 'category',
    },
    parse_dates=['date'],
)

In [3]:
# Preview the first rows to check that the file loaded as expected.
events.head()

Unnamed: 0,date,event_id,ref_type,ref_hash,application_id,attributed,device_countrycode,device_os_version,device_brand,device_model,...,trans_id,user_agent,event_uuid,carrier,kind,device_os,wifi,connection_type,ip_address,device_language
0,2019-03-05 00:09:36.966,0,1891515180541284343,2688759737656491380,38,False,6333597102633388268,5.908703e+17,,5.990117e+18,...,,,a9c0b263-acb2-4577-92c5-cbde5d7a5db1,2.248157e+17,5.516623e+18,7.531669e+18,,Cable/DSL,7858558567428669000,4.077062e+17
1,2019-03-05 00:09:38.920,1,1891515180541284343,2688759737656491380,38,False,6333597102633388268,5.908703e+17,,5.990117e+18,...,,,1cd98205-0d97-4ec2-a019-667997dbfe7a,2.248157e+17,9.97766e+17,7.531669e+18,,Cable/DSL,7858558567428669000,4.077062e+17
2,2019-03-05 00:09:26.195,0,1891515180541284343,2688759737656491380,38,False,6333597102633388268,5.908703e+17,,5.990117e+18,...,,,f02e2924-21ae-492b-b625-9021ae0a4eca,2.248157e+17,5.516623e+18,7.531669e+18,,Cable/DSL,7858558567428669000,4.077062e+17
3,2019-03-05 00:09:31.107,2,1891515180541284343,2688759737656491380,38,False,6333597102633388268,5.908703e+17,,5.990117e+18,...,,,a813cf45-a36e-4668-85e2-5395f1564e98,2.248157e+17,8.561153e+18,7.531669e+18,,Cable/DSL,6324037615828123965,4.077062e+17
4,2019-03-09 21:00:36.585,3,1891515180541284343,2635154697734164782,38,False,6333597102633388268,7.391844e+18,,5.960896e+18,...,,,63a4f0aa-e147-469f-8c55-4ca4f8d0e310,2.248157e+17,8.731902e+17,7.531669e+18,,Cable/DSL,2894495631302821483,3.301378e+18


In [4]:
# (rows, columns) of the loaded frame.
events.shape

(2494423, 22)

In [5]:
# Number of non-null values per column, sorted ascending (sparsest columns first).
# DataFrame.count() counts non-NA cells directly, so no hard-coded row total is
# needed and the result stays correct if the input file changes size.
events.count().sort_values()

trans_id                   82
connection_type        612463
device_city            614698
carrier                616434
device_os              657667
device_os_version     1022066
device_brand          1164963
wifi                  1378872
user_agent            1391527
device_model          2406456
device_language       2406604
session_user_agent    2482637
event_uuid            2489324
kind                  2489324
device_countrycode    2494423
attributed            2494423
application_id        2494423
ref_hash              2494423
ref_type              2494423
event_id              2494423
ip_address            2494423
date                  2494423
dtype: int64

In [6]:
# device_countrycode holds a single distinct value across the whole column.
events['device_countrycode'].value_counts()

6333597102633388268    2494423
Name: device_countrycode, dtype: int64

In [7]:
# Frequency of each distinct trans_id value (value_counts skips missing entries by default).
events['trans_id'].value_counts()

{hash}                                                                                                           33
0                                                                                                                16
103430dcab4b60eb4f                                                                                                9
433f38e2c758468ab632dcab7281d4be_Y2NhPTEwLzI1LzIwMTggMTA6Mjk6MjUgUE0mb2ZmZXJJZD0zMzQ1NjQ0NiZhZmZJZD0yMjMyNzUx     7
210a4c5786d249c78bb30237abcac890_Y2NhPTQvMjEvMjAxOCA1OjI2OjM3IFBNJm9mZmVySWQ9MzM0NTY0NDYmYWZmSWQ9MTY2MTgxNQ==     6
1901171053a509cd7317f2c6                                                                                          2
0941bb7b-866f-4d5a-9b85-63e77b27d562                                                                              2
77ca31a9-b0e0-4884-8de8-c2ee74f1cc32                                                                              2
58ee68fa77874f8785ecbe8cf74e14de_Y2NhPTQvMjUvMjAxOCA5OjI5OjE4IFBNJm9mZmV

Como de los 2494423 registros que tenemos solo 82 tienen información en la columna "trans_id", esta no aporta prácticamente nada al análisis exploratorio. Por lo tanto, decidimos descartar dicha columna.

In [8]:
# Drop the almost-empty trans_id column. Rebinding the name (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run
# when the column has already been removed.
events = events.drop(columns='trans_id', errors='ignore')

In [9]:
# Preview the frame again after removing trans_id.
events.head()

Unnamed: 0,date,event_id,ref_type,ref_hash,application_id,attributed,device_countrycode,device_os_version,device_brand,device_model,...,session_user_agent,user_agent,event_uuid,carrier,kind,device_os,wifi,connection_type,ip_address,device_language
0,2019-03-05 00:09:36.966,0,1891515180541284343,2688759737656491380,38,False,6333597102633388268,5.908703e+17,,5.990117e+18,...,7.164321e+18,,a9c0b263-acb2-4577-92c5-cbde5d7a5db1,2.248157e+17,5.516623e+18,7.531669e+18,,Cable/DSL,7858558567428669000,4.077062e+17
1,2019-03-05 00:09:38.920,1,1891515180541284343,2688759737656491380,38,False,6333597102633388268,5.908703e+17,,5.990117e+18,...,7.164321e+18,,1cd98205-0d97-4ec2-a019-667997dbfe7a,2.248157e+17,9.97766e+17,7.531669e+18,,Cable/DSL,7858558567428669000,4.077062e+17
2,2019-03-05 00:09:26.195,0,1891515180541284343,2688759737656491380,38,False,6333597102633388268,5.908703e+17,,5.990117e+18,...,7.164321e+18,,f02e2924-21ae-492b-b625-9021ae0a4eca,2.248157e+17,5.516623e+18,7.531669e+18,,Cable/DSL,7858558567428669000,4.077062e+17
3,2019-03-05 00:09:31.107,2,1891515180541284343,2688759737656491380,38,False,6333597102633388268,5.908703e+17,,5.990117e+18,...,7.164321e+18,,a813cf45-a36e-4668-85e2-5395f1564e98,2.248157e+17,8.561153e+18,7.531669e+18,,Cable/DSL,6324037615828123965,4.077062e+17
4,2019-03-09 21:00:36.585,3,1891515180541284343,2635154697734164782,38,False,6333597102633388268,7.391844e+18,,5.960896e+18,...,7.164321e+18,,63a4f0aa-e147-469f-8c55-4ca4f8d0e310,2.248157e+17,8.731902e+17,7.531669e+18,,Cable/DSL,2894495631302821483,3.301378e+18


In [10]:
# Resulting dtype of every remaining column.
events.dtypes

date                  datetime64[ns]
event_id                        int8
ref_type                       int64
ref_hash                       int64
application_id                 int32
attributed                      bool
device_countrycode             int64
device_os_version            float64
device_brand                 float64
device_model                 float64
device_city                  float64
session_user_agent           float64
user_agent                   float64
event_uuid                    object
carrier                      float64
kind                         float64
device_os                    float64
wifi                          object
connection_type             category
ip_address                     int64
device_language              float64
dtype: object

Para ejecutar este notebook desde otro notebook:
    
    %run Limpieza_Events.ipynb