In [1]:
import pandas as pd
import datetime
import numpy as np

In [2]:
df = pd.read_csv(
    "https://raw.githubusercontent.com/JackyP/testing/master/datasets/nycflights.csv",
    index_col=0
    )
df.head()

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
1,2013,1,1,517.0,2.0,830.0,11.0,UA,N14228,1545,EWR,IAH,227.0,1400,5.0,17.0
2,2013,1,1,533.0,4.0,850.0,20.0,UA,N24211,1714,LGA,IAH,227.0,1416,5.0,33.0
3,2013,1,1,542.0,2.0,923.0,33.0,AA,N619AA,1141,JFK,MIA,160.0,1089,5.0,42.0
4,2013,1,1,544.0,-1.0,1004.0,-18.0,B6,N804JB,725,JFK,BQN,183.0,1576,5.0,44.0
5,2013,1,1,554.0,-6.0,812.0,-25.0,DL,N668DN,461,LGA,ATL,116.0,762,5.0,54.0


### Seleção de colunas & Tratamento dos nulos

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 336776 entries, 1 to 336776
Data columns (total 16 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   year       336776 non-null  int64  
 1   month      336776 non-null  int64  
 2   day        336776 non-null  int64  
 3   dep_time   328521 non-null  float64
 4   dep_delay  328521 non-null  float64
 5   arr_time   328063 non-null  float64
 6   arr_delay  327346 non-null  float64
 7   carrier    336776 non-null  object 
 8   tailnum    334264 non-null  object 
 9   flight     336776 non-null  int64  
 10  origin     336776 non-null  object 
 11  dest       336776 non-null  object 
 12  air_time   327346 non-null  float64
 13  distance   336776 non-null  int64  
 14  hour       328521 non-null  float64
 15  minute     328521 non-null  float64
dtypes: float64(7), int64(5), object(4)
memory usage: 43.7+ MB


Critérios :
1. ["arr_time", "dep_time", "carrier", "flight"] não podem ser nulos
2. As observações nulas destes campos devem ser removidos
3. Se fosse para tratar os nulos usar: ex df["dep_time"].fillna("NA")

In [4]:
usecols=["dep_time","arr_time","carrier","flight", "tailnum","air_time","distance"]

In [5]:
df_raw = df.loc[
    (~df["arr_time"].isna()) \
    & (~df["dep_time"].isna()) \
    & (~df["carrier"].isna()) \
    & (~df["flight"].isna())
].loc[:, usecols]

In [6]:
df_raw["air_time"] = df_raw["air_time"].fillna(0)

In [7]:
df_raw.drop_duplicates(inplace=True)

In [8]:
df_raw = df_raw.astype("object")

Perguntas:
1. Qual a vantagem de avaliar os nulos no começo do processo? Podemos pegar erro logo de início
2. Quais os riscos envolvidos? Se estvermos em um pipeline, perdemos a "foto" do dado bruto para uma camada raw. A boa prática em um pipeline seria realizar apenas a transformação para str / objeto

In [9]:
tmp = df.copy()
for col in ["arr_time", "dep_time", "carrier", "flight"]:
    tmp_df = tmp.loc[~df[col].isna()]
    tmp = tmp_df.copy()

In [10]:
tmp.shape[0] == df_raw.shape[0]

False

### Rename e tipagem

In [11]:
new_columns = ["datetime_partida", "datetime_chegada", "companhia", "id_voo", "id_aeronave","tempo_voo", "distancia"]

In [12]:
columns_map = {usecols[i]: new_columns[i] for i in range(len(usecols))}
columns_map

{'dep_time': 'datetime_partida',
 'arr_time': 'datetime_chegada',
 'carrier': 'companhia',
 'flight': 'id_voo',
 'tailnum': 'id_aeronave',
 'air_time': 'tempo_voo',
 'distance': 'distancia'}

In [13]:
df_work = df_raw.copy()
df_work.rename(columns=columns_map, inplace=True)
df_work.head()

Unnamed: 0,datetime_partida,datetime_chegada,companhia,id_voo,id_aeronave,tempo_voo,distancia
1,517.0,830.0,UA,1545,N14228,227.0,1400
2,533.0,850.0,UA,1714,N24211,227.0,1416
3,542.0,923.0,AA,1141,N619AA,160.0,1089
4,544.0,1004.0,B6,725,N804JB,183.0,1576
5,554.0,812.0,DL,461,N668DN,116.0,762


In [14]:
df_work.dtypes

datetime_partida    object
datetime_chegada    object
companhia           object
id_voo              object
id_aeronave         object
tempo_voo           object
distancia           object
dtype: object

In [15]:
df_work["tempo_voo"] = df_work.loc[:,"tempo_voo"].astype(float)
df_work["distancia"] = df_work.loc[:,"distancia"].astype(float)
df_work["companhia"] = df_work.loc[:,"companhia"].astype(str)
df_work["id_voo"] = df_work.loc[:,"id_voo"].astype(str)
df_work["id_aeronave"] = df_work.loc[:,"id_aeronave"].astype(str)
df_work["datetime_partida"] = df_work.loc[:,"datetime_partida"].astype(str)
df_work["datetime_chegada"] = df_work.loc[:,"datetime_chegada"].astype(str)

In [16]:
df_work.dtypes

datetime_partida     object
datetime_chegada     object
companhia            object
id_voo               object
id_aeronave          object
tempo_voo           float64
distancia           float64
dtype: object

### Tratamento strings

In [17]:
import re

def padroniza_str(obs):
    return re.sub('[^A-Za-z0-9]+', '', obs.lower())

In [18]:
padroniza_str("AHU! @ ++  ~ ç 3n!!OPBB")

'ahu3nopbb'

Guias práticos para regex:
- https://medium.com/@tomstaite1/everything-you-need-to-know-about-regular-expressions-regex-3cbc5b95146
- https://medium.com/xp-inc/regex-um-guia-pratico-para-express%C3%B5es-regulares-1ac5fa4dd39f

In [19]:
df_work["companhia_formatted"] = df_work.loc[:,"companhia"].apply(lambda x: padroniza_str(x))
df_work["id_voo_formatted"] = df_work.loc[:,"id_voo"].apply(lambda x: padroniza_str(x))
df_work["id_aeronave_formatted"] = df_work.loc[:,"id_aeronave"].apply(lambda x: padroniza_str(x))

In [20]:
df_work.head()

Unnamed: 0,datetime_partida,datetime_chegada,companhia,id_voo,id_aeronave,tempo_voo,distancia,companhia_formatted,id_voo_formatted,id_aeronave_formatted
1,517.0,830.0,UA,1545,N14228,227.0,1400.0,ua,1545,n14228
2,533.0,850.0,UA,1714,N24211,227.0,1416.0,ua,1714,n24211
3,542.0,923.0,AA,1141,N619AA,160.0,1089.0,aa,1141,n619aa
4,544.0,1004.0,B6,725,N804JB,183.0,1576.0,b6,725,n804jb
5,554.0,812.0,DL,461,N668DN,116.0,762.0,dl,461,n668dn


### Tratamentos especiais
Neste caso, vamos realizar tratamento das horas

In [21]:
df_work.loc[:,"datetime_partida"] = df_work.loc[:,"datetime_partida"].str.replace('.0', '')
df_work.loc[:,"datetime_chegada"] = df_work.loc[:,"datetime_chegada"].str.replace('.0', '')
df_work["data_voo"] = pd.to_datetime(df[['year', 'month', 'day']]) 

In [22]:
df_work["datetime_partida"].apply(lambda x: len(x)).unique()

array([3, 4, 2, 1], dtype=int64)

In [23]:
def corrige_hora(hr_str,dct_hora = {1:"000?",2:"00?",3:"0?",4:"?"}):
    if hr_str == "2400":
        return "00:00"
    elif (len(hr_str) == 2) & (int(hr_str) <= 12):
        return f"0{hr_str[0]}:{hr_str[1]}0"
    else:
        hora = dct_hora[len(hr_str)].replace("?", hr_str)
        return f"{hora[:2]}:{hora[2:]}"

In [24]:
datetime_partida2 = df_work.loc[:,"datetime_partida"].apply(lambda x: corrige_hora(x))
datetime_chegada2 = df_work.loc[:,"datetime_chegada"].apply(lambda x: corrige_hora(x))

In [25]:
datetime_partida2

1         05:17
2         05:33
3         05:42
4         05:44
5         05:54
          ...  
336766    22:40
336767    22:40
336768    22:41
336769    23:07
336770    23:49
Name: datetime_partida, Length: 328038, dtype: object

In [26]:
pd.to_datetime(df_work.loc[:,'data_voo'].astype(str) + " " + datetime_partida2)

1        2013-01-01 05:17:00
2        2013-01-01 05:33:00
3        2013-01-01 05:42:00
4        2013-01-01 05:44:00
5        2013-01-01 05:54:00
                 ...        
336766   2013-09-30 22:40:00
336767   2013-09-30 22:40:00
336768   2013-09-30 22:41:00
336769   2013-09-30 23:07:00
336770   2013-09-30 23:49:00
Length: 328038, dtype: datetime64[ns]

In [27]:
df_work['datetime_partida_formatted'] = pd.to_datetime(df_work.loc[:,'data_voo'].astype(str) + " " + datetime_partida2)
df_work['datetime_chegada_formatted'] = pd.to_datetime(df_work.loc[:,'data_voo'].astype(str) + " " + datetime_chegada2)

In [28]:
df_work.head()

Unnamed: 0,datetime_partida,datetime_chegada,companhia,id_voo,id_aeronave,tempo_voo,distancia,companhia_formatted,id_voo_formatted,id_aeronave_formatted,data_voo,datetime_partida_formatted,datetime_chegada_formatted
1,517,830,UA,1545,N14228,227.0,1400.0,ua,1545,n14228,2013-01-01,2013-01-01 05:17:00,2013-01-01 08:30:00
2,533,850,UA,1714,N24211,227.0,1416.0,ua,1714,n24211,2013-01-01,2013-01-01 05:33:00,2013-01-01 08:50:00
3,542,923,AA,1141,N619AA,160.0,1089.0,aa,1141,n619aa,2013-01-01,2013-01-01 05:42:00,2013-01-01 09:23:00
4,544,1004,B6,725,N804JB,183.0,1576.0,b6,725,n804jb,2013-01-01,2013-01-01 05:44:00,2013-01-01 10:04:00
5,554,812,DL,461,N668DN,116.0,762.0,dl,461,n668dn,2013-01-01,2013-01-01 05:54:00,2013-01-01 08:12:00


In [29]:
df_work["datetime_chegada_formatted"] = np.where(
    df_work["datetime_partida_formatted"] > df_work["datetime_chegada_formatted"],
    df_work["datetime_chegada_formatted"] + pd.Timedelta(days=1),
    df_work["datetime_chegada_formatted"]
    )

In [30]:
df_work.loc[df_work["datetime_partida_formatted"] > df_work["datetime_chegada_formatted"]]

Unnamed: 0,datetime_partida,datetime_chegada,companhia,id_voo,id_aeronave,tempo_voo,distancia,companhia_formatted,id_voo_formatted,id_aeronave_formatted,data_voo,datetime_partida_formatted,datetime_chegada_formatted


### Transformação de Dados

- tempo_voo_esperado
- tempo_voo_hr
- atraso (em horas)
- flg_potencial_erro
- flg_atraso
- flg_adiantado


In [31]:
df_dw = df_work[["data_voo", "companhia_formatted", "id_voo_formatted", "id_aeronave_formatted","datetime_partida_formatted", "datetime_chegada_formatted", "tempo_voo", "distancia"]].copy()

In [32]:
df_dw["tempo_voo_esperado"] = (df_dw["datetime_chegada_formatted"] - df_dw["datetime_partida_formatted"]) / pd.Timedelta(hours=1)
df_dw["tempo_voo_hr"] = df_dw["tempo_voo"] /60
df_dw["atraso"] = df_dw["tempo_voo_hr"] - df_dw["tempo_voo_esperado"]

In [33]:
df_dw.head()

Unnamed: 0,data_voo,companhia_formatted,id_voo_formatted,id_aeronave_formatted,datetime_partida_formatted,datetime_chegada_formatted,tempo_voo,distancia,tempo_voo_esperado,tempo_voo_hr,atraso
1,2013-01-01,ua,1545,n14228,2013-01-01 05:17:00,2013-01-01 08:30:00,227.0,1400.0,3.216667,3.783333,0.566667
2,2013-01-01,ua,1714,n24211,2013-01-01 05:33:00,2013-01-01 08:50:00,227.0,1416.0,3.283333,3.783333,0.5
3,2013-01-01,aa,1141,n619aa,2013-01-01 05:42:00,2013-01-01 09:23:00,160.0,1089.0,3.683333,2.666667,-1.016667
4,2013-01-01,b6,725,n804jb,2013-01-01 05:44:00,2013-01-01 10:04:00,183.0,1576.0,4.333333,3.05,-1.283333
5,2013-01-01,dl,461,n668dn,2013-01-01 05:54:00,2013-01-01 08:12:00,116.0,762.0,2.3,1.933333,-0.366667


In [34]:
df_dw[df_dw["atraso"] > 5]

Unnamed: 0,data_voo,companhia_formatted,id_voo_formatted,id_aeronave_formatted,datetime_partida_formatted,datetime_chegada_formatted,tempo_voo,distancia,tempo_voo_esperado,tempo_voo_hr,atraso
27472,2013-10-01,ua,15,n76064,2013-10-01 13:41:00,2013-10-01 18:20:00,618.0,4963.0,4.650000,10.300000,5.650000
28260,2013-10-02,ha,51,n381ha,2013-10-02 09:51:00,2013-10-02 14:38:00,623.0,4983.0,4.783333,10.383333,5.600000
28433,2013-10-02,ua,15,n69063,2013-10-02 13:26:00,2013-10-02 17:48:00,598.0,4963.0,4.366667,9.966667,5.600000
29418,2013-10-03,ua,15,n76055,2013-10-03 13:40:00,2013-10-03 18:03:00,601.0,4963.0,4.383333,10.016667,5.633333
30230,2013-10-04,ha,51,n380ha,2013-10-04 09:54:00,2013-10-04 14:38:00,618.0,4983.0,4.733333,10.300000,5.566667
...,...,...,...,...,...,...,...,...,...,...,...
334537,2013-09-28,ua,15,n66057,2013-09-28 13:27:00,2013-09-28 17:10:00,562.0,4963.0,3.716667,9.366667,5.650000
335096,2013-09-29,ha,51,n384ha,2013-09-29 09:57:00,2013-09-29 14:05:00,580.0,4983.0,4.133333,9.666667,5.533333
335302,2013-09-29,ua,15,n66057,2013-09-29 14:20:00,2013-09-29 18:17:00,569.0,4963.0,3.950000,9.483333,5.533333
336082,2013-09-30,ha,51,n392ha,2013-09-30 09:59:00,2013-09-30 14:38:00,603.0,4983.0,4.650000,10.050000,5.400000


In [35]:
df_dw.describe()

Unnamed: 0,data_voo,datetime_partida_formatted,datetime_chegada_formatted,tempo_voo,distancia,tempo_voo_esperado,tempo_voo_hr,atraso
count,328038,328038,328038,328038.0,328038.0,328038.0,328038.0,328038.0
mean,2013-07-03 03:59:56.839390976,2013-07-03 17:41:51.662429440,2013-07-03 19:59:38.359702272,150.35955,1048.845923,2.296305,2.505992,0.209688
min,2013-01-01 00:00:00,2013-01-01 05:17:00,2013-01-01 07:02:00,0.0,80.0,0.0,0.0,-23.333333
25%,2013-04-05 00:00:00,2013-04-05 05:59:00,2013-04-05 08:02:15,82.0,509.0,1.583333,1.366667,-0.5
50%,2013-07-04 00:00:00,2013-07-04 09:14:30,2013-07-04 11:27:00,129.0,888.0,2.3,2.15,-0.316667
75%,2013-10-01 00:00:00,2013-10-01 17:21:30,2013-10-01 19:23:45,191.0,1391.0,2.9,3.183333,0.633333
max,2013-12-31 00:00:00,2013-12-31 23:56:00,2014-01-01 04:36:00,695.0,4983.0,23.983333,11.583333,5.75
std,,,,93.848426,735.948349,0.851282,1.56414,1.157143


In [36]:
df_dw[df_dw["atraso"]<-20]

Unnamed: 0,data_voo,companhia_formatted,id_voo_formatted,id_aeronave_formatted,datetime_partida_formatted,datetime_chegada_formatted,tempo_voo,distancia,tempo_voo_esperado,tempo_voo_hr,atraso
36466,2013-10-11,b6,1816,n283jb,2013-10-11 01:10:00,2013-10-12 01:09:00,39.0,209.0,23.983333,0.65,-23.333333
84149,2013-12-02,b6,1816,n316jb,2013-12-02 01:20:00,2013-12-03 01:03:00,35.0,209.0,23.716667,0.583333,-23.133333
182236,2013-04-19,b6,1178,n266jb,2013-04-19 01:10:00,2013-04-20 01:08:00,43.0,200.0,23.966667,0.716667,-23.25
199932,2013-05-08,b6,30,n183jb,2013-05-08 01:20:00,2013-05-09 01:11:00,46.0,264.0,23.85,0.766667,-23.083333
248721,2013-06-29,b6,718,n198jb,2013-06-29 01:10:00,2013-06-30 00:59:00,34.0,187.0,23.816667,0.566667,-23.25
251417,2013-07-02,b6,108,n318jb,2013-07-02 01:20:00,2013-07-03 01:17:00,45.0,273.0,23.95,0.75,-23.2


In [37]:
df_dw["flg_potencial_erro"] = np.where(df_dw["atraso"]<= -10, True, False)
df_dw["flg_atraso"] = np.where(df_dw["atraso"] > 0.6, True, False)
df_dw["flg_adiantado"] = np.where(
    (df_dw["atraso"]< -0.5) & (df_dw["atraso"] > -10), True, False)

In [38]:
df_dw.head()

Unnamed: 0,data_voo,companhia_formatted,id_voo_formatted,id_aeronave_formatted,datetime_partida_formatted,datetime_chegada_formatted,tempo_voo,distancia,tempo_voo_esperado,tempo_voo_hr,atraso,flg_potencial_erro,flg_atraso,flg_adiantado
1,2013-01-01,ua,1545,n14228,2013-01-01 05:17:00,2013-01-01 08:30:00,227.0,1400.0,3.216667,3.783333,0.566667,False,False,False
2,2013-01-01,ua,1714,n24211,2013-01-01 05:33:00,2013-01-01 08:50:00,227.0,1416.0,3.283333,3.783333,0.5,False,False,False
3,2013-01-01,aa,1141,n619aa,2013-01-01 05:42:00,2013-01-01 09:23:00,160.0,1089.0,3.683333,2.666667,-1.016667,False,False,True
4,2013-01-01,b6,725,n804jb,2013-01-01 05:44:00,2013-01-01 10:04:00,183.0,1576.0,4.333333,3.05,-1.283333,False,False,True
5,2013-01-01,dl,461,n668dn,2013-01-01 05:54:00,2013-01-01 08:12:00,116.0,762.0,2.3,1.933333,-0.366667,False,False,False


### Validações

Para validar os nulos é preciso entender a proporção que uma variável possui, e se essa proporção está dentro dos critérios estabelecidos no projeto

In [39]:
len(df.loc[df["tailnum"].isnull()])/len(df)

0.007458963821649999

Para validar as chaves basta saber se o campo(s) chave(s) possuem a mesma quantidade de observações únicos em relação a quantidade de observações da base

In [40]:
len(df_work[['companhia_formatted','datetime_partida_formatted', "id_voo",'datetime_chegada_formatted']].drop_duplicates()) == len(df_work)

True

Para criarmos alertas e/ou registramos em logs o resultado das validações

In [41]:
import datetime
import logging

logging.basicConfig(filename='flights_pipe.log', level=logging.INFO) #vai criar um arquivo para os logs
logger = logging.getLogger()

In [42]:
logger.info(f'Inicio da execução ; {datetime.datetime.now()}')

In [43]:
if len(df.loc[df["tailnum"].isnull()])/len(df) > 0.00001:
    logger.warning(f"Coluna tailnum possui mais nulos do que o esperado; {datetime.datetime.now()} ")

In [45]:
#Gera uma falha no processo
if len(df.loc[df["tailnum"].isnull()])/len(df) > 0.00001:
    logger.error(f"Coluna tailnum possui mais nulos do que o esperado; {datetime.datetime.now()} ")
    raise Exception("Coluna tailnum possui mais nulos do que o esperado")

Exception: Coluna tailnum possui mais nulos do que o esperado