<a href="https://colab.research.google.com/github/amandaveloso/Analise-Flights-Dataset/blob/main/Modelo_Regressivo_Flights_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Estudo dos dados de voos utilizando pandas

Análise exploratória

In [None]:
# Standard library
# Used to convert dates to Unix timestamps (seconds since 1 January 1970)
from datetime import datetime

# Third-party
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [None]:
# Read the five per-year flight CSV files and the airline lookup CSV into DataFrames.
voos_2018, voos_2019, voos_2020, voos_2021, voos_2022 = (
    pd.read_csv(caminho)
    for caminho in (
        'Sample_combined_flights_2018.csv',
        'Sample_combined_flights_2019.csv',
        'Sample_combined_flights_2020.csv',
        'Sample_combined_flights_2021.csv',
        'Sample_combined_flights_2022.csv',
    )
)
airlines = pd.read_csv(filepath_or_buffer='Airlines.csv')

In [None]:
# Stack the per-year frames into a single DataFrame; ignore_index=True discards
# the per-file indexes and numbers the rows 0..n-1.
df = pd.concat(
    objs=(voos_2018, voos_2019, voos_2020, voos_2021, voos_2022),
    ignore_index=True,
)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,...,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings
0,0,2018-01-05,Endeavor Air Inc.,ATL,ABY,False,False,1037,1032.0,0.0,...,1052.0,1121.0,3.0,1137,-13.0,0.0,-1.0,1100-1159,1,0.0
1,1,2018-01-14,Endeavor Air Inc.,ATL,ABY,False,False,1037,1031.0,0.0,...,1047.0,1117.0,3.0,1137,-17.0,0.0,-2.0,1100-1159,1,0.0
2,2,2018-01-04,Endeavor Air Inc.,EWN,ATL,True,False,1415,,,...,,,,1605,,,,1600-1659,2,0.0
3,3,2018-01-09,Endeavor Air Inc.,FAY,ATL,False,False,1853,1850.0,0.0,...,1900.0,2004.0,17.0,2030,-9.0,0.0,-1.0,2000-2059,2,0.0
4,4,2018-01-15,Endeavor Air Inc.,CSG,ATL,False,False,615,625.0,10.0,...,634.0,656.0,8.0,711,-7.0,0.0,-1.0,0700-0759,1,0.0


In [None]:
# Join the airline table ("Airlines" file) onto the flights by marketing-airline
# IATA code; the left join keeps every flight row even without a matching code.
merged_flights = df.merge(
    airlines,
    how='left',
    left_on='IATA_Code_Marketing_Airline',
    right_on='Code',
)

In [None]:
# Keep only flights that were neither cancelled nor diverted (0 / False = no),
# then drop every row that still has a missing value in any column.
filtered_flights = (
    merged_flights[(merged_flights['Cancelled'] == 0) & (merged_flights['Diverted'] == 0)]
    .dropna()
    # Explicit copy: the following cells assign columns on this frame, and
    # assigning on a filtered slice triggers pandas' SettingWithCopyWarning.
    .copy()
)

In [None]:
# Convert FlightDate ('YYYY-MM-DD' strings) to Unix timestamps: whole seconds
# since 1970-01-01, with the date taken as midnight UTC.
# The conversion is vectorised and timezone-independent. The previous row-wise
# datetime.strptime(...).timestamp() interpreted each naive date in the
# machine's local timezone, so values could differ between environments.
# Passing an explicit format makes a re-run on already-converted numbers fail
# loudly instead of silently re-parsing them.
filtered_flights['FlightDate'] = (
    pd.to_datetime(filtered_flights['FlightDate'], format='%Y-%m-%d') - pd.Timestamp('1970-01-01')
) // pd.Timedelta(seconds=1)


In [None]:
# Cast the date, air-time and delay-flag columns to 64-bit integers.
for coluna in ('FlightDate', 'AirTime', 'ArrDel15'):
    filtered_flights[coluna] = filtered_flights[coluna].astype('int64')

In [None]:
# Build one indicator (dummy) column per distinct Month value, named "Month_<value>".
dummy_months = pd.get_dummies(data=filtered_flights['Month'], prefix='Month')

# Append the indicator columns to the right of the existing columns.
filtered_flights = pd.concat(objs=[filtered_flights, dummy_months], axis='columns')

In [None]:
# Standardise the numeric modelling features with StandardScaler.
# X is built here because no earlier cell defines it: on a fresh kernel the
# original `scaler.fit_transform(X)` raised NameError.
# NOTE(review): the column choice is presumably the seven numeric features used
# in the logistic-regression section (it matches the seven coefficients shown
# for the linear model) — confirm.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling.
X = filtered_flights[[
    'OriginWac', 'DestWac', 'FlightDate', 'Month', 'AirTime',
    'DOT_ID_Operating_Airline', 'DOT_ID_Marketing_Airline',
]]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### `TESTE RL`

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Linear-regression experiment: preprocessing and model wrapped in one Pipeline.
# The target is the ArrDel15 column; the inputs are schedule, route and carrier columns.
features = ['CRSElapsedTime', 'Distance', 'DistanceGroup', 'CRSDepTime', 'CRSArrTime',
            'DayOfWeek', 'Month', 'Operating_Airline', 'Flight_Number_Operating_Airline', 'OriginAirportID',
            'DestAirportID']

X = filtered_flights[features]
y = filtered_flights['ArrDel15']

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Numeric columns are standardised; the remaining columns are treated as
# categories and one-hot encoded.
numeric_features = ['CRSElapsedTime', 'Distance', 'CRSDepTime', 'CRSArrTime']
categorical_features = ['DistanceGroup', 'DayOfWeek', 'Month', 'Operating_Airline',
                        'Flight_Number_Operating_Airline', 'OriginAirportID', 'DestAirportID']

numeric_transformer = StandardScaler()
# handle_unknown='ignore': categories seen only in the test split encode as all
# zeros instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Because the preprocessing lives inside the pipeline, it is fitted on the
# training rows only.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train the model
model.fit(X_train, y_train)

# Predict on both splits
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

# Report the same three metrics for both splits so they can be compared
# directly (previously the test split had only MSE and the train split only MAE).
r2_test = r2_score(y_test, y_pred)
mse_test = mean_squared_error(y_test, y_pred)
mae_test = mean_absolute_error(y_test, y_pred)

r2_train = r2_score(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
mae_train = mean_absolute_error(y_train, y_pred_train)

print(f"R² do modelo (teste): {r2_test}")
print(f"MSE do modelo (teste): {mse_test}")
print(f"MAE (Mean Absolute Error) do modelo (teste): {mae_test}")

print(f'R² do modelo (treino): {r2_train}')
print(f'MSE do modelo (treino): {mse_train}')
print(f'MAE (Mean Absolute Error) do modelo (treino): {mae_train}')

R² do modelo (teste): 0.0006637371159567973
MSE do modelo (teste): 0.14586185362948412
R² do modelo (treino): 0.0763884361102688
MAE (Mean Absolute Error) do modelo (treino): 0.27217390426510313


## Modelo de regressão linear

In [None]:
# Display the column labels of the filtered flights frame (shown as the cell output).
filtered_flights.columns

Index(['Unnamed: 0', 'FlightDate', 'Airline', 'Origin', 'Dest', 'Cancelled',
       'Diverted', 'CRSDepTime', 'DepTime', 'DepDelayMinutes', 'DepDelay',
       'ArrTime', 'ArrDelayMinutes', 'AirTime', 'CRSElapsedTime',
       'ActualElapsedTime', 'Distance', 'Year', 'Quarter', 'Month',
       'DayofMonth', 'DayOfWeek', 'Marketing_Airline_Network',
       'Operated_or_Branded_Code_Share_Partners', 'DOT_ID_Marketing_Airline',
       'IATA_Code_Marketing_Airline', 'Flight_Number_Marketing_Airline',
       'Operating_Airline', 'DOT_ID_Operating_Airline',
       'IATA_Code_Operating_Airline', 'Tail_Number',
       'Flight_Number_Operating_Airline', 'OriginAirportID',
       'OriginAirportSeqID', 'OriginCityMarketID', 'OriginCityName',
       'OriginState', 'OriginStateFips', 'OriginStateName', 'OriginWac',
       'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'DestCityName',
       'DestState', 'DestStateFips', 'DestStateName', 'DestWac', 'DepDel15',
       'DepartureDelayGroups', 

In [None]:
# Candidate inputs for the linear model and the ArrDel15 target column.
features = [
    'CRSDepTime',
    'CRSArrTime',
    'Distance',
    'Operating_Airline',
    'Origin',
    'Dest',
]
X = filtered_flights.loc[:, features]
y = filtered_flights.loc[:, 'ArrDel15']

In [None]:
# Split the scaled feature matrix and the target 70/30 into train and test
# sets; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    random_state=42,
    test_size=0.3,
)


In [None]:
# Column groups for preprocessing: numeric columns are standardised and
# categorical columns are one-hot encoded (unknown categories are ignored).
# NOTE(review): the fitting cells visible below train on X_train directly and
# do not appear to use this preprocessor — confirm whether it should be wired in.
numeric_features = ['CRSDepTime', 'CRSArrTime', 'Distance']
categorical_features = ['Operating_Airline', 'Origin', 'Dest']

numeric_transformer, categorical_transformer = (
    StandardScaler(),
    OneHotEncoder(handle_unknown='ignore'),
)

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
# Fit an ordinary least-squares model on the training split (fit returns the
# estimator itself), then predict the held-out test rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X=X_test)

In [None]:
# Evaluate the linear model fitted in the previous cell on the held-out test set.
# This cell used to re-create and re-fit an identical LinearRegression, which
# duplicated the previous cell without changing `model` or `y_pred`; it now
# only scores the existing predictions.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"R² do modelo: {r2}")
print(f"MSE do modelo: {mse}")

R² do modelo: 0.0030587649008418927
MSE do modelo: 0.1455122784112395


In [None]:
# Display the fitted coefficients of the estimator currently bound to `model`.
model.coef_

array([-2.45246203e-05, -1.90002504e-04, -4.83907110e-11, -3.78089101e-05,
        1.87549179e-04, -3.94660620e-06,  4.31191415e-05])

In [None]:
# R² on the held-out test split. The model was fitted on rows taken from
# X_scaled, so it must be scored on data in that same representation; the raw
# `X` defined above still contains unscaled and string columns.
model.score(X_test, y_test)

0.0026837108264599063

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
# Score the fitted model on its own training split: mean absolute error and R².
y_pred_train = model.predict(X=X_train)

mae, r2 = (
    mean_absolute_error(y_train, y_pred_train),
    r2_score(y_train, y_pred_train),
)

print(f'MAE (Mean Absolute Error): {mae}')
print(f'R²: {r2}')

MAE (Mean Absolute Error): 0.29117601445604563
R²: 0.0025229695230302163


## Modelo de regressão logística

In [None]:
# Inputs and target for the logistic-regression model.
# X must be the feature *columns*: the original `X = features` assigned the
# list of column names itself. Both X and y come from filtered_flights, the
# frame in which FlightDate was converted to a number and rows with missing
# values were dropped, so their rows stay aligned.
# NOTE(review): AirTime is presumably only known after the flight has flown —
# confirm it should be a predictor of arrival delay.
features = ['OriginWac', 'DestWac', 'FlightDate', 'Month', 'AirTime', 'DOT_ID_Operating_Airline', 'DOT_ID_Marketing_Airline']
X = filtered_flights[features]
y = filtered_flights['ArrDel15']

In [None]:
# Hold out 30% of the rows for testing. The fixed seed (the same value used by
# the other splits in this notebook) makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
# Logistic-regression classifier, fitted on the training split only.
# Fitting on the full X, y (as before) let the model see the test rows it is
# evaluated on afterwards. The features are standardised inside the pipeline
# because their magnitudes differ widely (e.g. Unix timestamps next to month
# numbers), which hampers the solver's convergence; max_iter is raised for the
# same reason.
model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=1000)),
])
model.fit(X_train, y_train)

In [None]:
# Predict a label for every row of the test split with the fitted model.
previsao = model.predict(X_test)


In [None]:
# Compute the ROC AUC on the test split.
# roc_auc_score needs a score per row, so the predicted probability of the
# second class (column 1 of predict_proba) is used instead of hard labels.
# The function is called here; the original assigned a tuple to the name
# `roc_auc_score` and indexed `y_pred`, which belongs to the linear model.
y_proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_proba)

print('AUC:', auc)

NameError: ignored

# Testando o modelo

In [None]:
# List the distinct destination codes (DestWac column) in the combined flights frame.
df['DestWac'].unique()

array([34, 43, 13, 41, 22, 63, 64, 54, 52, 38, 36, 74,  3, 21, 91, 33, 81,
       92, 44, 61, 51, 15, 65, 23, 71, 37, 45, 85, 83,  2, 72, 93, 84, 66,
       82, 87, 42,  5, 35,  4, 88, 73, 67, 14, 62, 11, 86, 53,  1, 12, 16,
       39, 31])

In [None]:
# Helper that takes flight inputs and makes a delay prediction
def prever_atraso(dest_wac, flight_date, modelo=None, valores_base=None):
    """Predict whether a flight will be delayed and print the verdict.

    Parameters
    ----------
    dest_wac : int
        Destination code, passed to the model as the ``DestWac`` column.
    flight_date : str or datetime-like
        Flight date (e.g. ``'2023-07-20'``). It is converted to whole Unix
        seconds (UTC) and passed as the ``FlightDate`` column.
    modelo : estimator with ``predict``, optional
        Fitted model to use. Defaults to the notebook-level ``model``.
    valores_base : mapping, optional
        Values for any additional feature columns the model expects
        (column name -> value). ``dest_wac`` and ``flight_date`` take
        precedence over entries with the same names.

    Returns
    -------
    bool
        ``True`` when the model predicts a delay, ``False`` otherwise.

    Raises
    ------
    ValueError
        If the model exposes ``feature_names_in_`` and some of those columns
        are provided neither by the arguments nor by ``valores_base``.
    """
    if modelo is None:
        modelo = model  # fitted estimator defined by the notebook's training cell

    # A raw date string cannot be fed to a numeric model, so convert it first.
    carimbo = int(pd.Timestamp(flight_date).timestamp())

    valores = dict(valores_base) if valores_base is not None else {}
    valores.update({'DestWac': dest_wac, 'FlightDate': carimbo})
    entrada = pd.DataFrame([valores])

    # When the model recorded its training columns, require all of them and
    # present them in the same order the model was fitted with.
    colunas = getattr(modelo, 'feature_names_in_', None)
    if colunas is not None:
        faltantes = [coluna for coluna in colunas if coluna not in entrada.columns]
        if faltantes:
            raise ValueError(
                f'Missing model features: {faltantes}; provide them via valores_base.'
            )
        entrada = entrada[list(colunas)]

    # predict returns one value per row; read the single row's value explicitly
    # instead of truth-testing the whole array.
    previsao = modelo.predict(entrada)
    havera_atraso = bool(previsao[0] > 0)

    if havera_atraso:
        print('Haverá atraso')
    else:
        print('Não haverá atraso')
    return havera_atraso




In [None]:
# Try the helper with destination code 1 and the date 2023-07-20.
# NOTE(review): prever_atraso supplies only DestWac and FlightDate, while the
# logistic-regression cell lists seven features — confirm the model in scope
# accepts this input before relying on the result.
prever_atraso(dest_wac=1, flight_date='2023-07-20')