In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

def prepare_data(df_new):
    """Convert the raw flight DataFrame into tensors for the multi-task model.

    The function works on a copy of ``df_new``. Missing continuous inputs are
    filled with the column mean, missing categorical inputs with the string
    ``'MISSING'`` and missing targets with 0. Continuous inputs are then
    standardized with a ``StandardScaler`` fitted on this frame (the fitted
    scaler is not returned), and categorical inputs are label-encoded after
    being cast to ``str``.

    Categorical indices are shifted by two: index 0 is reserved for padding
    (the model creates its embeddings with ``padding_idx=0``, whose row is
    all zeros and never trained) and index 1 for unknown values. Real
    categories therefore occupy ``2 .. vocab_size - 1``. To map new values
    with the returned encoders, add 2 to ``encoder.transform(...)``.

    Args:
        df_new: DataFrame containing every column listed in
            ``input_features`` and ``output_features`` below.

    Returns:
        A 7-tuple ``(continuous_tensor, categorical_tensor, output_tensor,
        continuous_cols, categorical_cols, vocab_sizes, label_encoders)``:
        float32 standardized inputs, int64 embedding indices, float32
        targets, the two column-name lists, the embedding table size per
        categorical column, and the fitted ``LabelEncoder`` per column.
    """
    # Offset added to LabelEncoder codes so indices 0 (padding) and
    # 1 (unknown) are never assigned to a real category.
    num_special_tokens = 2

    # Define input and output features
    input_features = ['Year', 'DOT_ID_Reporting_Airline', 'Tail_Number', 'Flight_Number_Reporting_Airline',
                    'OriginAirportID', 'DestAirportID', 'CRSDepTime', 'DepartureDelayGroups', 'TaxiOut',
                    'WheelsOff', 'WheelsOn', 'TaxiIn', 'CRSArrTime', 'CRSElapsedTime', 'Flights', 'Distance',
                    'FirstDepTime', 'TotalAddGTime', 'LongestAddGTime', 'DivAirportLandings', 'DivActualElapsedTime',
                    'DivArrDelay', 'DivDistance', 'Div1TotalGTime', 'Div1LongestGTime', 'Latitude', 'Longitude',
                    'LatitudeDest', 'LongitudeDest', 'Airline_Ranking', 'tavg', 'tmin', 'tmax', 'prcp', 'snow',
                    'wdir', 'wspd', 'pres', 'Month', 'Day']
    output_features = ['DepTime', 'DepDelayMinutes', 'ArrTime', 'ArrDelayMinutes', 'Diverted', 'ActualElapsedTime',
                    'AirTime', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
                    'CancellationCode_encoded']

    # Separate categorical and continuous columns
    categorical_cols = ['Tail_Number','OriginAirportID', 'DestAirportID']
    continuous_cols = [col for col in input_features if col not in categorical_cols]

    # Handle missing values on a copy so the caller's frame is untouched
    df_new = df_new.copy()
    for col in continuous_cols:
        df_new[col] = df_new[col].fillna(df_new[col].mean())
    for col in categorical_cols:
        df_new[col] = df_new[col].fillna('MISSING')
    for col in output_features:
        df_new[col] = df_new[col].fillna(0)

    # Process categorical features
    label_encoders = {}
    categorical_data = []
    vocab_sizes = []

    for col in categorical_cols:
        # Encode on the string form so mixed-type columns sort consistently
        values = df_new[col].astype(str)
        le = LabelEncoder()
        codes = le.fit_transform(values)
        label_encoders[col] = le

        # Table size = real categories + the two reserved slots
        vocab_sizes.append(len(le.classes_) + num_special_tokens)

        # Shift past the reserved slots so no category lands on padding_idx 0
        categorical_data.append(codes + num_special_tokens)

    # Convert categorical data to tensor format: one column per feature
    categorical_tensor = torch.tensor(np.stack(categorical_data, axis=1), dtype=torch.long)

    # Normalize continuous features
    scaler = StandardScaler()
    continuous_tensor = torch.tensor(
        scaler.fit_transform(df_new[continuous_cols].values),
        dtype=torch.float32
    )

    # Convert output features to tensor (targets are left unscaled)
    output_tensor = torch.tensor(df_new[output_features].values, dtype=torch.float32)

    return continuous_tensor, categorical_tensor, output_tensor, continuous_cols, categorical_cols, vocab_sizes, label_encoders

class MultiTaskFlightDelayModel(nn.Module):
    """Shared-trunk regressor with one scalar head per output.

    Each categorical column gets its own embedding table (row 0 is the
    padding row). The embeddings are concatenated with the continuous
    features, passed through two Linear -> BatchNorm1d -> ReLU -> Dropout(0.2)
    stages, and then fed to ``num_outputs`` independent ``Linear(hidden_dim, 1)``
    heads whose results are concatenated column-wise.

    Args:
        continuous_dim: number of continuous input features.
        vocab_sizes: embedding table size for each categorical column.
        hidden_dim: width of both shared hidden layers.
        num_outputs: number of regression heads.
        embedding_dim: size of every categorical embedding vector.
    """

    def __init__(self, continuous_dim, vocab_sizes, hidden_dim, num_outputs, embedding_dim=8):
        super().__init__()

        # One lookup table per categorical feature
        tables = [nn.Embedding(size, embedding_dim, padding_idx=0) for size in vocab_sizes]
        self.embeddings = nn.ModuleList(tables)

        # Width of the vector entering the shared trunk
        trunk_in = continuous_dim + embedding_dim * len(vocab_sizes)

        # Shared trunk: two identical stages, the first adapting the input width
        stages = []
        stage_in = trunk_in
        for _ in range(2):
            stages.extend([
                nn.Linear(stage_in, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(0.2),
            ])
            stage_in = hidden_dim
        self.network = nn.Sequential(*stages)

        # One single-unit head per task
        heads = [nn.Linear(hidden_dim, 1) for _ in range(num_outputs)]
        self.output_layers = nn.ModuleList(heads)

    def forward(self, continuous_x, categorical_x):
        """Return a ``(batch, num_outputs)`` tensor of per-task predictions.

        ``categorical_x[:, i]`` is looked up in the i-th embedding table.
        """
        # Look up every categorical column in its own table
        looked_up = [
            table(categorical_x[:, column])
            for column, table in enumerate(self.embeddings)
        ]
        embedded_categorical = torch.cat(looked_up, dim=1)

        # Continuous features first, embeddings after
        trunk_input = torch.cat([continuous_x, embedded_categorical], dim=1)
        shared_features = self.network(trunk_input)

        # Each head emits one column; stitch them together
        per_task = [head(shared_features) for head in self.output_layers]
        return torch.cat(per_task, dim=1)

class FlightDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(continuous, categorical, target)`` triples.

    The three stores are indexed with the same ``idx``; the dataset length is
    taken from ``targets``.
    """

    def __init__(self, continuous_data, categorical_data, targets):
        self.continuous_data, self.categorical_data, self.targets = (
            continuous_data,
            categorical_data,
            targets,
        )

    def __len__(self):
        # The target store defines how many samples exist
        return len(self.targets)

    def __getitem__(self, idx):
        continuous_row = self.continuous_data[idx]
        categorical_row = self.categorical_data[idx]
        target_row = self.targets[idx]
        return continuous_row, categorical_row, target_row

def train_model(model, train_loader, criterion, optimizer, device, num_epochs=20):
    """Train ``model`` in place and return the average loss of every epoch.

    Each batch from ``train_loader`` must unpack to
    ``(continuous_x, categorical_x, targets)``; the model is called as
    ``model(continuous_x, categorical_x)``. Progress is printed every 100
    batches and once per epoch.

    Args:
        model: module to optimize; it is switched to training mode.
        train_loader: sized iterable of batches (``len()`` must work).
        criterion: loss callable taking ``(outputs, targets)``.
        optimizer: optimizer already bound to ``model``'s parameters.
        device: device the batch tensors are moved to.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        List with one float per epoch: the mean batch loss of that epoch.

    Raises:
        ValueError: if ``train_loader`` contains no batches.
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        # Fail fast with a clear message instead of dividing by zero later
        raise ValueError("train_loader yields no batches; cannot train on an empty dataset")

    epoch_losses = []
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for batch_idx, (continuous_x, categorical_x, targets) in enumerate(train_loader):
            continuous_x = continuous_x.to(device)
            categorical_x = categorical_x.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            outputs = model(continuous_x, categorical_x)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # .item() forces a device sync, so read it only once per batch
            batch_loss = loss.item()
            total_loss += batch_loss

            if (batch_idx + 1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{num_batches}], Loss: {batch_loss:.4f}')

        avg_loss = total_loss / num_batches
        epoch_losses.append(avg_loss)
        print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}')

    return epoch_losses

def evaluate_model(model, test_loader, device):
    """Return the RMSE of ``model`` over every batch in ``test_loader``.

    The model is put in eval mode (and left there) and run without gradient
    tracking. Predictions and targets from all batches are stacked and passed
    to ``mean_squared_error``; the square root of that value is returned, so
    all output columns contribute to a single score.
    """
    model.eval()
    predicted_batches, actual_batches = [], []

    with torch.no_grad():
        for batch in test_loader:
            continuous_x, categorical_x, targets = batch
            # Only the inputs need to live on the model's device
            batch_output = model(continuous_x.to(device), categorical_x.to(device))
            predicted_batches.append(batch_output.cpu().numpy())
            actual_batches.append(targets.cpu().numpy())

    y_pred = np.concatenate(predicted_batches)
    y_true = np.concatenate(actual_batches)
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def main(df_new, batch_size=64, hidden_dim=128, learning_rate=0.001, num_epochs=30):
    """Prepare the data, train a MultiTaskFlightDelayModel and report its RMSE.

    Note that the RMSE printed at the end is computed on the same shuffled
    loader that was used for training, not on held-out data.

    Args:
        df_new: raw flight DataFrame accepted by ``prepare_data``.
        batch_size: mini-batch size for the DataLoader.
        hidden_dim: width of the model's shared hidden layers.
        learning_rate: Adam learning rate.
        num_epochs: number of training epochs.

    Returns:
        ``(model, label_encoders)``: the trained model and the fitted
        per-column label encoders from ``prepare_data``.
    """
    # Tensors plus the metadata needed to size the network
    (continuous_data, categorical_data, targets,
     continuous_cols, _categorical_cols, vocab_sizes, label_encoders) = prepare_data(df_new)

    # Wrap the tensors in a shuffled mini-batch loader
    data_loader = DataLoader(
        FlightDataset(continuous_data, categorical_data, targets),
        batch_size=batch_size,
        shuffle=True,
    )

    # Prefer the GPU when one is visible
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # One regression head per target column
    model = MultiTaskFlightDelayModel(
        continuous_dim=len(continuous_cols),
        vocab_sizes=vocab_sizes,
        hidden_dim=hidden_dim,
        num_outputs=targets.shape[1],
    )
    model = model.to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Fit, then score on the loader that was just trained on
    train_model(model, data_loader, criterion, optimizer, device, num_epochs)
    rmse = evaluate_model(model, data_loader, device)
    print(f'Final RMSE: {rmse:.4f}')

    return model, label_encoders

# Usage example: train on the whole df_new frame with main()'s default settings.
# NOTE(review): df_new is only created by the data-loading cell (In [4]) further
# down the notebook, so this cell fails on a fresh kernel unless that cell runs first.

model, label_encoders = main(df_new)

In [4]:
import pandas as pd
# Load the dataset from the Kaggle input directory.
df_new = pd.read_csv('/kaggle/input/dataaa/datasetfinal.csv', low_memory=False)

# Parse the date column once and derive every calendar feature from that
# single result instead of re-parsing the column for each feature.
flight_dates = pd.to_datetime(df_new['date_converted'])
df_new['Month'] = flight_dates.dt.month
df_new['Day'] = flight_dates.dt.day
df_new['Quarter'] = flight_dates.dt.quarter
# isocalendar() returns a frame with year/week/day; only the ISO week is kept.
df_new['Week'] = flight_dates.dt.isocalendar().week

In [5]:
# Remove the raw date column (Month/Day/Quarter/Week were already derived from it)
# and the 'Unnamed: 0_x' column (presumably a leftover index column — confirm).
# NOTE: df_new is rebound, so re-running this cell raises KeyError because the
# columns are already gone.
df_new=df_new.drop(columns=['date_converted','Unnamed: 0_x',])

In [10]:
# Drop FlightDate, the two *Wac columns and the additional-ground-time / diversion columns.
# NOTE(review): the first prepare_data definition in this notebook still lists the
# ground-time and diversion columns as input features, so it cannot run on df_new
# after this cell. Like the previous drop, this cell is not re-runnable
# (KeyError once the columns are gone).
df_new=df_new.drop(columns=['FlightDate','OriginWac','DestWac','TotalAddGTime', 'LongestAddGTime', 'DivAirportLandings', 'DivActualElapsedTime', 
                    'DivArrDelay', 'DivDistance', 'Div1TotalGTime', 'Div1LongestGTime'])

In [11]:
# Display the column names that remain in df_new after the drops.
df_new.columns

Index(['Year', 'DOT_ID_Reporting_Airline', 'Tail_Number',
       'Flight_Number_Reporting_Airline', 'OriginAirportID', 'OriginCityName',
       'DestAirportID', 'DestCityName', 'CRSDepTime', 'DepTime',
       'DepDelayMinutes', 'DepartureDelayGroups', 'TaxiOut', 'WheelsOff',
       'WheelsOn', 'TaxiIn', 'CRSArrTime', 'ArrTime', 'ArrDelayMinutes',
       'Diverted', 'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Flights',
       'Distance', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
       'LateAircraftDelay', 'FirstDepTime', 'Latitude', 'Longitude',
       'LatitudeDest', 'LongitudeDest', 'Airline_Ranking',
       'CancellationCode_encoded', 'index', 'tavg', 'tmin', 'tmax', 'prcp',
       'snow', 'wdir', 'wspd', 'pres', 'Month', 'Day', 'Quarter', 'Week'],
      dtype='object')

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

def prepare_data(df_new):
    """Convert the raw flight DataFrame into tensors for the multi-task model.

    The function works on a copy of ``df_new``. Missing continuous inputs are
    filled with the column mean, missing categorical inputs with the string
    ``'MISSING'`` and missing targets with 0. Continuous inputs are then
    standardized with a ``StandardScaler`` fitted on this frame (the fitted
    scaler is not returned), and categorical inputs are label-encoded after
    being cast to ``str``.

    Categorical indices are shifted by two: index 0 is reserved for padding
    (the model creates its embeddings with ``padding_idx=0``, whose row is
    all zeros and never trained) and index 1 for unknown values. Real
    categories therefore occupy ``2 .. vocab_size - 1``. To map new values
    with the returned encoders, add 2 to ``encoder.transform(...)``.

    Args:
        df_new: DataFrame containing every column listed in
            ``input_features`` and ``output_features`` below.

    Returns:
        A 7-tuple ``(continuous_tensor, categorical_tensor, output_tensor,
        continuous_cols, categorical_cols, vocab_sizes, label_encoders)``:
        float32 standardized inputs, int64 embedding indices, float32
        targets, the two column-name lists, the embedding table size per
        categorical column, and the fitted ``LabelEncoder`` per column.
    """
    # Offset added to LabelEncoder codes so indices 0 (padding) and
    # 1 (unknown) are never assigned to a real category.
    num_special_tokens = 2

    # Define input and output features
    input_features = ['Year', 'DOT_ID_Reporting_Airline', 'Tail_Number', 'Flight_Number_Reporting_Airline',
                    'OriginAirportID','OriginCityName','DestAirportID','DestCityName', 'CRSDepTime', 'TaxiOut',
                    'WheelsOff', 'WheelsOn', 'TaxiIn', 'CRSArrTime', 'CRSElapsedTime', 'Flights', 'Distance',
                      'Latitude', 'Longitude',  'LatitudeDest', 'LongitudeDest', 'Airline_Ranking', 'tavg', 'tmin', 'tmax', 'prcp', 'snow',
                    'wdir', 'wspd', 'pres', 'Month', 'Day','Quarter','Week']
    output_features = ['DepTime', 'DepDelayMinutes','DepartureDelayGroups', 'ArrTime', 'ArrDelayMinutes', 'Diverted', 'ActualElapsedTime',
                    'AirTime', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
                    'CancellationCode_encoded']

    # Separate categorical and continuous columns
    categorical_cols = ['Tail_Number','OriginCityName', 'DestCityName']
    continuous_cols = [col for col in input_features if col not in categorical_cols]

    # Handle missing values on a copy so the caller's frame is untouched
    df_new = df_new.copy()
    for col in continuous_cols:
        df_new[col] = df_new[col].fillna(df_new[col].mean())
    for col in categorical_cols:
        df_new[col] = df_new[col].fillna('MISSING')
    for col in output_features:
        df_new[col] = df_new[col].fillna(0)

    # Process categorical features
    label_encoders = {}
    categorical_data = []
    vocab_sizes = []

    for col in categorical_cols:
        # Encode on the string form so mixed-type columns sort consistently
        values = df_new[col].astype(str)
        le = LabelEncoder()
        codes = le.fit_transform(values)
        label_encoders[col] = le

        # Table size = real categories + the two reserved slots
        vocab_sizes.append(len(le.classes_) + num_special_tokens)

        # Shift past the reserved slots so no category lands on padding_idx 0
        categorical_data.append(codes + num_special_tokens)

    # Convert categorical data to tensor format: one column per feature
    categorical_tensor = torch.tensor(np.stack(categorical_data, axis=1), dtype=torch.long)

    # Normalize continuous features
    scaler = StandardScaler()
    continuous_tensor = torch.tensor(
        scaler.fit_transform(df_new[continuous_cols].values),
        dtype=torch.float32
    )

    # Convert output features to tensor (targets are left unscaled)
    output_tensor = torch.tensor(df_new[output_features].values, dtype=torch.float32)

    return continuous_tensor, categorical_tensor, output_tensor, continuous_cols, categorical_cols, vocab_sizes, label_encoders

class MultiTaskFlightDelayModel(nn.Module):
    """Shared-trunk regressor with one scalar head per output.

    Each categorical column gets its own embedding table (row 0 is the
    padding row). The embeddings are concatenated with the continuous
    features, passed through two Linear -> BatchNorm1d -> ReLU -> Dropout(0.2)
    stages, and then fed to ``num_outputs`` independent ``Linear(hidden_dim, 1)``
    heads whose results are concatenated column-wise.

    Args:
        continuous_dim: number of continuous input features.
        vocab_sizes: embedding table size for each categorical column.
        hidden_dim: width of both shared hidden layers.
        num_outputs: number of regression heads.
        embedding_dim: size of every categorical embedding vector.
    """

    def __init__(self, continuous_dim, vocab_sizes, hidden_dim, num_outputs, embedding_dim=8):
        super().__init__()

        # One lookup table per categorical feature
        tables = [nn.Embedding(size, embedding_dim, padding_idx=0) for size in vocab_sizes]
        self.embeddings = nn.ModuleList(tables)

        # Width of the vector entering the shared trunk
        trunk_in = continuous_dim + embedding_dim * len(vocab_sizes)

        # Shared trunk: two identical stages, the first adapting the input width
        stages = []
        stage_in = trunk_in
        for _ in range(2):
            stages.extend([
                nn.Linear(stage_in, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(0.2),
            ])
            stage_in = hidden_dim
        self.network = nn.Sequential(*stages)

        # One single-unit head per task
        heads = [nn.Linear(hidden_dim, 1) for _ in range(num_outputs)]
        self.output_layers = nn.ModuleList(heads)

    def forward(self, continuous_x, categorical_x):
        """Return a ``(batch, num_outputs)`` tensor of per-task predictions.

        ``categorical_x[:, i]`` is looked up in the i-th embedding table.
        """
        # Look up every categorical column in its own table
        looked_up = [
            table(categorical_x[:, column])
            for column, table in enumerate(self.embeddings)
        ]
        embedded_categorical = torch.cat(looked_up, dim=1)

        # Continuous features first, embeddings after
        trunk_input = torch.cat([continuous_x, embedded_categorical], dim=1)
        shared_features = self.network(trunk_input)

        # Each head emits one column; stitch them together
        per_task = [head(shared_features) for head in self.output_layers]
        return torch.cat(per_task, dim=1)

class FlightDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(continuous, categorical, target)`` triples.

    The three stores are indexed with the same ``idx``; the dataset length is
    taken from ``targets``.
    """

    def __init__(self, continuous_data, categorical_data, targets):
        self.continuous_data, self.categorical_data, self.targets = (
            continuous_data,
            categorical_data,
            targets,
        )

    def __len__(self):
        # The target store defines how many samples exist
        return len(self.targets)

    def __getitem__(self, idx):
        continuous_row = self.continuous_data[idx]
        categorical_row = self.categorical_data[idx]
        target_row = self.targets[idx]
        return continuous_row, categorical_row, target_row

def train_model(model, train_loader, criterion, optimizer, device, num_epochs=20):
    """Train ``model`` in place and return the average loss of every epoch.

    Each batch from ``train_loader`` must unpack to
    ``(continuous_x, categorical_x, targets)``; the model is called as
    ``model(continuous_x, categorical_x)``. Progress is printed every 100
    batches and once per epoch.

    Args:
        model: module to optimize; it is switched to training mode.
        train_loader: sized iterable of batches (``len()`` must work).
        criterion: loss callable taking ``(outputs, targets)``.
        optimizer: optimizer already bound to ``model``'s parameters.
        device: device the batch tensors are moved to.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        List with one float per epoch: the mean batch loss of that epoch.

    Raises:
        ValueError: if ``train_loader`` contains no batches.
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        # Fail fast with a clear message instead of dividing by zero later
        raise ValueError("train_loader yields no batches; cannot train on an empty dataset")

    epoch_losses = []
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for batch_idx, (continuous_x, categorical_x, targets) in enumerate(train_loader):
            continuous_x = continuous_x.to(device)
            categorical_x = categorical_x.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            outputs = model(continuous_x, categorical_x)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # .item() forces a device sync, so read it only once per batch
            batch_loss = loss.item()
            total_loss += batch_loss

            if (batch_idx + 1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{num_batches}], Loss: {batch_loss:.4f}')

        avg_loss = total_loss / num_batches
        epoch_losses.append(avg_loss)
        print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}')

    return epoch_losses

def evaluate_model(model, test_loader, device):
    """Return the RMSE of ``model`` over every batch in ``test_loader``.

    The model is put in eval mode (and left there) and run without gradient
    tracking. Predictions and targets from all batches are stacked and passed
    to ``mean_squared_error``; the square root of that value is returned, so
    all output columns contribute to a single score.
    """
    model.eval()
    predicted_batches, actual_batches = [], []

    with torch.no_grad():
        for batch in test_loader:
            continuous_x, categorical_x, targets = batch
            # Only the inputs need to live on the model's device
            batch_output = model(continuous_x.to(device), categorical_x.to(device))
            predicted_batches.append(batch_output.cpu().numpy())
            actual_batches.append(targets.cpu().numpy())

    y_pred = np.concatenate(predicted_batches)
    y_true = np.concatenate(actual_batches)
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def main(df_new, batch_size=64, hidden_dim=128, learning_rate=0.001, num_epochs=20):
    """Prepare the data, train a MultiTaskFlightDelayModel and report its RMSE.

    Note that the RMSE printed at the end is computed on the same shuffled
    loader that was used for training, not on held-out data.

    Args:
        df_new: raw flight DataFrame accepted by ``prepare_data``.
        batch_size: mini-batch size for the DataLoader.
        hidden_dim: width of the model's shared hidden layers.
        learning_rate: Adam learning rate.
        num_epochs: number of training epochs.

    Returns:
        ``(model, label_encoders)``: the trained model and the fitted
        per-column label encoders from ``prepare_data``.
    """
    # Tensors plus the metadata needed to size the network
    (continuous_data, categorical_data, targets,
     continuous_cols, _categorical_cols, vocab_sizes, label_encoders) = prepare_data(df_new)

    # Wrap the tensors in a shuffled mini-batch loader
    data_loader = DataLoader(
        FlightDataset(continuous_data, categorical_data, targets),
        batch_size=batch_size,
        shuffle=True,
    )

    # Prefer the GPU when one is visible
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # One regression head per target column
    model = MultiTaskFlightDelayModel(
        continuous_dim=len(continuous_cols),
        vocab_sizes=vocab_sizes,
        hidden_dim=hidden_dim,
        num_outputs=targets.shape[1],
    )
    model = model.to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Fit, then score on the loader that was just trained on
    train_model(model, data_loader, criterion, optimizer, device, num_epochs)
    rmse = evaluate_model(model, data_loader, device)
    print(f'Final RMSE: {rmse:.4f}')

    return model, label_encoders

# Usage example: train on the training split with main()'s default settings.
# NOTE(review): train_df is created by the split cell (In [13]), which sits below
# this cell in the notebook — run that cell first, or move it above this one.
model, label_encoders = main(train_df)

Using device: cpu
Epoch [1/20], Batch [100/5108], Loss: 325661.9688
Epoch [1/20], Batch [200/5108], Loss: 323567.1875
Epoch [1/20], Batch [300/5108], Loss: 339483.5938
Epoch [1/20], Batch [400/5108], Loss: 290737.0312
Epoch [1/20], Batch [500/5108], Loss: 269453.4062
Epoch [1/20], Batch [600/5108], Loss: 265553.9375
Epoch [1/20], Batch [700/5108], Loss: 243443.4531
Epoch [1/20], Batch [800/5108], Loss: 218849.5781
Epoch [1/20], Batch [900/5108], Loss: 213753.7500
Epoch [1/20], Batch [1000/5108], Loss: 219944.2656
Epoch [1/20], Batch [1100/5108], Loss: 199496.1094
Epoch [1/20], Batch [1200/5108], Loss: 162956.5469
Epoch [1/20], Batch [1300/5108], Loss: 133247.1719
Epoch [1/20], Batch [1400/5108], Loss: 107091.9375
Epoch [1/20], Batch [1500/5108], Loss: 110231.3750
Epoch [1/20], Batch [1600/5108], Loss: 102975.6953
Epoch [1/20], Batch [1700/5108], Loss: 68521.2656
Epoch [1/20], Batch [1800/5108], Loss: 76828.3359
Epoch [1/20], Batch [1900/5108], Loss: 62334.5938
Epoch [1/20], Batch [2000

In [13]:
import pandas as pd


df = df_new

# Define the split ratio (e.g., 80% train, 20% test)
split_ratio = 0.8
split_index = int(len(df) * split_ratio)

# Positional, unshuffled split: the first 80% of rows train, the rest test.
# reset_index(drop=True) returns a new frame, so each split is an independent
# object rather than a slice of df_new mutated in place.
train_df = df.iloc[:split_index].reset_index(drop=True)
test_df = df.iloc[split_index:].reset_index(drop=True)

# Check the shape of the splits
print("Training set shape:", train_df.shape)
print("Testing set shape:", test_df.shape)



Training set shape: (326850, 50)
Testing set shape: (81713, 50)
