In [None]:
import pandas as pd
import gdown
import os

# Download the flights CSV from Google Drive, report its missing values and
# dimensions, keep only the columns listed in `columns_to_keep`, drop rows with
# any missing value in them, save the result, and report on the saved file.

# File ID from Google Drive
file_id = '1BX8Uce2Kj9sFNvyQaoKjoHClW-fK0VOn'

# Construct the Google Drive download URL
url = f'https://drive.google.com/uc?id={file_id}'

# Local path to save the file
destination = 'flights.csv'

# Both airport columns are explicitly read as strings, for the raw and the
# cleaned file alike.
airport_dtypes = {'ORIGIN_AIRPORT': str, 'DESTINATION_AIRPORT': str}


def _print_summary(frame):
    """Print per-column missing-value counts and the frame's dimensions.

    Returns:
        A tuple ``(missing, (rows, columns))`` where ``missing`` is the
        result of ``frame.isnull().sum()`` and the second item is
        ``frame.shape``.
    """
    missing = frame.isnull().sum()
    print("Missing values in each column:")
    print(missing)

    rows, columns = frame.shape
    print(f"\nNumber of rows: {rows}")
    print(f"Number of columns: {columns}")
    return missing, (rows, columns)


# Download the file
print("Downloading file from Google Drive...")
downloaded_path = gdown.download(url, destination, quiet=False)

# Check that the download succeeded and that the file exists and has content.
# The return value is checked as well because a leftover file from an earlier
# run would otherwise pass the existence/size test after a failed download.
# NOTE(review): relies on gdown returning None on failure (older releases) or
# raising (newer releases) -- confirm against the installed gdown version.
download_ok = (
    downloaded_path is not None
    and os.path.exists(destination)
    and os.path.getsize(destination) > 0
)
if not download_ok:
    print("File download failed or file is empty.")
    # `exit()` is an interactive helper injected by the `site` module and
    # exits with status 0; raise SystemExit directly with a failure status.
    raise SystemExit(1)
print("Download complete.")

# Read the CSV file
try:
    data = pd.read_csv(destination, dtype=airport_dtypes, low_memory=False)
except Exception as e:
    # Stop here: continuing would only fail later with a NameError on `data`,
    # hiding the real cause.
    print(f"Error reading the file: {str(e)}")
    raise SystemExit(1) from e
print(data.head())  # Display the first few rows to verify

# Check missing values in all columns and print the number of rows and columns
missing_values, (num_rows, num_columns) = _print_summary(data)

# Keep only relevant columns
columns_to_keep = [
    'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
    'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
    'DEPARTURE_TIME', 'DEPARTURE_DELAY'
]
# Fail with an explicit message when the downloaded file does not have the
# expected schema (e.g. the download returned something other than the CSV).
missing_columns = [name for name in columns_to_keep if name not in data.columns]
if missing_columns:
    raise KeyError(f"Downloaded file is missing expected columns: {missing_columns}")
data = data[columns_to_keep]

# Remove rows with NaNs in required columns
data_cleaned = data.dropna(subset=columns_to_keep)

# Save the new file
cleaned_file_path = 'flights_cleaned.csv'
data_cleaned.to_csv(cleaned_file_path, index=False)

print(f"Data cleaned and saved to {cleaned_file_path}")

# Read the cleaned file back so the report below describes what is on disk
data = pd.read_csv(cleaned_file_path, dtype=airport_dtypes, low_memory=False)

# Check missing values again and print the number of rows and columns
missing_values, (num_rows, num_columns) = _print_summary(data)

: 