In [None]:
# Step 1: Install Dependencies
!pip install kaggle openai pandas tqdm



In [None]:
# Step 2: Upload Kaggle API key (You will be prompted to upload kaggle.json)
from google.colab import files

# Bind the return value instead of leaving the call as the cell's last
# expression. A bare last expression is echoed into the cell output, which
# writes the uploaded file's contents (i.e. the Kaggle API key) into the saved
# notebook for anyone who receives it.
_uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'<REDACTED: credential removed from saved output; rotate this Kaggle API key>'}

In [None]:
# Move kaggle.json to the directory under the home folder that the commands
# below create and populate (~/.kaggle).
# -p: create the directory if needed and do not fail when it already exists.
!mkdir -p ~/.kaggle
# Move the uploaded credentials file out of the current working directory.
!mv kaggle.json ~/.kaggle/
# Mode 600: readable and writable by the owning user only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Step 3: Download dataset from Kaggle
!kaggle datasets download -d gauravduttakiit/flight-distance-predictions

# Unzip dataset
!unzip flight-distance-predictions.zip -d dataset

Dataset URL: https://www.kaggle.com/datasets/gauravduttakiit/flight-distance-predictions
License(s): apache-2.0
Downloading flight-distance-predictions.zip to /content
  0% 0.00/955k [00:00<?, ?B/s]
100% 955k/955k [00:00<00:00, 22.9MB/s]
Archive:  flight-distance-predictions.zip
  inflating: dataset/Dataset/Submission.csv  
  inflating: dataset/Dataset/Test.csv  
  inflating: dataset/Dataset/Train.csv  


In [None]:
# Step 4: Install & Authenticate OpenAI
!pip install --upgrade openai
import openai
import json
import pandas as pd
from tqdm import tqdm

# Set OpenAI API Key
openai.api_key = "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"

Collecting openai
  Downloading openai-1.66.5-py3-none-any.whl.metadata (24 kB)
Downloading openai-1.66.5-py3-none-any.whl (571 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.1/571.1 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.61.1
    Uninstalling openai-1.61.1:
      Successfully uninstalled openai-1.61.1
Successfully installed openai-1.66.5


In [None]:
# Step 5: Load & Preprocess Dataset
# Columns kept for fine-tuning: the route descriptors used to build the prompt
# plus the target column "Flight_Distance".
selected_columns = [
    "Origin Airport Code", "Destination Airport Code",
    "Origin Latitude", "Origin Longitude",
    "Destination Latitude", "Destination Longitude",
    "Great Circle Distance", "Timezone Difference",
    "Continent Origin", "Continent Destination", "Route Popularity",
    "Flight_Distance",
]

# One pipeline, same order of operations as before:
#   1. read the training CSV
#   2. drop every row that has a missing value in any column
#   3. keep only the first 100 remaining records for fine-tuning
#   4. project onto the selected columns
df = (
    pd.read_csv("dataset/Dataset/Train.csv")
    .dropna()
    .head(100)
    [selected_columns]
)

In [None]:
# Step 6: Convert dataset to OpenAI fine-tuning format
def _to_chat_example(record):
    """Build one chat-format training example from a single row of `df`.

    The user message describes the route; the assistant message is the row's
    "Flight_Distance" value rendered as a string.
    """
    user_text = (f"Predict flight distance for: "
                 f"Origin: {record['Origin Airport Code']}, "
                 f"Destination: {record['Destination Airport Code']}, "
                 f"Latitude: {record['Origin Latitude']} -> {record['Destination Latitude']}, "
                 f"Longitude: {record['Origin Longitude']} -> {record['Destination Longitude']}, "
                 f"Great Circle Distance: {record['Great Circle Distance']}, "
                 f"Timezone Difference: {record['Timezone Difference']}, "
                 f"Continent: {record['Continent Origin']} -> {record['Continent Destination']}, "
                 f"Route Popularity: {record['Route Popularity']}. "
                 f"What is the expected flight distance?")
    answer_text = str(record["Flight_Distance"])  # the target must be a string
    return {
        "messages": [
            {"role": "system", "content": "You are an AI that predicts flight distances accurately."},
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": answer_text}
        ]
    }


# One example per row; tqdm wraps the row iterator to show a progress bar.
training_data = [
    _to_chat_example(record)
    for _, record in tqdm(df.iterrows(), total=len(df))
]

# Save as JSONL: one JSON object per line, written as UTF-8 without escaping
# non-ASCII characters.
jsonl_file = "flight_distance_training_fixed.jsonl"
with open(jsonl_file, "w", encoding="utf-8") as out:
    out.writelines(
        json.dumps(example, ensure_ascii=False) + "\n"
        for example in training_data
    )

print("✅ Training data saved as:", jsonl_file)

100%|██████████| 100/100 [00:00<00:00, 8485.68it/s]

✅ Training data saved as: flight_distance_training_fixed.jsonl





In [None]:
# Step 7: Upload File to OpenAI (New API method)
# Open the training file in a context manager so the handle is closed as soon
# as the upload call returns or raises, instead of staying open for the rest of
# the kernel session.
with open(jsonl_file, "rb") as training_fh:
    upload_response = openai.files.create(
        file=training_fh,
        purpose="fine-tune"
    )

# The returned file ID is what the fine-tuning job references.
file_id = upload_response.id
print(f"✅ File uploaded successfully! File ID: {file_id}")

✅ File uploaded successfully! File ID: file-DoxtjWMA6bSk2dSEjepTRo


In [None]:
# Step 8: Start Fine-Tuning (New API method)
# Request parameters: the uploaded training file and the base model to tune.
job_request = {
    "training_file": file_id,
    "model": "gpt-3.5-turbo",
}
fine_tune_response = openai.fine_tuning.jobs.create(**job_request)

# Keep the job ID; it is needed to look the job up again later.
fine_tune_id = fine_tune_response.id
print(f"🚀 Fine-tuning started! Fine-tune ID: {fine_tune_id}")

🚀 Fine-tuning started! Fine-tune ID: ftjob-MOCdsYrMBwwNMrRTK5fQuEKF


In [None]:
# Step 9: Monitor Fine-Tuning Progress (New API method)
import time

# Statuses after which the job will not change any more. "cancelled" has to be
# part of this set: without it, a cancelled job would never satisfy the exit
# condition and the loop would poll forever.
TERMINAL_STATUSES = ("succeeded", "failed", "cancelled")

# Poll once per minute until the job reaches a terminal status. `status` is
# left holding the final job object for the cells that follow.
while True:
    status = openai.fine_tuning.jobs.retrieve(fine_tune_id)
    if status.status in TERMINAL_STATUSES:
        if status.status == "succeeded":
            print(f"✅ Fine-tuning completed with status: {status.status}")
        else:
            # Do not report a failed/cancelled job with a success marker.
            print(f"❌ Fine-tuning ended with status: {status.status}")
        break
    print(f"⏳ Fine-tuning in progress: {status.status}")
    time.sleep(60)

⏳ Fine-tuning in progress: validating_files
⏳ Fine-tuning in progress: validating_files
⏳ Fine-tuning in progress: running
⏳ Fine-tuning in progress: running
⏳ Fine-tuning in progress: running
⏳ Fine-tuning in progress: running
⏳ Fine-tuning in progress: running
⏳ Fine-tuning in progress: running
⏳ Fine-tuning in progress: running
⏳ Fine-tuning in progress: running
⏳ Fine-tuning in progress: running
✅ Fine-tuning completed with status: succeeded


In [None]:
# Step 10: Get the Fine-Tuned Model ID
fine_tuned_model_id = status.fine_tuned_model

# Fail here, with a clear message, when the job did not produce a model;
# otherwise this cell would print "Use model ID: None" and the problem would
# only surface later as an obscure API error.
if not fine_tuned_model_id:
    raise RuntimeError(
        f"No fine-tuned model available (job status: {status.status!r})."
    )

print(f"🎉 Fine-tuning finished! Use model ID: {fine_tuned_model_id}")

🎉 Fine-tuning finished! Use model ID: ft:gpt-3.5-turbo-0125:student::BCgDymVJ


In [None]:
# Step 11: Test Fine-Tuned Model with Sample Data
# The sample is the AUH -> ISB route, written with full airport names. An
# alternative sample with the airport codes "ORG754" / "DST883" was
# previously kept here as commented-out code.
query = "".join([
    "Predict flight distance for: ",
    "Origin: Abu Dhabi International Airport (AUH), Destination: Islamabad International Airport (ISB), ",
    "Latitude: 24.4539 -> 33.6844, Longitude: 54.3773 -> 73.0479, ",
    "Great Circle Distance: 2051.75, Timezone Difference: 1, ",
    "Continent: Asia -> Asia, Route Popularity: 500. ",
    "What is the expected flight distance?",
])

# Conversation sent to the model: a system message followed by the user query.
chat_messages = [
    {"role": "system", "content": "You are an AI that predicts flight distances accurately."},
    {"role": "user", "content": query},
]

response = openai.chat.completions.create(
    model=fine_tuned_model_id,
    messages=chat_messages,
)

# The prediction is the text of the first returned choice.
predicted_text = response.choices[0].message.content
print("📌 Predicted Flight Distance:", predicted_text)

📌 Predicted Flight Distance: 2082.852
