In [1]:
# Install dependencies
!pip install -q kagglehub[pandas-datasets]

import kagglehub
from kagglehub import KaggleDatasetAdapter
import os
import pandas as pd

# Download dataset (returns local directory path)
path = kagglehub.dataset_download("shaivyac/us-airline-dataset")

# List files in the dataset to see what’s available
print("Files downloaded:")
print(os.listdir(path))

# Now, load a specific file (replace with actual filename you see printed)
file_to_load = os.path.join(path, "Airline_dataset.csv")  # Corrected filename

# Load the dataset using pandas directly from the local path

df = pd.read_csv(file_to_load)

Downloading from https://www.kaggle.com/api/v1/datasets/download/shaivyac/us-airline-dataset?dataset_version_number=2...


100%|██████████| 22.1M/22.1M [00:00<00:00, 102MB/s] 

Extracting files...





Files downloaded:
['Airline_dataset.csv']


In [6]:
# Install streamlit and pyngrok (tested with pyngrok==5.3.0)
!pip install -q streamlit pyngrok plotly altair pandas numpy


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m90.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Download ngrok v3 (stable) and make it executable
# wget: -q silences progress output, -O writes the archive to ngrok.zip.
!wget -q -O ngrok.zip https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.zip
# unzip -o overwrites an existing ./ngrok without prompting, so this cell can be re-run.
!unzip -o ngrok.zip
!chmod +x ngrok


Archive:  ngrok.zip
  inflating: ngrok                   


In [4]:
NGROK_AUTH_TOKEN = "351ATjzi6BEW7Errqhrv2uo3bgR_7BwCfCbT9AfAdCEgbrYxp"  # <-- REPLACE this

# Add auth token to the ngrok binary we downloaded
!./ngrok config add-authtoken $NGROK_AUTH_TOKEN


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [5]:
import pickle

# Persist the notebook-scope DataFrame `df` to disk; app.py loads this file
# ("df.pkl") when no upload or generated sample is selected.
row_count = len(df)
with open("df.pkl", "wb") as pickle_file:
    pickle.dump(df, pickle_file)
print("Saved df -> df.pkl (rows: {})".format(row_count))


Saved df -> df.pkl (rows: 1204825)


In [7]:
%%writefile app.py
# Full Streamlit Flight Dashboard (multi-page)
# Expects a file "df.pkl" in the same folder (pickled pandas DataFrame).
# Run with: streamlit run app.py

import streamlit as st
import pandas as pd
import numpy as np
import datetime
import plotly.express as px
import plotly.graph_objects as go
import altair as alt

# Page-level settings (wide layout and page title). This is the first Streamlit
# call in the script, ahead of every sidebar/page widget below.
st.set_page_config(layout="wide", page_title="Flight Dashboard (Colab)")

# -------------------------
# Utilities
# -------------------------
def safe_int_cast_col(df, cols):
    """Best-effort conversion of the listed columns to pandas' nullable ``Int64``.

    Columns named in ``cols`` that are absent from ``df`` are skipped. Float
    columns are converted directly when every non-null value is whole, and
    rounded first otherwise; integer columns are converted as-is. Columns of
    any other dtype are left untouched.

    The frame is modified in place and also returned. A column whose
    conversion raises is silently left unchanged.
    """
    for name in cols:
        if name not in df.columns:
            continue
        series = df[name]
        try:
            if pd.api.types.is_float_dtype(series.dtype):
                all_whole = (series.dropna() % 1 == 0).all()
                source = series if all_whole else series.round()
                df[name] = source.astype("Int64")
            elif pd.api.types.is_integer_dtype(series.dtype):
                df[name] = series.astype("Int64")
        except Exception:
            # Deliberately best-effort: keep the original column on any failure.
            continue
    return df

def parse_date_column(df):
    """Locate the flight-date column, parse it to datetimes in place, and return it.

    Column selection:
      1. a column named ``FL_DATE`` (case-insensitive) is preferred;
      2. otherwise the first column whose name contains ``"date"``.

    Preferring the exact name matters because the substring test also matches
    unrelated names (for example ``LAST_UPDATED`` contains "date"); parsing
    such a column with ``errors='coerce'`` would overwrite its values with NaT.

    Returns:
        ``(df, column_name)``; ``column_name`` is ``None`` when no candidate
        exists or when the chosen column cannot be parsed at all, in which
        case the column is left untouched.
    """
    exact = [c for c in df.columns if str(c).upper() == 'FL_DATE']
    loose = [c for c in df.columns if 'date' in str(c).lower()]
    candidates = exact + [c for c in loose if c not in exact]
    if not candidates:
        return df, None
    col = candidates[0]
    try:
        # Unparseable entries become NaT rather than raising.
        df[col] = pd.to_datetime(df[col], errors='coerce')
    except Exception:
        try:
            # Fall back to an explicit month/day/two-digit-year format.
            df[col] = pd.to_datetime(df[col], format='%m/%d/%y', errors='coerce')
        except Exception:
            # Repeating the first call would only fail again; report "no usable
            # date column" so callers can degrade gracefully.
            return df, None
    return df, col

def time_of_day_from_hhmm(x):
    """Convert an HHMM clock value to a ``datetime.time``.

    Args:
        x: A number or numeric string in HHMM form (``649``, ``649.0``,
           ``"0649"``), or any other value ``pd.to_datetime`` can parse
           (e.g. ``"12:30"``).

    Returns:
        ``datetime.time`` for valid input. ``2400`` maps to ``00:00``.
        ``pd.NaT`` for missing values and for anything that is not a valid
        clock time (negative numbers, minutes above 59, hours above 24,
        unparseable text).
    """
    try:
        if pd.isna(x):
            return pd.NaT
    except (TypeError, ValueError):
        # pd.isna returned an array (non-scalar input): not a clock value.
        return pd.NaT

    try:
        numeric = float(x)
    except (TypeError, ValueError):
        numeric = None

    if numeric is not None:
        # Numeric input is handled entirely here. It must never reach
        # pd.to_datetime, which would read the number as nanoseconds since the
        # epoch and return a bogus time just after midnight.
        try:
            hhmm = int(numeric)
        except (OverflowError, ValueError):
            # inf / nan
            return pd.NaT
        if hhmm < 0:
            return pd.NaT
        hh, mm = divmod(hhmm, 100)
        if mm > 59 or hh > 24 or (hh == 24 and mm != 0):
            return pd.NaT
        return datetime.time(hour=hh % 24, minute=mm)

    # Non-numeric input: let pandas try to parse it as a timestamp.
    try:
        return pd.to_datetime(x).time()
    except Exception:
        return pd.NaT

def add_derived_columns(df):
    """Add the helper columns the dashboard pages rely on and return the frame.

    Columns written (the input frame is modified in place):
      * ``__date_col``, ``year``, ``month``, ``day``, ``dow`` - derived from the
        column found by ``parse_date_column``; the last four are filled with
        ``pd.NA`` when no usable date column exists.
      * ``__dep_time_raw``, ``__dep_time_time``, ``dep_hour`` - derived from the
        departure-time column; only ``dep_hour`` (as ``pd.NA``) is written when
        none exists.
      * ``delay`` - a copy of ``ARR_DELAY`` if present, else ``DEP_DELAY``, else
        the first column whose name contains "delay", else ``pd.NA``.

    Columns whose name ends in ``_ID`` are additionally passed through
    ``safe_int_cast_col``.
    """
    df, date_col = parse_date_column(df)
    # Only use the .dt accessor when the column really holds datetimes.
    if date_col and pd.api.types.is_datetime64_any_dtype(df[date_col]):
        df['__date_col'] = df[date_col]
        df['year'] = df['__date_col'].dt.year
        # First day of the month, as a timestamp.
        df['month'] = df['__date_col'].dt.to_period('M').dt.to_timestamp()
        df['day'] = df['__date_col'].dt.date
        df['dow'] = df['__date_col'].dt.day_name()
    else:
        df['year'] = pd.NA
        df['month'] = pd.NA
        df['day'] = pd.NA
        df['dow'] = pd.NA

    id_cols = [c for c in df.columns if c.upper().endswith('_ID') or c.upper().endswith('SEQ_ID')]
    df = safe_int_cast_col(df, id_cols)

    # Prefer the column named exactly DEP_TIME; a prefix match alone could pick
    # a sibling such as DEP_TIME_BLK if it happens to come first.
    dep_exact = [c for c in df.columns if c.upper() == 'DEP_TIME']
    dep_prefixed = [c for c in df.columns if c.upper().startswith('DEP_TIME')]
    dep_candidates = dep_exact or dep_prefixed
    if dep_candidates:
        depc = dep_candidates[0]
        df['__dep_time_raw'] = df[depc]
        df['__dep_time_time'] = df['__dep_time_raw'].apply(time_of_day_from_hhmm)
        df['dep_hour'] = df['__dep_time_time'].apply(lambda t: t.hour if pd.notna(t) else pd.NA)
    else:
        df['dep_hour'] = pd.NA

    delay_cols = [c for c in df.columns if 'delay' in c.lower()]
    if delay_cols:
        for c in ['ARR_DELAY', 'DEP_DELAY']:
            if c in df.columns:
                df['delay'] = df[c]
                break
        if 'delay' not in df.columns:
            df['delay'] = df[delay_cols[0]]
    else:
        df['delay'] = pd.NA

    return df

def agg_routes(df, origin_col, dest_col, top_n=20):
    """Return the ``top_n`` most frequent (origin, destination) pairs.

    The result has the two route columns plus a ``count`` column, ordered from
    the most to the least frequent pair. When either column name is ``None``
    an empty frame with those column labels is returned instead.
    """
    have_both = origin_col is not None and dest_col is not None
    if not have_both:
        return pd.DataFrame(columns=[origin_col, dest_col, 'count'])

    route_keys = [origin_col, dest_col]
    pair_sizes = df.groupby(route_keys).size()
    counted = pair_sizes.reset_index(name='count')
    ranked = counted.sort_values('count', ascending=False)
    return ranked.head(top_n)

# -------------------------
# Load data
# -------------------------
# Data source priority: uploaded file > generated sample > df.pkl on disk.
st.sidebar.header("Data")
st.sidebar.write("App expects file: df.pkl (pickled pandas DataFrame)")
uploaded = st.sidebar.file_uploader("Upload alternative df.pkl or CSV", type=['pkl','pickle','csv','parquet'])
use_sample = st.sidebar.checkbox("Use small generated sample", value=False)

df = None

if uploaded is not None:
    try:
        # Compare extensions case-insensitively so "DATA.CSV" is not unpickled.
        upload_name = uploaded.name.lower()
        if upload_name.endswith('.csv'):
            candidate = pd.read_csv(uploaded)
        elif upload_name.endswith('.parquet'):
            candidate = pd.read_parquet(uploaded)
        else:
            import pickle
            # SECURITY: unpickling executes arbitrary code embedded in the file.
            # Only upload pickles you created yourself; prefer CSV/parquet for
            # anything obtained from someone else.
            candidate = pickle.load(uploaded)
        # A pickle can hold any object; everything below requires a DataFrame.
        if not isinstance(candidate, pd.DataFrame):
            raise TypeError(f"expected a pandas DataFrame, got {type(candidate).__name__}")
        df = candidate
        st.sidebar.success(f"Loaded {uploaded.name} with {len(df):,} rows")
    except Exception as e:
        st.sidebar.error(f"Could not load uploaded file: {e}")

if df is None and use_sample:
    # Fixed seed: Streamlit re-executes this script on every interaction, and an
    # unseeded sample would change underneath the user on each rerun.
    sample_rng = np.random.default_rng(0)
    df = pd.DataFrame({
        'FL_DATE': pd.date_range('2018-08-01', periods=500, freq='D').astype(str),
        'AIRLINE_ID': sample_rng.choice([19805, 19806, 30001], size=500),
        'TAIL_NUM': sample_rng.choice(['N956AN','N973AN','N9006'], size=500),
        'FLIGHT_NUM': sample_rng.choice([1587,1588,1590,1541], size=500),
        'ORIGIN_AIRPORT': sample_rng.choice(['JFK','PHX','CLE','EWR','DFW','ATL','ORD'], size=500),
        'DEST_AIRPORT': sample_rng.choice(['PHX','EWR','DFW','JFK','ATL','ORD'], size=500),
        'DEP_TIME': sample_rng.choice([649,1649,830,1230,2045,559,2330], size=500),
        'DEP_DELAY': sample_rng.normal(loc=10, scale=30, size=500)
    })
    st.sidebar.info("Using generated sample dataset")

if df is None:
    # Try to load df.pkl from disk
    try:
        import pickle
        with open("df.pkl", "rb") as f:
            loaded = pickle.load(f)
        if not isinstance(loaded, pd.DataFrame):
            raise TypeError(f"df.pkl holds a {type(loaded).__name__}, not a pandas DataFrame")
        df = loaded
        st.sidebar.success("Loaded df.pkl from disk")
    except Exception as e:
        st.warning("No df found. Upload a file, enable sample, or create df.pkl in the runtime.")
        # Surface the underlying reason instead of discarding it.
        st.caption(f"Reason: {e}")
        st.stop()

# Shape before derived columns are added (shown in the UI and debug panel).
orig_shape = df.shape
df = add_derived_columns(df)
col_names = list(df.columns)

# Detect the airline, origin and destination columns by name.
# For each role an exact (case-insensitive) match is tried first; otherwise the
# first column whose name merely contains/starts with the keyword is used.
# A role stays None when nothing matches.
airline_col = next(
    (name for name in col_names
     if name.upper() in ('AIRLINE_ID','OP_CARRIER','OP_UNIQUE_CARRIER','UNIQUE_CARRIER','CARRIER')),
    None,
)
if airline_col is None:
    airline_col = next(
        (name for name in col_names
         if 'AIRLINE' in name.upper() or name.upper().startswith('CARRIER')),
        None,
    )

origin_col = next((name for name in col_names if name.upper() in ('ORIGIN_AIRPORT','ORIGIN')), None)
if origin_col is None:
    origin_col = next((name for name in col_names if 'ORIGIN' in name.upper()), None)

dest_col = next((name for name in col_names if name.upper() in ('DEST_AIRPORT','DEST')), None)
if dest_col is None:
    dest_col = next((name for name in col_names if 'DEST' in name.upper()), None)

# -------------------------
# Navigation
# -------------------------
st.sidebar.header("Windows")
tab = st.sidebar.radio("Pages:", ['General', 'Airlines', 'Airports'])

# -------------------------
# General
# -------------------------
if tab == 'General':
    st.title("General — Flight Traffic Overview")
    st.markdown(f"Dataset: {orig_shape[0]:,} rows × {orig_shape[1]:,} cols")
    if '__date_col' in df.columns and df['__date_col'].notna().any():
        # size() counts every row in each week. It does not depend on an origin
        # column existing, and rows with a missing origin are still counted.
        time_agg = df.groupby(pd.Grouper(key='__date_col', freq='W')).size().reset_index(name='flights')
        c1, c2 = st.columns([2,1])
        with c1:
            st.subheader("Flights over time (weekly)")
            fig = px.line(time_agg, x='__date_col', y='flights', title='Flights per week')
            st.plotly_chart(fig, use_container_width=True)
        with c2:
            st.subheader("Top airlines (by count)")
            if airline_col:
                top_air = df[airline_col].value_counts().head(10).reset_index()
                top_air.columns = ['airline','count']
                st.bar_chart(top_air.set_index('airline')['count'])
            else:
                st.info("No airline column detected")
    else:
        st.info("No date column found to show time series. Ensure FL_DATE or a 'date' column exists.")

    st.subheader("Top routes network (Sankey)")
    if origin_col and dest_col:
        routes = agg_routes(df, origin_col, dest_col, top_n=40)
        if len(routes):
            # One Sankey node per distinct airport; links reference nodes by index.
            labels = list(pd.unique(routes[[origin_col,dest_col]].values.ravel()))
            label_idx = {l:i for i,l in enumerate(labels)}
            sources = routes[origin_col].map(label_idx)
            targets = routes[dest_col].map(label_idx)
            values = routes['count']
            sankey = go.Figure(data=[go.Sankey(node=dict(label=labels), link=dict(source=sources, target=targets, value=values))])
            st.plotly_chart(sankey, use_container_width=True)
        else:
            st.info("Not enough route data to draw network")
    else:
        st.info("Origin/destination columns not found to build route network")

    st.subheader("Best airline suggester")
    if origin_col and dest_col:
        with st.form('suggester'):
            col1, col2 = st.columns(2)
            origins = sorted(df[origin_col].dropna().unique().astype(str))
            dests = sorted(df[dest_col].dropna().unique().astype(str))
            pick_origin = col1.selectbox("Origin", options=origins)
            pick_dest = col2.selectbox("Destination", options=dests)
            submitted = st.form_submit_button('Suggest airlines')
        if submitted:
            # The select boxes hold string versions of the values, so compare
            # as strings; otherwise non-string columns would never match.
            route_mask = (
                (df[origin_col].astype(str) == str(pick_origin))
                & (df[dest_col].astype(str) == str(pick_dest))
            )
            sub = df[route_mask]
            if sub.empty:
                st.warning("No flights found for that pair")
            else:
                if airline_col:
                    has_delay = 'delay' in sub.columns and pd.api.types.is_numeric_dtype(sub['delay'])
                    # Only report avg_delay when a numeric delay exists; never
                    # present a flight count under the "avg_delay" label.
                    named_aggs = {'trips': (origin_col, 'count')}
                    if has_delay:
                        named_aggs = {'avg_delay': ('delay', 'mean'), 'trips': (origin_col, 'count')}
                    agg = sub.groupby(airline_col).agg(**named_aggs).reset_index()
                    if has_delay:
                        agg = agg.sort_values(['avg_delay','trips'])
                    else:
                        agg = agg.sort_values('trips', ascending=False)
                    st.dataframe(agg)
                else:
                    st.info("No airline column detected to suggest")
    else:
        st.info("Origin/destination columns not found to suggest airlines")

# -------------------------
# Airlines page
# -------------------------
# `tab` holds exactly one page name, so an independent `if` selects the same
# branch as the former `elif`.
if tab == 'Airlines':
    st.title("Airlines — performance and patterns")
    if not airline_col:
        st.error("No airline column detected in dataset. Unable to show airlines page.")
        st.stop()

    airline = st.selectbox("Select airline", options=sorted(df[airline_col].dropna().unique().astype(str)))
    # Options are strings, so the column is compared as strings too.
    sub = df[df[airline_col].astype(str)==str(airline)]
    st.markdown(f"### Selected: **{airline}** — {len(sub):,} flights in dataset")

    c1,c2 = st.columns(2)
    with c1:
        st.subheader("Flights per year")
        if sub['year'].notna().any():
            year_counts = sub.groupby('year').size().reset_index(name='count')
            st.bar_chart(year_counts.set_index('year')['count'])
        else:
            st.info("No year data available")
    with c2:
        st.subheader("Flights per month")
        if sub['month'].notna().any():
            month_counts = sub.groupby('month').size().reset_index(name='count')
            fig = px.bar(month_counts, x='month', y='count', title='Flights per month')
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.info("No month data")

    st.subheader("Average delay")
    if 'delay' in sub.columns and pd.api.types.is_numeric_dtype(sub['delay']):
        st.metric("Avg delay (minutes)", f"{sub['delay'].mean():.2f}")
        # Hand Altair only the column it plots. Passing the whole filtered frame
        # would serialise every column (including Python date/time objects) to
        # the browser for a one-column histogram.
        delay_values = sub[['delay']].dropna().astype(float)
        hist = alt.Chart(delay_values).mark_bar().encode(
            alt.X('delay', bin=alt.Bin(maxbins=50)),
            y='count()'
        ).properties(height=200)
        st.altair_chart(hist, use_container_width=True)
    else:
        st.info("No numeric delay column found (e.g., ARR_DELAY or DEP_DELAY)")

    st.subheader("Most common trips (top routes)")
    if origin_col and dest_col:
        top_routes = agg_routes(sub, origin_col, dest_col, top_n=50)
        st.dataframe(top_routes)
    else:
        st.info("Origin/destination columns not found")

# -------------------------
# Airports page
# -------------------------
# `tab` holds exactly one page name, so an independent `if` selects the same
# branch as the former `elif`.
if tab == 'Airports':
    st.title("Airports — activity and delays")
    if not origin_col or not dest_col:
        st.error("Origin/destination columns required for Airports page.")
        st.stop()

    # Every airport that appears as an origin or a destination, as strings.
    airport = st.selectbox("Choose airport", options=sorted(pd.unique(pd.concat([df[origin_col].dropna().astype(str), df[dest_col].dropna().astype(str)]))))

    st.subheader("Flights per year/month/day")
    sel_origin = df[df[origin_col].astype(str)==airport]
    sel_dest = df[df[dest_col].astype(str)==airport]
    st.write(f"As origin: {len(sel_origin):,} flights — as destination: {len(sel_dest):,} flights")

    if sel_origin['year'].notna().any():
        year_counts = sel_origin.groupby('year').size().reset_index(name='count')
        st.bar_chart(year_counts.set_index('year')['count'])
    else:
        st.info("No date/year information to compute yearly counts")

    st.subheader("Common airlines at this airport")
    if airline_col:
        origin_airlines = sel_origin[airline_col].value_counts().head(10).reset_index()
        origin_airlines.columns = ['airline','count']
        st.table(origin_airlines)
    else:
        st.info("No airline column detected")

    st.subheader("Airports with most delays (mean delay)")
    if 'delay' in df.columns and pd.api.types.is_numeric_dtype(df['delay']):
        airport_delay = df.groupby(origin_col)['delay'].mean().reset_index().sort_values('delay', ascending=False).head(10)
        st.table(airport_delay)
        st.markdown("**Note**: attributing delays to weather requires a weather field or an external weather join.")
    else:
        st.info("No delay column found to compute airport delays")

    st.subheader("Airports with most departure delays (count)")
    # The generic 'delay' column prefers ARR_DELAY when it exists, so a table
    # labelled "departure delays" should use DEP_DELAY when the dataset has it
    # and fall back to 'delay' only otherwise.
    dep_delay_candidates = [c for c in df.columns if str(c).upper() == 'DEP_DELAY'] + ['delay']
    dep_delay_col = next(
        (c for c in dep_delay_candidates
         if c in df.columns and pd.api.types.is_numeric_dtype(df[c])),
        None,
    )
    if dep_delay_col is not None:
        delayed = df[df[dep_delay_col]>0]
        delayed_counts = delayed.groupby(origin_col).size().reset_index(name='delayed_count').sort_values('delayed_count', ascending=False).head(10)
        st.table(delayed_counts)
    else:
        st.info("No numeric delay column")

# -------------------------
# Debug / info
# -------------------------
# Collapsible panel shown under whichever page is active: the shape recorded
# before derived columns were added, the current dtypes, and the first rows.
debug_panel = st.expander("Debug / Data info")
with debug_panel:
    shape_line = "Original shape: " + str(orig_shape)
    st.write(shape_line)
    st.write("Detected columns and dtypes:")
    column_dtypes = df.dtypes
    st.write(column_dtypes)
    st.write("Sample rows:")
    preview_rows = df.head(5)
    st.dataframe(preview_rows)

# End app


Writing app.py


In [8]:
# Run streamlit in the background using nohup
# stdout and stderr are both redirected to streamlit.log, and the trailing `&`
# returns control to the notebook immediately. The print below therefore does
# not confirm that the server started; check streamlit.log if the app is
# unreachable.
!nohup streamlit run app.py --server.port 8501 > streamlit.log 2>&1 &
print("Streamlit launched (background). Logs written to streamlit.log")


Streamlit launched (background). Logs written to streamlit.log


Close any ngrok tunnels that are still open

In [11]:
from pyngrok import ngrok
from pyngrok.conf import PyngrokConfig

# pyngrok tracks one agent process per ngrok binary path. The tunnels in this
# notebook are opened with the downloaded ./ngrok binary, so the same config
# must be used here. With the default config these calls would only talk to
# (and kill) a different agent, leaving the real tunnel online.
config = PyngrokConfig(ngrok_path="./ngrok")

for t in ngrok.get_tunnels(pyngrok_config=config):
    try:
        print("Disconnecting:", t.public_url)
        ngrok.disconnect(t.public_url, pyngrok_config=config)
    except Exception as e:
        print("Failed to disconnect:", e)

# Kill the background ngrok processes (if any): first the agent started from
# ./ngrok, then any agent started with pyngrok's default binary.
try:
    ngrok.kill(pyngrok_config=config)
    ngrok.kill()
    print("ngrok killed")
except Exception as e:
    print("ngrok.kill() error (if none running it's ok):", e)






ngrok killed


In [13]:
# if streamlit is already running, stop it first (find and kill)
!ps aux | grep streamlit | grep -v grep
# kill the process by PID if present, e.g.:
# !kill -9 <PID>
# then start it again:
!nohup streamlit run app.py --server.port 8501 > streamlit.log 2>&1 &


root         734  2.8  0.5 233804 71228 ?        S    14:48   0:03 /usr/bin/python3 /usr/local/bin/streamlit run app.py --server.port 8501


Open a new ngrok tunnel to the Streamlit app

In [15]:
from pyngrok import ngrok
from pyngrok.conf import PyngrokConfig

config = PyngrokConfig(ngrok_path="./ngrok")  # remove if you didn't use custom binary

# Make the cell re-runnable: stop any agent this kernel already started from
# the same binary so its endpoint is released before a new tunnel is requested.
# Re-running without this fails with ERR_NGROK_334 ("endpoint is already
# online"). If the error persists, the endpoint is held by an agent outside
# this kernel and must be stopped there or from the ngrok dashboard.
ngrok.kill(pyngrok_config=config)

public_url = ngrok.connect(8501, pyngrok_config=config)
print("Public URL:", public_url)




PyngrokNgrokHTTPError: ngrok client exception, API returned 502: {"error_code":103,"status_code":502,"msg":"failed to start tunnel","details":{"err":"failed to start tunnel: The endpoint 'https://substriated-rick-reunitable.ngrok-free.dev' is already online. Either\n1. stop your existing endpoint first, or\n2. start both endpoints with `--pooling-enabled` to load balance between them.\r\n\r\nERR_NGROK_334\r\n"}}


In [14]:
from pyngrok import ngrok
from pyngrok.conf import PyngrokConfig

# Tell pyngrok to use the ngrok binary we downloaded earlier
config = PyngrokConfig(ngrok_path="./ngrok")

# Make the cell re-runnable: stop any agent this kernel already started from
# the same binary so its endpoint is released before a new tunnel is requested.
# Re-running without this fails with ERR_NGROK_334 ("endpoint is already
# online"). If the error persists, the endpoint is held by an agent outside
# this kernel and must be stopped there or from the ngrok dashboard.
ngrok.kill(pyngrok_config=config)

# Open tunnel to port 8501 (streamlit)
public_url = ngrok.connect(8501, pyngrok_config=config)
public_url




PyngrokNgrokHTTPError: ngrok client exception, API returned 502: {"error_code":103,"status_code":502,"msg":"failed to start tunnel","details":{"err":"failed to start tunnel: The endpoint 'https://substriated-rick-reunitable.ngrok-free.dev' is already online. Either\n1. stop your existing endpoint first, or\n2. start both endpoints with `--pooling-enabled` to load balance between them.\r\n\r\nERR_NGROK_334\r\n"}}
