# Greentree LLM Assessment — End-to-End Notebook
Loads settings from `.env`, reads the dataset (Excel or CSV), runs each pipeline step, and shows a few quick views of the results.

In [None]:
# Install the project's requirements plus the notebook-only extras used below
# (ipywidgets for the interactive view, python-dotenv for reading .env).
%pip -q install -r requirements.txt ipywidgets python-dotenv

## 1) Load data via `.env`

In [None]:
import os, pandas as pd
from dotenv import load_dotenv
# Read settings from .env; override=True lets values from .env replace
# variables that are already set in the process environment.
load_dotenv(override=True)

data_path = os.getenv("DATA_PATH")
print("DATA_PATH =", data_path)
if not data_path:
    raise ValueError("DATA_PATH not set. Edit your .env.")

# Route every common Excel workbook extension to read_excel; anything else is
# treated as CSV. (Previously only ".xlsx" was recognised, so ".xls"/".xlsm"
# workbooks were handed to the CSV parser.)
excel_suffixes = (".xlsx", ".xlsm", ".xls")
if data_path.lower().endswith(excel_suffixes):
    df = pd.read_excel(data_path)
else:
    # Try encodings from strictest to most permissive. latin-1 maps every
    # byte to a character, so it acts as the last resort.
    for enc in ("utf-8", "utf-8-sig", "cp1252", "latin-1"):
        try:
            df = pd.read_csv(data_path, encoding=enc)
        except UnicodeDecodeError:
            continue
        # Report the encoding that worked: a silent fallback to a legacy
        # code page can yield mis-decoded characters worth knowing about.
        print("CSV encoding:", enc)
        break
    else:
        # Keeps `df` from being left undefined should the candidate list
        # ever be edited so that no encoding succeeds.
        raise ValueError(f"Could not decode {data_path!r} with any supported encoding.")
print("Rows x Cols:", df.shape)
df.head()

## 2) Import project modules

In [None]:
from pathlib import Path
import sys
# Extend the import path with ./src and with the parent directory (both
# resolved against the current working directory). The `src.`-prefixed
# imports below need a directory that contains `src/` on the path.
# NOTE(review): presumably the ".." entry supports running the notebook from
# a subfolder of the project — confirm against the repository layout.
sys.path.append(str(Path("src").resolve()))
sys.path.append(str(Path("..").resolve()))

# Each pipeline module exposes a `run_and_save` entry point; alias them so the
# cells below can call the steps by distinct names.
from src.sentiment_pipeline import run_and_save as run_sentiment
from src.eda import run_and_save as run_eda
from src.monthly_scoring import run_and_save as run_monthly
from src.ranking import run_and_save as run_rank
from src.flight_risk import run_and_save as run_risk
from src.trend_regression import run_and_save as run_trend
# The ML sentiment module is optional: any failure while importing it is
# reported and recorded in HAS_ML, which the ML cells check before running.
try:
    from src.ml_sentiment import train as train_ml, predict as predict_ml
    HAS_ML = True
except Exception as e:
    print("ML module not available:", e)
    HAS_ML = False


## 3) Run pipeline steps

In [None]:
# Run the sentiment step (`run_and_save` from src.sentiment_pipeline). The call
# returns a pair; the first item is previewed here and reused by the
# interactive cell at the end of the notebook.
# NOTE(review): the function name suggests results are also saved to disk — confirm.
sent_df, cols = run_sentiment()
sent_df.head()

In [None]:
# Run the EDA step; its return value is not used in this notebook, so it is
# bound to the throwaway name `_`.
_ = run_eda()
print("EDA charts saved to outputs/ and visualizations/")

In [None]:
# Run the monthly scoring step. Only the first item of the returned pair is
# kept; the second is discarded. Preview the first rows.
monthly, _ = run_monthly()
monthly.head()

In [None]:
# Run the ranking step. Only the first item of the returned pair is kept;
# the second is discarded. Preview the first rows.
ranking, _ = run_rank()
ranking.head()

In [None]:
# Run the flight-risk step. Only the first item of the returned pair is kept;
# the second is discarded. Preview the first rows.
risk, _ = run_risk()
risk.head()

In [None]:
# Run the trend-regression step and keep both items of the returned pair.
# `trend` is the cell's trailing expression, so Jupyter displays it in full.
trend, monthly_for_trend = run_trend()
trend

## 4) Optional ML sentiment (sklearn)

In [None]:
# Train the optional ML sentiment model and show its report.
if HAS_ML:
    from IPython.display import display

    ml_report = train_ml()
    # Jupyter only echoes a cell's last *top-level* expression; a bare
    # `ml_report` nested inside this `if` would show nothing, so emit it
    # explicitly. Plain-text reports are printed so their line breaks render
    # instead of appearing as escaped "\n" in a repr.
    if isinstance(ml_report, str):
        print(ml_report)
    else:
        display(ml_report)
else:
    print("Skip — ml_sentiment module not found.")

In [None]:
# Run predictions with the optional ML sentiment model and preview them.
if HAS_ML:
    from IPython.display import display

    ml_out, _ = predict_ml()
    # Jupyter only echoes a cell's last *top-level* expression; the preview is
    # nested inside this `if`, so it has to be displayed explicitly.
    display(ml_out.head())
else:
    # Mirror the training cell so a skipped step is visible rather than silent.
    print("Skip — ml_sentiment module not found.")

## 5) Quick interactive exploration

In [None]:
import matplotlib.pyplot as plt
import ipywidgets as W
from IPython.display import display

# Work on a copy so the derived column below is not added to sent_df.
# Note: this rebinds `df`, which held the raw dataset loaded in the first cell.
df = sent_df.copy()
if "date" in df.columns:
    # Unparseable dates become NaT instead of raising.
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    # Build a "YYYY-MM" key. strftime keeps NaT as a missing value, so rows
    # without a usable date are dropped by groupby; converting a Period to str
    # would instead create a literal "NaT" bucket that shows up as a fake month.
    df["year_month"] = df["date"].dt.strftime("%Y-%m")

# View selector and the output area the selected chart is rendered into.
btns = W.ToggleButtons(options=["Label Distribution","Compound Histogram","Monthly Avg"], value="Label Distribution")
out = W.Output()

def draw(view):
    """Render the selected quick view into the ``out`` widget.

    Reads the module-level ``df`` and replaces whatever ``out`` currently
    shows.

    Args:
        view: One of "Label Distribution" or "Compound Histogram"; any other
            value selects the monthly-average view.

    A view whose required column is missing, or that has no data to plot,
    prints a short message instead of raising, matching how the
    monthly-average view already handled a missing date column.
    """
    out.clear_output()
    with out:
        if view == "Label Distribution":
            if "sentiment_label" not in df.columns:
                print("No sentiment_label column available.")
                return
            series = df["sentiment_label"].value_counts().sort_index()
            plot_kwargs = {"kind": "bar"}
        elif view == "Compound Histogram":
            if "compound" not in df.columns:
                print("No compound column available.")
                return
            series = df["compound"]
            plot_kwargs = {"kind": "hist", "bins": 30}
        else:
            if "year_month" not in df.columns:
                print("No date column available.")
                return
            if "compound" not in df.columns:
                print("No compound column available.")
                return
            series = df.groupby("year_month")["compound"].mean()
            plot_kwargs = {"marker": "o"}

        # Skip plotting when nothing is left to draw rather than handing an
        # empty series to the plotting call.
        if series.empty:
            print("No data to plot.")
            return

        series.plot(title=view, **plot_kwargs)
        plt.tight_layout()
        plt.show()

def _on_view_change(change):
    """Redraw using the newly selected toggle value."""
    draw(change["new"])


# Render the initial view, then redraw whenever the selection changes, and
# finally show the selector together with the output area.
draw(btns.value)
btns.observe(_on_view_change, names="value")
display(btns, out)