# Modeling Pipeline (Experimental)

## Environment Setup

### Imports

In [0]:
from typing import List, Dict, Tuple, Any, Union,Callable
import numpy as np
import random
from datetime import timedelta

import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
from pyspark.ml.classification import (
    LogisticRegression,
    RandomForestClassifier,
    MultilayerPerceptronClassifier
)
from xgboost.spark import SparkXGBClassifier
import mlflow
from hyperopt import hp, STATUS_OK, fmin, tpe, Trials

from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder, MinMaxScaler

### Data and Variables

In [0]:
# Variables and directories
data_BASE_DIR = "dbfs:/mnt/mids-w261/datasets_final_project_2022"
team_BASE_DIR = "dbfs:/student-groups/Group_4_1"
# Spark checkpoints are written under the team directory
spark.sparkContext.setCheckpointDir(f"{team_BASE_DIR}/checkpoints")
period = "" # one of the following values ("", "_3m", "_6m", "_1y")
k = 5 # cv folds
overlap = 0.2 # cv overlap

# Datasets
# df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined{period}_cleaned_engineered_timefeat.parquet")
# NOTE(review): every documented `period` value already starts with "_" (or is empty),
# so "joined_{period}_..." resolves to a double underscore ("joined__timefeat..." for
# period == ""). Confirm this matches the checkpoint file name on DBFS.
df = spark.read.parquet(f"{team_BASE_DIR}/interim/join_checkpoints/joined_{period}_timefeat_seasfeat_cleaned_pr_v2.parquet")

In [0]:
# Directory Inspection: list the parquet checkpoints available under the team's
# join_checkpoints folder (useful for checking the dataset file name read above)
display(dbutils.fs.ls(f"{team_BASE_DIR}/interim/join_checkpoints/"))

## STEP : Feature Selection and Preparation

In [0]:
# weather columns: every column whose name contains "origin_Hourly"
weather_cols = [col for col in df.columns if "origin_Hourly" in col]
# weather columns left out of the numeric feature list
# (presumably because they are not numeric -- TODO confirm against the schema)
remove_me = ["origin_HourlyPresentWeatherType","origin_HourlySkyConditions","origin_HourlyWindDirection"]
num_weather_cols = [c for c in weather_cols if c not in remove_me]

# seasonality columns
# These generic names only exist after the fold-specific columns (e.g. "daily_0",
# "daily_full") have been renamed; see time_series_cv_folds and the full-data prep cell.
seasonality_cols = ["daily","weekly","yearly","holidays"]

# time columns
time_cols = ["mean_dep_delay","prop_delayed"]

# date related columns
date_cols = ["YEAR","MONTH","DAY_OF_MONTH","DAY_OF_WEEK"]

# flight metadata
flight_metadata_cols = ["OP_UNIQUE_CARRIER","ORIGIN_ICAO","DEST_ICAO"]

# prior & current flight cols
num_flight_cols = ['turnaround_time_calc', 
                   'priorflight_depdelay_calc',
                   'DISTANCE',
                   'CRS_ELAPSED_TIME',
                ]

bool_flight_cols = ['priorflight_isdeparted', 
                    'priorflight_isarrived_calc',
                    'priorflight_isdelayed_calc',
                    'priorflight_cancelled_true']

# graph columns
# "pagerank" is produced by renaming the fold-specific "train_{i}" column
# (or "test" for the full-data run) later in this notebook.
graph_cols = ["pagerank"]

# fields that will not be features but need to be kept for processing
keep_me = ["outcome","sched_depart_utc"]

########## Define columns to be used as numeric and categorical features in the pipeline ##########
numeric_cols = [*num_weather_cols, *seasonality_cols, *time_cols, *num_flight_cols, *graph_cols]
categorical_cols = [*date_cols, *flight_metadata_cols, *bool_flight_cols]

# full feature list (numeric first, then categorical)
features = numeric_cols + categorical_cols

# name of the target column
label = "outcome"

## STEP : Time-series CV split

In [0]:
def train_test_split_timeseries(df, time_col: str, test_start: str, test_stop: str="2100-01-01", verbose: bool = True):
    """
    Split a PySpark DataFrame into train and test sets at a fixed date.

    Rows with `time_col` on or after `test_stop` are discarded first. Of the
    remaining rows, those strictly before `test_start` form the training set
    and those on or after `test_start` form the test set.

    Args:
        df (DataFrame): Input PySpark DataFrame.
        time_col (str): Timestamp column name (must be sortable).
        test_start (str): First date (inclusive) of the test set.
        test_stop (str): Exclusive upper bound on `time_col`; rows at or after
            this date are dropped from both sets.
        verbose (bool): Print boundaries and sizes. This triggers Spark jobs
            (min/max aggregation and two counts).

    Returns:
        (train_df, test_df): Tuple of train and test DataFrames.
    """
    # Drop everything at or after the stop date
    df = df.filter(F.col(time_col) < test_stop)

    train_df = df.filter(F.col(time_col) < test_start)
    test_df = df.filter(F.col(time_col) >= test_start)

    if verbose:
        # The date range is only needed for reporting, so it is computed here
        # instead of on every call.
        min_time, max_time = df.selectExpr(f"min({time_col})", f"max({time_col})").first()
        if min_time is None:
            # min/max come back as NULL when no row survives the stop filter
            print(f"⚠️ No rows found before {test_stop}; train and test sets are empty")
        else:
            total_days = (max_time - min_time).days
            print(f"📅 Total date range: {min_time.date()} → {max_time.date()} ({total_days} days)")
            print(f"✅ Train: {min_time.date()} → {test_start} ({train_df.count():,} rows)")
            print(f"🧪 Test: {test_start} → {max_time.date()} ({test_df.count():,} rows)")

    return train_df, test_df


In [0]:
def downsample(train_df,verbose=False):
    """
    Downsample the majority class of `train_df` so both classes are roughly balanced.

    Rows are split on the `outcome` column (1 = delayed, 0 = not delayed). The
    larger class is randomly sampled without replacement (seed 42) with a
    fraction equal to minority_count / majority_count; the smaller class is
    kept in full. Rows whose `outcome` is neither 0 nor 1 are not returned.

    Args:
        train_df (DataFrame): Training data containing an `outcome` column.
        verbose (bool): Print the class counts found before sampling.

    Returns:
        DataFrame: delayed rows followed by non-delayed rows after sampling.
    """
    train_delay = train_df.filter(F.col("outcome") == 1)
    train_non_delay = train_df.filter(F.col("outcome") == 0)

    delay_count = train_delay.count()
    non_delay_count = train_non_delay.count()

    if verbose:
        print(f"Class counts before downsampling: {delay_count:,} delayed / {non_delay_count:,} not delayed")

    def _shrink(majority_df, minority_count, majority_count):
        # The fraction stays within [0, 1] because the minority count never exceeds
        # the majority count; the guard avoids dividing by zero when both are empty.
        keep_percent = minority_count / majority_count if majority_count else 0.0
        return majority_df.sample(withReplacement=False, fraction=keep_percent, seed=42)

    if delay_count <= non_delay_count:
        train_non_delay = _shrink(train_non_delay, delay_count, non_delay_count)
    else:
        # Delayed flights are the majority: trim that side instead of asking Spark
        # for a sampling fraction above 1.
        train_delay = _shrink(train_delay, non_delay_count, delay_count)

    return train_delay.union(train_non_delay)


In [0]:
def _prepare_fold_frame(fold_df, fold_index, keep_me):
    """Rename fold-specific feature columns to their generic names, zero-fill them, and select `keep_me`."""
    renames = {
        f"daily_{fold_index}": "daily",
        f"weekly_{fold_index}": "weekly",
        f"yearly_{fold_index}": "yearly",
        f"holidays_{fold_index}": "holidays",
        f"train_{fold_index}": "pagerank",
    }
    for old_name, new_name in renames.items():
        fold_df = fold_df.withColumnRenamed(old_name, new_name)

    fold_df = fold_df.fillna({col: 0 for col in
        ['daily','weekly','yearly','holidays','mean_dep_delay','prop_delayed']})

    if keep_me is not None:
        fold_df = fold_df.select(*keep_me)
    return fold_df


def time_series_cv_folds(
    df,
    time_col: str,
    k: int=3,
    blocking: bool=False,
    overlap: float=0.0,
    keep_me: list=None, # defines variables to keep; None keeps every column
    verbose: bool=False
):
    """
    Split a time-series PySpark DataFrame into k train/validation folds with optional overlap and blocking.

    For fold i the training rows are class-balanced with `downsample`, the
    fold-specific columns `daily_{i}`, `weekly_{i}`, `yearly_{i}`, `holidays_{i}`
    and `train_{i}` are renamed to `daily`, `weekly`, `yearly`, `holidays` and
    `pagerank`, and nulls in the seasonality/time features are filled with 0.

    Args:
        df (DataFrame): PySpark DataFrame with a timestamp column.
        time_col (str): Name of the timestamp column.
        k (int): Number of folds.
        blocking (bool): If True, the start of the training window advances with
            every fold (sliding window). If False, every training window starts
            at the earliest date (expanding window).
        overlap (float): Fraction of overlap between validation windows (e.g. 0.2 = 20% overlap).
        keep_me (list): Columns to keep in every returned DataFrame, using the
            generic names listed above. None keeps all columns.
        verbose (bool): Whether to print the time splits (triggers row counts).

    Returns:
        List of (train_df, val_df) tuples. Fewer than k folds are returned when
        a validation window would start at or after the latest date; the list
        is empty when `df` has no rows.
    """
    # Get time boundaries
    min_date, max_date = df.select(F.min(time_col), F.max(time_col)).first()
    if min_date is None:
        # Empty input: there is no date range to split
        return []
    n_days = (max_date - min_date).days + 1

    # Adjust chunk sizing so k overlapping validation windows plus the first
    # training chunk span the whole date range
    total_width = k + 1 - overlap * (k - 1)
    chunk_size = int(np.ceil(n_days / total_width))

    if verbose:
        print(f"Splitting data into {k} folds with {overlap*100:.0f}% overlap")
        print(f"Min date: {min_date}, Max date: {max_date}")
        print(f"{chunk_size:,} days per fold")
        print("************************************************************")

    # NOTE(review): the window start advances by ceil(...) but the window end by
    # floor(...), so with a fractional stride a blocked training window shrinks by
    # one day per fold. Left unchanged because the fold-specific feature columns
    # were presumably computed upstream against these exact boundaries -- confirm.
    start_step = int(np.ceil((1-overlap)*chunk_size))
    end_step = int(np.floor((1-overlap)*chunk_size))

    folds = []
    train_start_offset = 0
    train_end_offset = chunk_size
    for i in range(k):
        # define indices based on chunk size and overlap
        if i > 0:
            if blocking:
                train_start_offset += start_step
            train_end_offset += end_step
        val_start_offset = train_end_offset
        val_end_offset = val_start_offset + chunk_size

        # Compute actual timestamps
        train_start = min_date + timedelta(days=train_start_offset)
        train_end = min_date + timedelta(days=train_end_offset)
        val_start = min_date + timedelta(days=val_start_offset)
        val_end = min_date + timedelta(days=val_end_offset)

        if val_start >= max_date:
            break
        if val_end > max_date:
            # Clamp the last window so it still includes the latest timestamp
            val_end = max_date + timedelta(days=1)

        # Apply filters; only the training side is class-balanced
        train_df = downsample(
            df.filter((F.col(time_col) >= train_start) & (F.col(time_col) < train_end)))
        val_df = df.filter((F.col(time_col) >= val_start) & (F.col(time_col) < val_end))

        # handle fold-specific variables
        train_df = _prepare_fold_frame(train_df, i, keep_me)
        val_df = _prepare_fold_frame(val_df, i, keep_me)

        if verbose:
            print(f"Fold {i + 1}:")
            print(f"  TRAIN: {train_start.date()} → {train_end.date()} ({train_df.count():,} rows)")
            print(f"  VAL:   {val_start.date()} → {val_end.date()} ({val_df.count():,} rows)")
            print("------------------------------------------------------------")

        folds.append((train_df, val_df))

    return folds

In [0]:
def cv_eval(preds, metric):
    """
    Score a transformed predictions DataFrame.

    Args:
        preds (DataFrame): Output of a fitted pipeline's `transform`, containing
            `prediction`, `probability` and `outcome` columns.
        metric (str): "F2" returns the F-beta score (beta=2) for label 1.0,
            rounded to 4 decimals. Any other value returns the area under the
            precision-recall curve.

    Returns:
        The requested score.
    """
    # Only the requested metric is built, so the unused one costs no RDD
    # conversion or Spark job.
    if metric == "F2":
        rdd_preds_m = preds.select(['prediction', 'outcome']).rdd
        metrics_m = MulticlassMetrics(rdd_preds_m)
        return np.round(metrics_m.fMeasure(label=1.0, beta=2.0), 4)

    # (score, label) pairs; the score is the second entry of the probability vector
    rdd_preds_b = preds.select('outcome','probability').rdd.map(
        lambda row: (float(row['probability'][1]), float(row['outcome'])))
    metrics_b = BinaryClassificationMetrics(rdd_preds_b)
    return metrics_b.areaUnderPR

In [0]:
def model_tuner(
    model,
    stages,
    folds: List[Tuple[DataFrame, DataFrame]],
    mlflow_run_name: str = "/Users/m.bakr@berkeley.edu/flight_delay_tuning",
    metric: str = "F2",
    verbose: bool = True
) -> Tuple[str, float, list]:
    """
    Evaluate one PySpark classifier with time-series cross-validation.

    A pipeline made of `stages` followed by `model` is fitted on each fold's
    training set and scored on its validation set with `cv_eval`. Per-fold
    scores and their mean are logged to MLflow.

    Args:
        model: Unfitted estimator appended as the last pipeline stage.
        stages (list): Feature-preparation stages placed before the model.
        folds (List of (train_df, val_df)): Time-aware CV folds.
        mlflow_run_name (str): Currently unused; kept so existing callers keep working.
        metric (str): Metric name passed to `cv_eval` ("F2" or anything else for PR AUC).
        verbose (bool): Whether to print scores during tuning.

    Returns:
        Tuple ("", avg_score, []): the first and last items are placeholders
        (model name and parameters); `avg_score` is the mean score across folds.
    """
    pipeline = Pipeline(stages=stages + [model])

    scores = []

    for i, (train_df, val_df) in enumerate(folds):
        fitted_model = pipeline.fit(train_df)
        preds = fitted_model.transform(val_df)
        score = cv_eval(preds, metric)
        scores.append(score)

        if verbose:
            print(f"[Fold {i+1}] {metric} Score: {score:.4f}")

        mlflow.log_metric(f"{metric}_fold_{i+1}", score)

    avg_score = float(np.mean(scores))
    # The key must be an f-string; the literal "avg_{metric}_score" was logged before.
    mlflow.log_metric(f"avg_{metric}_score", avg_score)

    if verbose:
        print(f"✅ Average {metric} Score: {avg_score:.4f}")

    return "", avg_score, []

In [0]:


def make_hyperopt_objective(
    model_name: str,
    folds: List[Tuple[DataFrame, DataFrame]],
    stages: List,
    param_space_converter: Callable[[Dict[str, Any]], Dict[str, Any]],
    mlflow_experiment_name: str = "Hyperopt_Universal_Tuning",
    verbose: bool = True
) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
    """
    Creates a Hyperopt-compatible objective function for any PySpark classifier.

    Each call of the returned objective builds a fresh estimator from the
    sampled parameters, evaluates it with `model_tuner` on the CV folds, and
    reports the negated average score as the loss.

    Args:
        model_name (str): One of 'logreg', 'rf', 'mlp', 'xgb'.
        folds (List of (train_df, val_df)): Time-series CV folds.
        stages (List): Feature-preparation stages placed before the model.
        param_space_converter (Callable): Converts Hyperopt sample into model params.
        mlflow_experiment_name (str): Currently unused; kept for backward compatibility.
        verbose (bool): Logging toggle.

    Returns:
        Callable that can be passed as fn to hyperopt.fmin()

    Raises:
        ValueError: If `model_name` is not one of the supported names.
    """
    # Estimators that take the pyspark.ml-style featuresCol/labelCol keywords
    spark_ml_models = {
        "logreg": LogisticRegression,
        "rf": RandomForestClassifier,
        "mlp": MultilayerPerceptronClassifier,
    }
    if model_name != "xgb" and model_name not in spark_ml_models:
        raise ValueError(f"Unsupported model: {model_name}")

    def build_model(model_params: Dict[str, Any]):
        # "features_final" is the output column of the scaler built by make_stages;
        # `label` is the notebook-level target column name.
        if model_name == "xgb":
            # SparkXGBClassifier uses snake_case column keywords
            return SparkXGBClassifier(
                features_col="features_final",
                label_col=label,
                **model_params
            )
        return spark_ml_models[model_name](
            featuresCol="features_final",
            labelCol=label,
            **model_params
        )

    def objective(sampled_params: Dict[str, Any]) -> Dict[str, Any]:
        # Convert sampled param space to Spark-friendly params
        model_params = param_space_converter(sampled_params)

        # model_tuner takes an estimator instance, not a name and a param dict
        _, avg_score, _ = model_tuner(
            model=build_model(model_params),
            stages=stages,
            folds=folds,
            mlflow_run_name=f"hyperopt_{model_name}",
            verbose=verbose
        )

        if verbose:
            print(f"[hyperopt] {model_name} params={model_params} avg score={avg_score:.4f}")

        return {
            "loss": -avg_score,  # Hyperopt minimizes, so negate the score
            "status": STATUS_OK,
            "params": model_params
        }

    return objective


## Data Prep

### Get Train/Test and Data for Each CV Fold

In [0]:
# Hold out calendar year 2019 as the test set: rows before 2019-01-01 go to train,
# rows from 2019-01-01 up to (not including) 2020-01-01 go to test, later rows are dropped.
train_df, test_df = train_test_split_timeseries(
    df=df,
    time_col="sched_depart_utc",
    test_start="2019-01-01",
    test_stop="2020-01-01",
    verbose=True
)

In [0]:
# Columns kept in every modeling DataFrame: bookkeeping fields plus all features
filter_cols = [*keep_me, *numeric_cols, *categorical_cols]

# Build the time-series CV folds from the training period only
# (k folds, `overlap` fraction between validation windows)
folds = time_series_cv_folds(
    train_df,
    time_col="sched_depart_utc",
    k=k,
    overlap=overlap,
    blocking=True,
    keep_me=filter_cols,
    verbose=True
)

In [0]:
# Class-balance the full training period and report the resulting row count
train_df_samp = downsample(train_df)
train_df_samp.count()

In [0]:
# Columns fitted on the full training period take over the generic feature names
full_period_renames = {
    "daily_full": "daily",
    "weekly_full": "weekly",
    "yearly_full": "yearly",
    "holidays_full": "holidays",
    "test": "pagerank",
}
zero_filled_cols = ['daily','weekly','yearly','holidays','mean_dep_delay','prop_delayed']


def _prepare_full_period(sdf):
    """Rename the full-period columns, zero-fill missing values, keep the modeling columns, and cache."""
    for old_name, new_name in full_period_renames.items():
        sdf = sdf.withColumnRenamed(old_name, new_name)
    sdf = sdf.fillna({name: 0 for name in zero_filled_cols})
    return sdf.select(*filter_cols).cache()


train_df_samp = _prepare_full_period(train_df_samp)
test_df = _prepare_full_period(test_df)

## Experiments

### Define Modeling Pipeline Stages

In [0]:
def make_stages(categorical_cols, numeric_cols):
    """
    Build the feature-preparation stages shared by every model.

    Each categorical column is string-indexed and one-hot encoded (unseen or
    invalid values are kept in an extra bucket). The numeric columns and the
    encoded vectors are then assembled into `features` and min-max scaled into
    `features_final`.

    Args:
        categorical_cols (list): Names of categorical input columns.
        numeric_cols (list): Names of numeric input columns.

    Returns:
        list: Pipeline stages in order: (indexer, encoder) per categorical
        column, then the assembler, then the scaler.
    """
    def encode_column(column):
        # Indexer and encoder pair for a single categorical column
        index_name = column + "_index"
        return [
            StringIndexer(inputCol=column, outputCol=index_name, handleInvalid="keep"),
            OneHotEncoder(inputCol=index_name, outputCol=column + "_vec", handleInvalid="keep"),
        ]

    encoding_stages = [
        stage
        for column in categorical_cols
        for stage in encode_column(column)
    ]

    # Numeric columns first, then the one-hot vectors
    assembler_inputs = numeric_cols + [column + "_vec" for column in categorical_cols]

    # handleInvalid="skip": rows the assembler considers invalid are dropped, not failed on
    assembler = VectorAssembler(
        inputCols=assembler_inputs,
        outputCol="features",
        handleInvalid="skip"
    )
    scaler = MinMaxScaler(inputCol="features", outputCol="features_final")

    return encoding_stages + [assembler, scaler]

In [0]:
def run_feature_experiment(categorical_cols,numeric_cols,run_name):
    """
    Run one feature-ablation experiment with a fixed random forest.

    The model is first cross-validated on the notebook-level `folds` (MLflow run
    `{run_name}_cv`), then refitted on `train_df_samp` and scored on the
    held-out `test_df` (MLflow run `{run_name}_full`), where the fitted
    pipeline is also logged.

    Args:
        categorical_cols (list): Categorical feature columns to include.
        numeric_cols (list): Numeric feature columns to include.
        run_name (str): Prefix for the two MLflow run names.

    Returns:
        (fitted_model, score, confusion_matrix): the pipeline fitted on the full
        training data, its held-out F2 score, and the confusion matrix as a
        NumPy array.
    """
    print("Defining model")
    # define RF model with tuned parameters
    model = RandomForestClassifier(
        featuresCol="features_final",
        labelCol=label,
        featureSubsetStrategy="auto",
        maxDepth=10,
        numTrees=20
        )

    print("Defining pipeline stages")
    stages = make_stages(categorical_cols,numeric_cols)
    pipeline = Pipeline(stages=stages + [model])
    metric = "F2"

    print("FITTING ON CV FOLDS")

    with mlflow.start_run(run_name=f"{run_name}_cv"):
        _ , score, _ = model_tuner(
            model=model,
            folds=folds,
            stages=stages,
            mlflow_run_name="/Users/m.bakr@berkeley.edu/flight_delay_tuning_EIL_sandbox",
            metric=metric,  # keep CV and held-out evaluation on the same metric
            verbose=True
        )
        print(f"{metric} Score:", score)

    print("FITTING ON FULL TRAINING DATA")

    with mlflow.start_run(run_name=f"{run_name}_full"):
        fitted_model = pipeline.fit(train_df_samp)
        preds = fitted_model.transform(test_df)
        score = cv_eval(preds, metric)

        print(f"[Held out test] {metric} Score: {score:.4f}")

        mlflow.log_metric(f"{metric}_held_out_test", score)

        mlflow.log_param("model", "rf")
        # The key must be an f-string; the literal "{metric}_score" was logged before.
        mlflow.log_metric(f"{metric}_score", score)

        # Log the model
        mlflow.spark.log_model(fitted_model, "rf_model")

        # Extract prediction and label columns
        prediction_and_labels = preds.select("prediction", label).rdd.map(lambda r: (r[0], r[1]))

        # Compute confusion matrix
        metrics = MulticlassMetrics(prediction_and_labels)
        confusion_matrix = metrics.confusionMatrix().toArray()

        print("Confusion Matrix:")
        print(confusion_matrix)

        return fitted_model, score, confusion_matrix

# Experiments

In [0]:
# # NORMAL CASE
# num_cols = [*num_weather_cols, *seasonality_cols, *time_cols, *num_flight_cols, *graph_cols]
# cat_cols = [*date_cols, *flight_metadata_cols, *bool_flight_cols]

## Experiment: No weather

In [0]:
# Ablation: every feature family except weather
num_cols = seasonality_cols + time_cols + num_flight_cols + graph_cols
cat_cols = date_cols + flight_metadata_cols + bool_flight_cols

run_feature_experiment(cat_cols, num_cols, "rf_no_weather")

## Experiment: No seasonality

In [0]:
# Ablation: every feature family except seasonality
num_cols = num_weather_cols + time_cols + num_flight_cols + graph_cols
cat_cols = date_cols + flight_metadata_cols + bool_flight_cols

run_feature_experiment(cat_cols, num_cols, "rf_no_seas")

## Experiment: No time

In [0]:
# Ablation: every feature family except the time features
num_cols = num_weather_cols + seasonality_cols + num_flight_cols + graph_cols
cat_cols = date_cols + flight_metadata_cols + bool_flight_cols

run_feature_experiment(cat_cols, num_cols, "rf_no_time")

## Experiment: No priorflight

In [0]:
# Ablation: drop both the numeric and the boolean prior/current-flight columns
num_cols = num_weather_cols + seasonality_cols + time_cols + graph_cols
cat_cols = date_cols + flight_metadata_cols

run_feature_experiment(cat_cols, num_cols, "rf_no_priorflight")

## Experiment: No graph

In [0]:
# Ablation: every feature family except the graph (pagerank) feature
num_cols = num_weather_cols + seasonality_cols + time_cols + num_flight_cols
cat_cols = date_cols + flight_metadata_cols + bool_flight_cols

run_feature_experiment(cat_cols, num_cols, "rf_no_graph")

## Experiment: No date

In [0]:
# Ablation: every feature family except the calendar date columns
num_cols = num_weather_cols + seasonality_cols + time_cols + num_flight_cols + graph_cols
cat_cols = flight_metadata_cols + bool_flight_cols

run_feature_experiment(cat_cols, num_cols, "rf_no_date")

## Experiment: No metadata

In [0]:
# Ablation: every feature family except flight metadata (carrier, origin, destination)
num_cols = [*num_weather_cols, *seasonality_cols, *time_cols, *num_flight_cols, *graph_cols]
cat_cols = [*date_cols, *bool_flight_cols]

# The run name was copy-pasted as "rf_no_seas", which collided with the
# no-seasonality experiment in MLflow.
run_feature_experiment(cat_cols,num_cols,"rf_no_metadata")

## Feature Importance

In [0]:
# Full feature set: all numeric and categorical families
num_cols = [*num_weather_cols, *seasonality_cols, *time_cols, *num_flight_cols, *graph_cols]
cat_cols = [*date_cols, *flight_metadata_cols, *bool_flight_cols]

# XGBoost model whose feature importances are inspected in the cells below
model = SparkXGBClassifier(
    features_col="features_final",
    label_col=label,
    max_depth=10,
    num_workers=5
)

# Use the lists defined in this cell instead of the notebook-level
# categorical_cols/numeric_cols, so editing them here actually takes effect.
stages = make_stages(cat_cols,num_cols)
pipeline = Pipeline(stages=stages + [model])
metric = "F2"  # not used in this cell; kept so the notebook-level name is still defined

fitted_model = pipeline.fit(train_df_samp)

In [0]:
import pandas as pd

# The fitted pipeline ends with [..., assembler, scaler, model] (see make_stages),
# so stages[-3] is the VectorAssembler and stages[-1] the fitted XGBoost model.
va = fitted_model.stages[-3]
tree = fitted_model.stages[-1]
# NOTE(review): zip pairs assembler input column names with importance keys purely by
# position and stops at the shorter sequence. The one-hot "_vec" inputs presumably
# expand to several vector slots each, so a name may not correspond to the feature id
# it is paired with -- verify before trusting the 'name' column.
mappings = list(zip(va.getInputCols(), tree.get_feature_importances().keys()))
# One two-column frame (feature id, value) per importance type
avg_gain = pd.DataFrame(list(tree.get_feature_importances('gain').items()), columns=['id', 'avg_gain'])
weight = pd.DataFrame(list(tree.get_feature_importances('weight').items()), columns=['id', 'weight'])
avg_cover = pd.DataFrame(list(tree.get_feature_importances('cover').items()), columns=['id', 'avg_cover'])

# Inner-join the three importance types on feature id, then attach the mapped names
feature_importance_df = avg_gain.merge(weight, on='id').merge(avg_cover, on='id')
mappings_df = pd.DataFrame(mappings, columns=['name', 'id'])
feature_importance_df = feature_importance_df.merge(mappings_df, on='id')

In [0]:
# Display the merged importance table
feature_importance_df

In [0]:
# Store the row position of the merged table as a column; get_family keys on it
feature_importance_df['idx'] = feature_importance_df.index

In [0]:
# Display the table with the new 'idx' column
feature_importance_df

In [0]:
def get_family(index):
    """
    Map a row index of the feature-importance table to its feature family.

    The index ranges are hard-coded; any index not covered by a rule maps to
    'unknown'.

    Args:
        index: Numeric row index.

    Returns:
        str: One of 'weather', 'seasonality', 'time', 'curr_prior_flight',
        'graph', 'date', 'metadata' or 'unknown'.
    """
    flight_rows = (15, 16, 17, 18, 27, 28, 29, 30)

    # Rules are checked top to bottom; the first match wins
    ordered_rules = (
        ('weather', lambda i: 0 <= i <= 8),
        ('seasonality', lambda i: 9 <= i <= 12),
        ('time', lambda i: 13 <= i <= 14),
        ('curr_prior_flight', lambda i: i in flight_rows),
        ('graph', lambda i: i == 19),
        ('date', lambda i: 20 <= i <= 23),
        ('metadata', lambda i: 24 <= i <= 26),
    )

    for family, matches in ordered_rules:
        if matches(index):
            return family
    return 'unknown'

# Label every row with its feature family based on the row index stored in 'idx'
feature_importance_df['family'] = feature_importance_df['idx'].apply(get_family)

In [0]:
# Display the table with the new 'family' column
feature_importance_df

In [0]:
# Sum each importance type per feature family (the result is indexed by family)
family_totals = feature_importance_df.groupby('family')[['weight', 'avg_gain', 'avg_cover']].sum()

# Turn the totals into shares of each column's sum, largest weight share first
DATA = (family_totals / family_totals.sum()).sort_values(by='weight', ascending=False)
DATA

In [0]:
from matplotlib import pyplot as plt

# Share of total split count ("weight") per feature family
data = DATA.weight

fig, ax = plt.subplots(1, 1, figsize=(10, 2))
for position, family in enumerate(data.index):
    bar_options = {'edgecolor': 'black', 'label': family.replace("_", " ")}
    if position > 0:
        # Stack this segment to the right of the families already drawn
        bar_options['left'] = np.sum(data[data.index[:position]])
    ax.barh([''], data[family], **bar_options)

ax.set_xlabel('Proportion')
ax.set_title('Feature Importance by Feature Family: Weight')
ax.set_xlim((0, 1))
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=len(data))
plt.show()

In [0]:
from matplotlib import pyplot as plt

# Share of total average gain per feature family
data = DATA.avg_gain

fig, ax = plt.subplots(1, 1, figsize=(10, 2))
for position, family in enumerate(data.index):
    bar_options = {'edgecolor': 'black', 'label': family.replace("_", " ")}
    if position > 0:
        # Stack this segment to the right of the families already drawn
        bar_options['left'] = np.sum(data[data.index[:position]])
    ax.barh([''], data[family], **bar_options)

ax.set_xlabel('Proportion')
ax.set_title('Feature Importance by Feature Family: Average Gain')
ax.set_xlim((0, 1))
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=len(data))
plt.show()

In [0]:
from matplotlib import pyplot as plt

# Share of total average cover per feature family
data = DATA.avg_cover

fig, ax = plt.subplots(1, 1, figsize=(10, 2))
for position, family in enumerate(data.index):
    bar_options = {'edgecolor': 'black', 'label': family.replace("_", " ")}
    if position > 0:
        # Stack this segment to the right of the families already drawn
        bar_options['left'] = np.sum(data[data.index[:position]])
    ax.barh([''], data[family], **bar_options)

ax.set_xlabel('Proportion')
ax.set_title('Feature Importance by Feature Family: Average Cover')
ax.set_xlim((0, 1))
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=len(data))
plt.show()