In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import math

import matplotlib.pyplot as plt
import numpy as np
from scripts.linear_experiments import linear_test_model, LPM_LS
from scripts.utils import score_models, plot_predictions, save_fig

plt.rcParams['figure.figsize'] = (10, 4)
%load_ext autoreload
%autoreload 2
font = {
    'weight' : 'regular',
    'family': 'serif',
    'size'   : 16
}
plt.rc('font', **font)

In [None]:
# Weather data providers for which a feature file exists; the last one is used.
LIST = ["ERA5", "METEO-FRANCE"]
WEATHER_SOURCE = LIST[-1]

In [None]:
# Feature table with a two-level column header; the first level is indexed by
# vineyard name in later cells.
features_path = f'data/features/features-{WEATHER_SOURCE}_per_vineyard.csv'
df_test = pd.read_csv(
    features_path,
    index_col=0,
    header=[0, 1],
    dtype=np.float32,
)

In [None]:
# Distinct values of the first column level, in order of first appearance.
VINEYARDS = df_test.columns.get_level_values(0).unique().tolist()

# Transform prices

In [None]:
# Feature columns used as the x, y and z axes of the 3D scatter plot.
predictors = [
    'WD: flowering - harvest',
    'DTR: véraison - harvest',
    'P: flowering',
]

In [None]:
# Build one long-format frame: for every vineyard keep the price and the
# predictors, and add the vintage (taken from the row index) and the vineyard name.
# `.assign` returns a new frame, which avoids writing into a column-subset slice
# of `df_test` (the source of a SettingWithCopyWarning).
list_dfs = []
for vineyard in VINEYARDS:
    vineyard_features = df_test.loc[:, vineyard][['0 - Price'] + predictors]
    list_dfs.append(
        vineyard_features.assign(
            Vintage=vineyard_features.index,
            Vineyard=vineyard,
        )
    )
df_extract = pd.concat(list_dfs, axis=0)

In [None]:
# Express the vintage as years since 1955 (it is used as the marker size in the
# 3D scatter). It is derived from the index, which holds the raw vintage, rather
# than from the column itself, so re-running this cell does not shift it again.
df_extract["Vintage"] = df_extract.index - 1955

In [None]:
# 3D scatter of the three predictors: colour encodes the price column and marker
# size encodes the "Vintage" column.
x_feature, y_feature, z_feature = predictors[0], predictors[1], predictors[2]
fig = px.scatter_3d(
    data_frame=df_extract,
    x=x_feature,
    y=y_feature,
    z=z_feature,
    color='0 - Price',
    size="Vintage",
    size_max=25,
    opacity=0.7,
    hover_data=[*predictors, "Vintage"],
)
fig.show()

In [None]:
# Settings shared by every `linear_test_model` run in the following cells.
first_vintage, last_vintage = 1960, 2022
target_variable = '0 - Price'
predictors = [
    'P: flowering',
    'WD: flowering - harvest',
    "DTR: véraison - harvest",
]

# Local Least Squares (LLS)

In [None]:
def kernel(n, i, j, h):
    """Constant kernel: return a weight of 1 whatever the arguments are."""
    uniform_weight = 1
    return uniform_weight


# Baseline run: the model is built with the constant `kernel` (always 1) and the
# results are labelled "Classical OLS".
# NOTE(review): how LPM_LS uses the kernel and `fixed_bandwidth` is defined in
# scripts.linear_experiments and is not visible here.
# `coeffs` is reassigned by each later `linear_test_model` call in this notebook.
model = LPM_LS(kernel, degree=0)
df_results_ols, coeffs = linear_test_model(
    VINEYARDS,
    model,
    "Classical OLS",
    df_test,
    target_variable,
    predictors,
    first_vintage,
    last_vintage,
    plot=False,
    fixed_bandwidth=1,
)

In [None]:
# Bandwidth passed to both kernel-weighted model runs.
BANDWIDTH = 0.375


def gaussian_kernel(n, i, j, h):
    """Gaussian-shaped weight: exp(-(|i - j| / (n * h)) ** 2)."""
    scaled_distance = np.abs(i - j) / (n * h)
    return np.exp(-np.square(scaled_distance))


# Same experiment as the baseline run, but with the gaussian kernel and the
# shared BANDWIDTH; results are labelled "LLS gaussian kernel".
# `coeffs` is reassigned here and again by the exponential-kernel run below.
gaussian_model = LPM_LS(gaussian_kernel, degree=0)
df_results_lpm_k1, coeffs = linear_test_model(
    VINEYARDS,
    gaussian_model,
    "LLS gaussian kernel",
    df_test,
    target_variable,
    predictors,
    first_vintage,
    last_vintage,
    plot=False,
    fixed_bandwidth=BANDWIDTH,
)


def exponential_kernel(n, i, j, h):
    """Exponentially decaying weight: exp(-|i - j| / (n * h))."""
    scaled_distance = np.abs(i - j) / (n * h)
    return np.exp(-scaled_distance)


# Same experiment with the exponential kernel; results are labelled
# "LLS exponential kernel".
# This is the last assignment of `coeffs` in the notebook, so the coefficient
# plots in later cells use the coefficients returned by this run.
exponential_model = LPM_LS(exponential_kernel, degree=0)
df_results_lpm_k1_bis, coeffs = linear_test_model(
    VINEYARDS,
    exponential_model,
    "LLS exponential kernel",
    df_test,
    target_variable,
    predictors,
    first_vintage,
    last_vintage,
    plot=False,
    fixed_bandwidth=BANDWIDTH,
)

### Flat 3d bar plot

In [None]:
def plot_training(model, color, vintage_examples):
    """Plot, for each example vintage, the weight given to every earlier vintage.

    One bar chart is drawn per entry of ``vintage_examples``, stacked vertically
    with a shared x axis. Bars run from 1960 to the example vintage. The last bar
    (the example vintage itself) is drawn short (0.05) and in red. When ``model``
    is ``"LLS"`` the other bars take the ``exponential_kernel`` weights computed
    with ``BANDWIDTH``; otherwise they all have height 1.

    The figure is saved to ``views/training/{model}.jpg`` and then shown.

    Parameters
    ----------
    model : str
        Name used in the output file name; ``"LLS"`` enables kernel weighting.
    color : matplotlib colour
        Colour of every bar except the last one.
    vintage_examples : sequence of int
        Vintages to illustrate, one subplot each.

    Raises
    ------
    ValueError
        If ``vintage_examples`` is empty.
    """
    n_examples = len(vintage_examples)
    if n_examples == 0:
        raise ValueError("vintage_examples must contain at least one vintage")

    # squeeze=False always returns a 2-D array of axes, so a single example works
    # too (by default plt.subplots returns a bare Axes that cannot be iterated).
    fig, axes = plt.subplots(
        n_examples, figsize=(10, 2.5), sharex=True, squeeze=False
    )
    for test_vintage, ax in zip(vintage_examples, axes.ravel()):
        xs = np.arange(1960, test_vintage + 1)
        n_bars = len(xs)
        if model == "LLS":
            bar_heights = [
                exponential_kernel(n_bars, i, n_bars, BANDWIDTH)
                for i in range(n_bars)
            ]
        else:
            bar_heights = [1] * n_bars
        # The last bar is the vintage being illustrated: short and red.
        bar_heights[-1] = 0.05

        cs = [color] * n_bars
        cs[-1] = "r"
        ax.bar(xs, bar_heights, color=cs, alpha=0.8)

        ax.set_ylabel("")
        ax.set_yticks([])
        for side in ("top", "right", "left"):
            ax.spines[side].set_visible(False)
        ax.set_title(
            f"Training for vintage {test_vintage}", x=1.0, y=0.3, ha="left", va="center"
        )

    # `ax` is the bottom subplot here; the x axis is shared by all subplots.
    ax.set_xlabel("Vintage")
    ax.set_xticks(np.arange(1960, 2015, 1), minor=True)
    ax.set_xticks(np.arange(1960, 2015, 5), minor=False)

    fig.savefig(
        f"views/training/{model}.jpg", dpi=300, bbox_inches="tight", transparent=True
    )
    plt.show()


# Three example vintages, drawn with the "LLS" weighting and green bars.
vintage_examples = [1981, 1994, 2013]
plot_training(model="LLS", color="g", vintage_examples=vintage_examples)

In [None]:
# Predictor coefficients of one vineyard, with a zero reference line.
trotanoy_coeffs = coeffs["Château Trotanoy"][predictors]
ax = sns.lineplot(data=trotanoy_coeffs)
ax.axhline(0, color='k')
plt.show()

# Display evolution of coeffs per region

In [None]:
# Stack the per-vineyard coefficient tables, reshape to long format (one row per
# vineyard / year / variable), then keep non-intercept rows from 2010 onwards.
coeffs_df = pd.concat(coeffs.values(), keys=coeffs.keys()).reset_index()
coeffs_df = coeffs_df.set_axis(
    ['Vineyard', 'Year', *coeffs_df.columns[2:]], axis=1
)
coeffs_df = coeffs_df.melt(id_vars=['Vineyard', 'Year'])
is_predictor = coeffs_df['variable'] != 'intercept'
is_recent = coeffs_df['Year'] >= 2010
coeffs_df = coeffs_df.loc[is_predictor & is_recent]

In [None]:
# Per-vineyard appellation: first "Appellation" value found for each "Vineyard".
prices_path = 'data/prices/prices_per_vineyard.xlsx'
prices = pd.read_excel(prices_path, index_col=0)
appellation_by_vineyard = prices.groupby(by='Vineyard')['Appellation']
appellations = appellation_by_vineyard.first()

In [None]:
# Attach the appellation of each vineyard. Any "Appellation" column left by a
# previous run of this cell is dropped first: merging twice would otherwise
# produce "Appellation_x" / "Appellation_y" and break code that reads "Appellation".
coeffs_df = coeffs_df.drop(columns='Appellation', errors='ignore').merge(
    appellations, on='Vineyard'
)

In [None]:
# Number of vineyards per appellation.
appellation_per_vineyard = coeffs_df.groupby('Vineyard')['Appellation'].first()
appellation_per_vineyard.value_counts()

In [None]:
# Coefficient values per year, one facet per variable, coloured by appellation,
# with a zero reference line. Shown inline and exported as HTML.
fig = px.box(
    data_frame=coeffs_df,
    x='Year',
    y='value',
    color='Appellation',
    facet_col='variable',
    width=1200,
)
fig.add_hline(y=0.0)
fig.show()
fig.write_html('views/coefficients.html')

# Compare vs Real Prices

In [None]:
# Stack the results of the three model runs.
model_results = [df_results_ols, df_results_lpm_k1, df_results_lpm_k1_bis]
df_results = pd.concat(model_results, axis=0)

# Price column of every vineyard, with the second column level removed so the
# columns are vineyard names only.
price_columns = (VINEYARDS, "0 - Price")
df_prices = df_test.loc[:, price_columns].droplevel(level=1, axis=1)

In [None]:
# Score every model between 1994 and 2013.
scores = score_models(df_results, df_prices, 1994, 2013)

# Median of each model's score per test variable.
models = scores.columns
median_scores = scores.reset_index().groupby("Test variable")[models].median()
display(median_scores)

# Exported as .xlsx: pandas 2.0 removed the xlwt engine, so writing the legacy
# .xls format fails. The stem differs from the full per-vineyard score export
# (lls_predictions-...xlsx) so that the two files do not overwrite each other.
median_scores.reset_index().to_excel(
    f"model_outputs/scores/lls_median_scores-{WEATHER_SOURCE}_per_vineyard.xlsx",
    index=False,
)

In [None]:
from scipy import stats

# Two-sample t-test without the equal-variance assumption, comparing the MAE
# scores of the exponential-kernel model with those of the OLS baseline.
mae_scores = scores.loc["MAE"]
lls_mae = mae_scores["LLS exponential kernel"]
ols_mae = mae_scores["Classical OLS"]
stats.ttest_ind(lls_mae, ols_mae, equal_var=False)

In [None]:
# Subset of vineyards on which the Spearman scores are summarised.
WA_rated_vineyards = [
    'Château Angélus',
    'Château Ausone',
    'Château Cheval Blanc',
    'Château Figeac',
    'Château Haut-Brion',
    "Château L'Evangile",
    'Château La Mission Haut-Brion',
    'Château Lafite Rothschild',
    'Château Léoville Las Cases',
    'Château Margaux',
    'Château Montrose',
    'Château Mouton Rothschild',
    'Château Palmer',
    'Château Pape Clément',
    'Château Pavie',
    'Château Trotanoy',
    'Château Trotte Vieille',
    'Petrus',
    'Vieux Château Certan',
    # 'Château Latour' is left out on purpose: no recent prices are available.
]

# Median Spearman score of each model, restricted to the vineyards listed above.
spearman = scores.loc["Spearman"]
is_wa_rated = spearman.index.isin(WA_rated_vineyards)
spearman = spearman[is_wa_rated]
spearman.median()

In [None]:
# Export the full score table, with its index turned into regular columns.
scores_path = f'model_outputs/scores/lls_predictions-{WEATHER_SOURCE}_per_vineyard.xlsx'
scores.reset_index().to_excel(scores_path, index=False)

# Compare recent prices

In [None]:
# First vintage kept when predicted and current prices are sliced for the
# recent-price comparison.
START_YEAR = 2009

In [None]:
# Spearman score of the gaussian-kernel model for every vineyard; no top-N
# selection is applied, so all vineyards are kept.
spearman_scores = scores.loc['Spearman']
vineyards_best_perf = spearman_scores['LLS gaussian kernel']

In [None]:
# Predictions of the gaussian-kernel model and observed prices for the selected
# vineyards, from START_YEAR onwards (vineyards as rows, years as columns).
selected_vineyards = vineyards_best_perf.index
gaussian_predictions = df_results.loc["LLS gaussian kernel"]
predicted_prices = gaussian_predictions.loc[selected_vineyards, START_YEAR:]
current_prices = df_prices.T.loc[selected_vineyards, START_YEAR:]

In [None]:
def _to_long_prices(wide_prices, source_label):
    """Melt a vineyard-by-year table to long format and tag it with its source."""
    long_prices = wide_prices.reset_index().melt(
        id_vars="Vineyard", var_name="Year", value_name="Log price"
    )
    long_prices["source"] = source_label
    return long_prices


current_prices = _to_long_prices(current_prices, "Prix Idealwine, avril 2023")
predicted_prices = _to_long_prices(predicted_prices, "Prix de long-terme prédit")
complete_df = pd.concat([current_prices, predicted_prices])

### Display true vs predicted prices

In [None]:
# One facet per vineyard, three facets per row; the figure height grows with
# the number of facet rows (200 px each).
n_facet_rows = math.ceil(complete_df["Vineyard"].nunique() / 3)
fig = px.scatter(
    data_frame=complete_df,
    x="Year",
    y="Log price",
    color="source",
    facet_col="Vineyard",
    facet_col_wrap=3,
    facet_row_spacing=0.02,
    height=n_facet_rows * 200,
    width=1200,
)
# Give every facet its own axis ranges.
fig = fig.update_xaxes(matches=None).update_yaxes(matches=None)
fig.write_html("views/compare_recent_prices.html")
fig.show()

In [None]:
# Predicted minus current log price for each (vineyard, year) pair.
pair_keys = ['Vineyard', 'Year']
predicted_by_pair = predicted_prices.groupby(pair_keys)['Log price'].first()
current_by_pair = current_prices.groupby(pair_keys)['Log price'].first()
difference = (predicted_by_pair - current_by_pair).reset_index()

In [None]:
# Drop rows with any missing value, then store the year as an integer.
difference = difference.dropna(how='any', axis=0).assign(
    Year=lambda frame: frame['Year'].astype(int)
)

In [None]:
# Distribution, per year, of the gap between predicted and current log prices.
# seaborn is already imported in the notebook's first cell, so it is not
# re-imported here; an explicit Figure/Axes pair replaces the implicit pyplot state.
fig, ax = plt.subplots(figsize=(12, 6))
ax.axhline(0, color='k')
sns.boxplot(data=difference, x='Year', y='Log price', color='grey', ax=ax)
ax.set_ylabel('Surestimation du prix de long-terme')
fig.tight_layout()
fig.savefig('views/avg_prediction_error.png')
plt.show()

In [None]:
# Attach the appellation of each vineyard to both frames. Any "Appellation"
# column left by a previous run of this cell is dropped first, so that
# re-running it does not create "Appellation_x" / "Appellation_y" columns.
complete_df = complete_df.drop(columns='Appellation', errors='ignore').merge(
    appellations, on='Vineyard'
)
difference = difference.drop(columns='Appellation', errors='ignore').merge(
    appellations, on='Vineyard'
)

In [None]:
# Current vs predicted log prices per year, one facet per appellation.
fig = px.box(
    data_frame=complete_df,
    x='Year',
    y='Log price',
    color="source",
    facet_col='Appellation',
    facet_col_wrap=2,
    width=900,
)
fig.write_html('views/price_average.html')
fig.show()

In [None]:
# Rows holding the predicted long-term prices. An explicit copy is taken so that
# column assignments on `longterm` act on an independent frame instead of a
# filtered slice of `complete_df` (which raises SettingWithCopyWarning).
longterm = complete_df.loc[complete_df['source'] == 'Prix de long-terme prédit'].copy()

In [None]:
# Make sure "Log price" is numeric. `.assign` builds a new frame rather than
# writing a column into what may be a filtered slice of another frame.
longterm = longterm.assign(**{'Log price': pd.to_numeric(longterm['Log price'])})

In [None]:
def _best_vintage(group):
    """Return "Year" and "Appellation" of the row with the highest "Log price"."""
    best_row = group['Log price'].idxmax()
    return group.loc[best_row, ['Year', 'Appellation']]


# One row per vineyard: the year with the highest predicted log price.
top_year = longterm.groupby(['Vineyard']).apply(_best_vintage).reset_index()

In [None]:
# How often each year is the best predicted year among Médoc vineyards.
is_medoc = top_year['Appellation'] == 'Médoc'
medoc_top_years = top_year.loc[is_medoc, 'Year']
medoc_top_years.value_counts()

# Plots

In [None]:
# Plot the gaussian-kernel results under the display name "Local Least Squares".
display_name = "Local Least Squares"
results_for_plot = df_results.rename(index={"LLS gaussian kernel": display_name})
plot_predictions(
    results_for_plot,
    df_prices,
    1994,
    2013,
    display_name,
    set_limits=False,
)

## Plot overall fit

In [None]:
# Both frames end up in long format, indexed by (Vintage, Vineyard).
long_index = ["Vintage", "Vineyard"]

# Predictions of the exponential-kernel model.
df_results_lls = df_results.loc["LLS exponential kernel", :].copy()
df_results_lls = df_results_lls.reset_index().melt(
    id_vars="Vineyard", var_name="Vintage", value_name="Predicted price"
)
df_results_lls = df_results_lls.groupby(long_index).first()

# Observed prices from 1980 onwards.
prices_from_1980 = df_prices.loc[1980:, :].reset_index()
df_prices_ex = prices_from_1980.melt(
    id_vars="index", var_name="Vineyard", value_name="True price"
)
df_prices_ex = df_prices_ex.rename(columns={"index": "Vintage"})
df_prices_ex = df_prices_ex.groupby(long_index).first()

In [None]:
# Apply the exponential to both frames (the plot cells label the results in €).
df_prices_ex = np.exp(df_prices_ex)
df_results_lls = np.exp(df_results_lls)

In [None]:
# Predicted and true prices side by side, restricted to vintages 1994-2013.
concat = pd.concat([df_results_lls, df_prices_ex], axis=1).reset_index()
concat = concat.set_axis(
    ["Vintage", "Vineyard", "Predicted price", "True price"], axis=1
)
in_test_window = concat["Vintage"].isin(range(1994, 2014))
concat = concat[in_test_window]

In [None]:
# Price sheet indexed by vineyard; only its "Appellation" column is needed here.
prices = pd.read_excel("data/prices/prices_04-2023.xlsx", header=0, index_col=0)
appellation = prices[["Appellation"]]

# Fail loudly if a vineyard has no appellation, as the row-wise `.loc` lookup
# this replaces did.
missing_vineyards = set(concat["Vineyard"]) - set(appellation.index)
if missing_vineyards:
    raise KeyError(f"No appellation found for: {sorted(missing_vineyards)}")

# Vectorised lookup. `.assign` builds a new frame instead of writing a column
# into the filtered `concat` slice.
concat = concat.assign(
    Appellation=concat["Vineyard"].map(appellation["Appellation"])
)

### Fit

In [None]:
# Per-(vineyard, appellation) means of the numeric columns.
group_keys = ["Vineyard", "Appellation"]
means = concat.groupby(group_keys).mean()

# Express predicted and true prices as a percentage of the vineyard's mean true
# price. `transform` broadcasts each group's mean back onto its rows, replacing
# a row-by-row `apply` with one `.loc` lookup per row.
price_columns = ["Predicted price", "True price"]
mean_true_price = concat.groupby(group_keys)["True price"].transform("mean")
concat_rescaled = concat.copy()
concat_rescaled[price_columns] = (
    concat[price_columns].mul(100).div(mean_true_price, axis=0)
)

In [None]:
# Switches for the global-fit scatter plot: colour points by vineyard (True) or
# by vintage (False), and plot rescaled (True) or raw (False) prices.
HUE_BY_VINEYARD, RESCALED = True, False

In [None]:
# `cm` stays imported for any other cell that references it; this cell no longer
# uses it because `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and
# removed in 3.9.
from matplotlib import cm

# Scatter plot of predicted vs true prices, with the identity line for reference.
if HUE_BY_VINEYARD:
    plt.figure(figsize=(5, 5))
else:
    # Slightly wider figure to leave room for the vintage colour bar.
    plt.figure(figsize=(6, 5))
plt.rcParams.update({"font.size": 16})
plt.grid(False, which="both", axis="both")
if not RESCALED:
    # Raw prices are shown on log-log axes.
    plt.yscale("log")
    plt.xscale("log")

# Discrete 27-colour "coolwarm" map; `plt.get_cmap` accepts the same
# (name, lut) arguments as the removed `cm.get_cmap`.
cmap = plt.get_cmap("coolwarm", 27)

ax = sns.scatterplot(
    data=(concat_rescaled if RESCALED else concat),
    x="True price",
    y="Predicted price",
    hue=("Vineyard" if HUE_BY_VINEYARD else "Vintage"),
    marker="s",
    edgecolor="k",
    legend=False,
    palette=(cmap if not HUE_BY_VINEYARD else "colorblind"),
    alpha=0.7,
    zorder=2,
)
if RESCALED:
    plt.ylabel("Prediction as % of average price")
    plt.xlabel("True price as % of average price")
else:
    plt.ylabel("Predicted price (€)")
    plt.xlabel("True price (€)")
if not HUE_BY_VINEYARD:
    # The legend is disabled, so a colour bar maps colours back to vintages.
    norm = plt.Normalize(concat["Vintage"].min(), concat["Vintage"].max())
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    cbar = ax.figure.colorbar(sm)
    cbar.set_ticks(range(1994, 2013, 5))
if not RESCALED:
    bottom_limit, top_limit = 30, 5000
else:
    bottom_limit, top_limit = 20, 170

# Identity line (perfect prediction), drawn beneath the points.
plt.plot([bottom_limit, top_limit], [bottom_limit, top_limit], color="k", zorder=1)
plt.ylim((bottom_limit, top_limit))
plt.xlim((bottom_limit, top_limit))

plt.gca().set_axisbelow(True)

save_fig(
    f"views/model_outputs/global_fit_per_vineyard_{RESCALED=}_{HUE_BY_VINEYARD=}",
    width_column="1.5",
)
plt.show()

### Rank fit

In [None]:
# Both frames end up in long format, indexed by (Vintage, Vineyard).
rank_index = ["Vintage", "Vineyard"]

# Rank of each predicted value within its row (1 = highest).
df_results_kls_rank = df_results.loc["LLS exponential kernel", :].rank(
    axis=1, ascending=False
)
df_results_kls_rank = df_results_kls_rank.reset_index().melt(
    id_vars="Vineyard", var_name="Vintage", value_name="Predicted price"
)
df_results_kls_rank = df_results_kls_rank.groupby(rank_index).first()

# Rank of each observed price within its column (1 = highest), from 1980 onwards.
price_ranks = df_prices.loc[1980:, :].rank(ascending=False).reset_index()
df_prices_ex_rank = price_ranks.melt(
    id_vars="index", var_name="Vineyard", value_name="True price"
)
df_prices_ex_rank = df_prices_ex_rank.rename(columns={"index": "Vintage"})
df_prices_ex_rank = df_prices_ex_rank.groupby(rank_index).first()

In [None]:
# Predicted and true ranks side by side, for vintages from 1990 onwards.
concat = pd.concat([df_results_kls_rank, df_prices_ex_rank], axis=1).reset_index()
concat = concat.set_axis(
    ["Vintage", "Vineyard", "Predicted price", "True price"], axis=1
)
from_1990 = concat["Vintage"] >= 1990
concat = concat[from_1990]

In [None]:
# Scatter plot of predicted rank vs true rank, coloured by vintage.
plt.figure(figsize=(5, 4))
# `plt.get_cmap` replaces `matplotlib.cm.get_cmap`, which was deprecated in
# Matplotlib 3.7 and removed in 3.9. It also removes this cell's dependence on
# `cm` being imported by another cell.
cmap = plt.get_cmap("coolwarm", 27)


plt.grid(False, which="both", axis="both")
ax = sns.scatterplot(
    data=concat,
    x="True price",
    y="Predicted price",
    hue="Vintage",
    marker="s",
    palette=cmap,
    legend=False,
    edgecolor="black",
)

plt.ylabel("Predicted rank")
plt.xlabel("2021 rank")
# The legend is disabled, so a colour bar maps colours back to vintages.
norm = plt.Normalize(concat["Vintage"].min(), concat["Vintage"].max())
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])

ax.figure.colorbar(sm)

plt.savefig("views/model_outputs/global_rank_fit.png", bbox_inches="tight", dpi=300)
plt.show()

---
# End of notebook