In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import statsmodels.api as sm
from scripts.utils import save_fig

# Default figure size (width, height in inches) for the plots of this notebook.
plt.rc("figure", figsize=(12, 4))

In [None]:
# Last vintage (inclusive) kept when the sources are sliced and merged below;
# also used as the right-hand limit of the final plot.
LAST_YEAR = 2022

In [None]:
# Source 1: load the phenology records (indexed by the first spreadsheet
# column, used below as the vintage year) and give the French column
# headers English labels.
pheno_1 = pd.read_excel("data/pheno/pheno_TL.xlsx", index_col=0).rename(
    columns={
        "Pousse de la vigne": "Budburst - Source 1",
        "Première fleur": "Beginning flowering - Source 1",
        "Pleine fleur": "End flowering - Source 1",
        "Premiers grains changés": "Beginning véraison - Source 1",
        "Début vendanges (en Médoc)": "Beginning of the first harvests - Source 1",
        "Pleines vendanges": "Beginning of all harvests - Source 1",
        "Fin vendanges rouges": "End harvest - Source 1",
    }
)
# Only the first seven columns are kept.
pheno_1 = pheno_1.iloc[:, :7]
cols = list(pheno_1.columns)
years = list(pheno_1.index)
# Same shape as pheno_1, but will hold the day of year of every event.
pheno_1_plot = pheno_1.copy()

for col in cols:
    # Unparseable cells become NaT instead of raising.
    pheno_1[col] = pd.to_datetime(pheno_1[col], errors="coerce")
    # Force the year of every date to the vintage given by the row label.
    # NOTE(review): presumably the spreadsheet stores day/month with an
    # arbitrary year - confirm against the file.
    for year in years:
        try:
            pheno_1.loc[year, col] = (
                pheno_1.loc[year, col].replace(year=year).normalize()
            )
        except (AttributeError, TypeError, ValueError):
            # Expected failures only: missing dates, row labels that are not
            # valid integer years, or dates that do not exist in the target
            # year (29 February). The parsed value is left untouched.
            # Anything else (including KeyboardInterrupt) now propagates
            # instead of being silently swallowed by a bare ``except``.
            continue
    pheno_1_plot[col] = pheno_1[col].dt.dayofyear

In [None]:
# Château Latour: load the phenology records (indexed by the first
# spreadsheet column, used below as the vintage year) and give the columns
# English labels.
pheno_2 = pd.read_excel("data/pheno/pheno_latour.xlsx", index_col=0).rename(
    columns={
        "debourrement": "Budburst - Château Latour",
        "half-veraison": "Half-véraison - Château Latour",
        "half-vendanges": "Half-harvest - Château Latour",
        "begin vendanges": "Beginning harvest - Château Latour",
        "end vendanges": "End harvest - Château Latour",
    }
)
# Single half-flowering date: mid-point between the "M" and "CS" records.
# NOTE(review): M / CS presumably stand for Merlot / Cabernet Sauvignon - confirm.
pheno_2["Half-flowering - Château Latour"] = (
    pheno_2["half-floraison M"]
    + (pheno_2["half-floraison CS"] - pheno_2["half-floraison M"]) / 2
)
pheno_2 = pheno_2.drop(["half-floraison M", "half-floraison CS"], axis=1)
# Same shape as pheno_2, but will hold the day of year of every event.
pheno_2_plot = pheno_2.copy()
for col in pheno_2.columns:
    # Unparseable cells become NaT instead of raising.
    pheno_2[col] = pd.to_datetime(pheno_2[col], errors="coerce")
    # Force the year of every date to the vintage given by the row label.
    # The loop walks this frame's own index rather than a slice of the
    # Source 1 years, so every row is processed and the cell no longer
    # depends on another cell's variable or on a positional offset.
    for year in pheno_2.index:
        try:
            pheno_2.loc[year, col] = (
                pheno_2.loc[year, col].replace(year=year).normalize()
            )
        except (AttributeError, TypeError, ValueError):
            # Expected failures only: missing dates, row labels that are not
            # valid integer years, or dates that do not exist in the target
            # year (29 February). The parsed value is left untouched.
            continue
    pheno_2_plot[col] = pheno_2[col].dt.dayofyear

In [None]:
# Columns retained from the INRAE export, in display order.
kept_columns = [
    "date",
    "year",
    "dayofyear",
    "nom_site",
    "echelle_phenologique",
    "description_evenement_phenologique_majeur",
    "variete",
    "code_stade_phenologique",
    "description_stade_phenologique",
    "degats_gel_de_printemps",
]

# Load the INRAE observations, derive year / day of year from the parsed
# date, then keep only the Merlot and Cabernet-Sauvignon rows recorded at the
# "INRA Bordeaux - Villenave d'Ornon" site.
pheno_inrae = (
    pd.read_csv(
        "data/pheno/pheno_INRAE/phenoclim_agroclim_inrae.csv",
        sep=";",
        encoding="latin1",
    )
    .drop(columns="source_donnees")
    .assign(
        date=lambda frame: pd.to_datetime(frame["date"]),
        dayofyear=lambda frame: frame["date"].dt.dayofyear,
        year=lambda frame: frame["date"].dt.year,
    )
    .loc[:, kept_columns]
    .loc[
        lambda frame: frame["nom_site"].isin(["INRA Bordeaux - Villenave d'Ornon"])
        & frame["variete"].isin(["Merlot", "Cabernet-Sauvignon"])
    ]
    .drop(columns="nom_site")
)

In [None]:
# Collapse the INRAE observations to one mean day of year per
# (year, major phenological event), averaging over all remaining rows.
event_keys = ["year", "description_evenement_phenologique_majeur"]
pheno_inrae = (
    pheno_inrae.groupby(event_keys)
    .agg(dayofyear=("dayofyear", "mean"))
    .reset_index()
)

In [None]:
# Overlay the three raw sources (Source 1, INRAE, Château Latour) as day of
# year per vintage to compare them visually. One explicit Axes is used and
# the legend / layout are set once, after everything has been drawn.
fig, ax = plt.subplots(figsize=(12, 6))

sns.lineplot(data=pheno_1_plot, palette="colorblind", dashes=False, ax=ax)

sns.lineplot(
    data=pheno_inrae,
    x="year",
    y="dayofyear",
    hue="description_evenement_phenologique_majeur",
    palette=["green", "orange", "brown"],
    ax=ax,
)

palette = sns.color_palette("viridis", 6)
sns.lineplot(data=pheno_2_plot, palette=palette, ax=ax)

ax.set_ylabel("Day of year")
ax.set_xlabel("Vintage")
# Reverse the y-axis direction so that smaller day-of-year values are no
# longer at the bottom of the chart.
ax.invert_yaxis()
# Legend placed outside the plotting area, to the right.
ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
fig.tight_layout()
fig.savefig("views/pheno_both_sources.png")
plt.show()

From the above chart, several remarks:

- Budburst (débourrement) dates seem uncertain. The TL débourrement should always come first, but it does not. Therefore, the budburst date will be averaged between the two sources.
- "mi-floraison" in Source 2 seems to correspond to "pleine fleur" in Source 1. The latest date will be chosen for the "end floraison" parameter
- Missing values in beginning and half véraison will be obtained by subtracting from (or adding to) the available value the mean difference between the two series.
- The earliest and latest records of any harvest (vendanges) event will be kept for the beginning and the end of harvest, respectively.

In [None]:
# Restrict both raw sources to the study period, 1950 to LAST_YEAR
# (label-based slice, both ends inclusive).
pheno_1, pheno_2 = (
    source.loc[1950:LAST_YEAR] for source in (pheno_1, pheno_2)
)

In [None]:
# Build the merged phenology table: one row per vintage from 1950 to
# LAST_YEAR and one datetime column per phenological stage.
year_range = range(1950, LAST_YEAR + 1)
pheno_df = pd.DataFrame(index=year_range)

# Vintages for which the two sources are compared date by date: the 12th row
# of the table onwards, i.e. 1961 and later.
# NOTE(review): presumably the first vintage covered by the Château Latour
# records - confirm.
merge_years = pheno_df.index[11:]


def _pick_known_date(first, second, years, latest):
    """Combine two date series into the earliest or latest known date per year.

    Parameters
    ----------
    first, second : pandas.Series of datetimes indexed by vintage.
    years : index of the vintages to return.
    latest : bool, True to keep the later date, False to keep the earlier one.

    Returns
    -------
    pandas.Series indexed by ``years``. When only one of the two dates is
    known that date is returned; the result is NaT only when both are missing
    (or the vintage is absent from both series).
    """
    first, second = first.align(second)
    # Comparisons involving NaT are always False, so a missing ``second`` is
    # handled explicitly; a missing ``first`` falls through to ``second``.
    if latest:
        keep_first = (first >= second) | second.isna()
    else:
        keep_first = (first <= second) | second.isna()
    return first.where(keep_first, second).reindex(years)


### Budburst
# Mid-point between the two sources when both are known
pheno_df["Budburst"] = (
    pheno_1["Budburst - Source 1"]
    + (pheno_2["Budburst - Château Latour"] - pheno_1["Budburst - Source 1"]) / 2
)
# Fill missing values with either only source 1 or only source 2
pheno_df["Budburst"] = (
    pheno_df["Budburst"]
    .fillna(pheno_2["Budburst - Château Latour"])
    .fillna(pheno_1["Budburst - Source 1"])
    .dt.normalize()
)


### Floraison
pheno_df["Beginning flowering"] = pheno_1["Beginning flowering - Source 1"]

# For the end: get the latest known date between end floraison and half-floraison
pheno_df["End flowering"] = _pick_known_date(
    pheno_1["End flowering - Source 1"],
    pheno_2["Half-flowering - Château Latour"],
    merge_years,
    latest=True,
)
# Vintages outside merge_years: Source 1 first, then Château Latour
pheno_df["End flowering"] = (
    pheno_df["End flowering"]
    .fillna(pheno_1["End flowering - Source 1"])
    .fillna(pheno_2["Half-flowering - Château Latour"])
    .dt.normalize()
)

# Missing beginnings are estimated from the end of flowering, shifted by the
# mean (beginning - end) offset observed on the vintages where both are known.
# Normalised to midnight like every other stage column.
mean_flowering_offset = (
    pheno_df["Beginning flowering"] - pheno_df["End flowering"]
).mean()
pheno_df["Beginning flowering"] = (
    pheno_df["Beginning flowering"]
    .fillna(pheno_df["End flowering"] + mean_flowering_offset)
    .dt.normalize()
)

### Veraison
# Mean delay between the beginning (Source 1) and the half-way point
# (Château Latour) of véraison, used to estimate whichever one is missing.
average_veraison_delay = (
    pheno_2["Half-véraison - Château Latour"] - pheno_1["Beginning véraison - Source 1"]
).mean()
pheno_df["Beginning véraison"] = (
    pheno_1["Beginning véraison - Source 1"]
    .fillna(pheno_2["Half-véraison - Château Latour"] - average_veraison_delay)
    .dt.normalize()
)
pheno_df["Half véraison"] = pheno_2["Half-véraison - Château Latour"]
pheno_df["Half véraison"] = (
    pheno_df["Half véraison"]
    .fillna(pheno_1["Beginning véraison - Source 1"] + average_veraison_delay)
    .dt.normalize()
)


### Vendanges
# For the begin: get the earliest known date between the two sources.
# A missing Source 1 date no longer hides a known Château Latour date
# (``min(NaT, date)`` evaluates to NaT).
pheno_df["Beginning harvest"] = _pick_known_date(
    pheno_1["Beginning of the first harvests - Source 1"],
    pheno_2["Beginning harvest - Château Latour"],
    merge_years,
    latest=False,
)
# Remaining gaps (including the vintages outside merge_years) fall back on
# the beginning of all harvests from Source 1.
pheno_df["Beginning harvest"] = (
    pheno_df["Beginning harvest"]
    .fillna(pheno_1["Beginning of all harvests - Source 1"])
    .dt.normalize()
)

# For the end: get the latest known date between the two sources
pheno_df["End harvest"] = _pick_known_date(
    pheno_1["End harvest - Source 1"],
    pheno_2["End harvest - Château Latour"],
    merge_years,
    latest=True,
)
# Vintages outside merge_years: Source 1 first, then Château Latour
pheno_df["End harvest"] = (
    pheno_df["End harvest"]
    .fillna(pheno_1["End harvest - Source 1"])
    .fillna(pheno_2["End harvest - Château Latour"])
    .dt.normalize()
)

In [None]:
# Day of year of every merged stage date. Only the datetime columns are
# converted, so the cell can be re-run after integer columns have been added
# to pheno_df (``.dt`` would raise on them).
pheno_df_plot = pheno_df.select_dtypes(include="datetime").apply(
    lambda stage_dates: stage_dates.dt.dayofyear
)

# Font settings; plt.rc changes the global defaults, so they also apply to
# any figure drawn afterwards.
font = {'weight' : 'regular',
        'family': "serif",
        'size'   : 16}
plt.rc('font', **font)

plt.figure(figsize=(10,5))
plt.grid(True, which='both', axis='both', alpha=0.5)

# "Beginning véraison" is left out of the chart; one marker per remaining column.
sns.lineplot(
    data=pheno_df_plot.drop("Beginning véraison", axis=1).rename(columns={"Beginning flowering": "First flowers"}),
    markers=["s", "o", "s", "s", "o", "s"],
    dashes=False,
    palette="colorblind",
    markersize=6,
    linewidth=1.5,
    markeredgecolor="k",
)

plt.xlim(1947, LAST_YEAR)
plt.xlabel("Year")
plt.ylabel("Day of year")

# Legend outside the axes, entries listed in reverse column order.
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(
    reversed(handles),
    reversed(labels),
    bbox_to_anchor=(1,0.99),
    loc='upper left',
    frameon=False,
    labelspacing=2,
    handletextpad=0.8,
    fontsize=15,
    handlelength=0,
)

save_fig('views/pheno_full', "single")
plt.show()

In [None]:
# Length in whole days of the periods between stages.
# Each entry maps the new column name to its (start stage, end stage) columns.
interval_bounds = {
    'Budburst - flowering interval': ('Budburst', 'Beginning flowering'),
    'Flowering - véraison interval': ('End flowering', 'Half véraison'),
    'Véraison - harvest interval': ('Half véraison', 'Beginning harvest'),
    'Growing season duration': ('Budburst', 'Beginning harvest'),
}
for interval_name, (start_stage, end_stage) in interval_bounds.items():
    elapsed = pheno_df[end_stage] - pheno_df[start_stage]
    pheno_df[interval_name] = elapsed.dt.days

display(pheno_df.tail())

# Persist the merged table (stage dates and intervals).
pheno_df.to_excel('data/pheno/generated_pheno.xlsx')

# Display average dates

In [None]:
# Stage date columns of the merged table, in chronological order.
stage_columns = [
    "Budburst",
    "Beginning flowering",
    "End flowering",
    "Beginning véraison",
    "Half véraison",
    "Beginning harvest",
    "End harvest",
]
# Day of year of every stage date. The conversion is applied column by column
# (the default axis) so each call works on one datetime Series, instead of
# rebuilding a Series for every row.
dayofyears = pheno_df[stage_columns].apply(
    lambda stage_dates: stage_dates.dt.dayofyear
)
dayofyears.describe()

In [None]:
# Mean day of year of every stage over 1960-LAST_YEAR, shown as a calendar
# date in a reference non-leap year (2001). Day of year is 1-based
# (1 = 1 January), so 1 is subtracted before the value is added as a number
# of days to 1 January; without it every date comes out one day late.
pd.to_datetime(
    dayofyears.loc[1960:LAST_YEAR].mean() - 1,
    unit='D',
    origin=datetime.datetime(year=2001, month=1, day=1),
)

In [None]:
# Ordinary least squares fit of each stage's day of year against the vintage
# (intercept + slope); the full statsmodels summary is printed per stage.
design_matrix = sm.add_constant(dayofyears.index.values.reshape(-1,1))
for stage_name in dayofyears.columns:
    trend_fit = sm.OLS(dayofyears[stage_name], design_matrix).fit()
    print(stage_name.upper(), trend_fit.summary(), "\n\n\n\n")

---
# End of notebook