In [5]:
# Restructure both forecast data and S2S data into annual data required by the LSTM model
# Output to 04_mergeData, including two folders:
# 01_ECMWF; 02_histdata [mean and similar]

def process_climate_data(data_new, year, T_upper, T_lower, dynamic_features):
    """Derive extreme-weather indicators and 8-day aggregates for one year of daily data.

    Parameters
    ----------
    data_new : pandas.DataFrame
        Wide table whose daily series are located by the substrings ``_Tmin``,
        ``_Tmean``, ``_Tmax`` and ``_Pre`` in the column names.  The frame passed
        in is NOT modified; a copy carrying the extra ``year`` column is used.
        NOTE(review): presumably one row per spatial unit and one column per
        day of the year -- confirm against the exported CSV layout.
    year : int
        Calendar year; the daily date axis starts on ``<year>-01-01`` and has as
        many days as there are ``_Pre`` columns.
    T_upper, T_lower : number
        Temperature thresholds forwarded unchanged to ``extreme_temperature``.
    dynamic_features : list[str]
        Feature-name fragments forwarded unchanged to ``aggre_8days``.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation, in this order, of the three frames returned
        by ``extreme_temperature``, the frame returned by ``spei`` and the frame
        returned by ``aggre_8days``.
    """
    def _daily_block(tag):
        # All columns whose name contains `tag`, as a 2-D value array.
        return data_new[[col for col in data_new.columns if tag in col]].values

    Tmin = _daily_block('_Tmin')
    Tmean = _daily_block('_Tmean')
    Tmax = _daily_block('_Tmax')
    Pre = _daily_block('_Pre')

    # The number of precipitation columns defines the length of the date axis.
    days = Pre.shape[1]
    dates = pd.date_range(start=str(year) + '-01-01', periods=days, freq='D')

    # Add the year on a copy so that the caller's DataFrame is left untouched
    # (the original wrote the column straight into the argument).
    data_new = data_new.assign(year=year)

    # Indicators computed from the daily series by the project helpers.
    spei_df = spei(dates, Pre, Tmean)
    CDD_df, HDD_df, GDD_df = extreme_temperature(dates, Tmax, Tmin, T_upper, T_lower)

    # 8-day aggregation of the dynamic features.
    data_new1 = aggre_8days(dynamic_features, dates, data_new)

    # Indicators first, aggregated features last.
    data_new1 = pd.concat([CDD_df, HDD_df, GDD_df, spei_df, data_new1], axis=1)

    return data_new1
    
def find_weeks(forecastDataList, week_dates):
    """Return ``{date: week_number}`` for every date that can be placed in a week.

    ``week_dates`` holds the lower bound of each week.  Week ``k`` (1-based)
    covers ``week_dates[k-1] <= date < week_dates[k]``; the last week is
    open-ended and takes every date ``>= week_dates[-1]``.  Dates earlier than
    the first bound are left out of the result.
    """
    last_week = len(week_dates)
    week_of_date = {}

    for date in forecastDataList:
        week_no = None

        # Look for the half-open interval [lower, upper) that contains the date.
        for idx, (lower, upper) in enumerate(zip(week_dates, week_dates[1:]), start=1):
            if lower <= date < upper:
                week_no = idx
                break

        # No bounded interval matched: the date may belong to the final, open-ended week.
        if week_no is None and date >= week_dates[-1]:
            week_no = last_week

        if week_no is not None:
            week_of_date[date] = week_no

    return week_of_date

def update_S2Sandhist_VI(data_S2S_new_all_new, VI_select2, result, years, start_point, harvest_point, outpath_S2S,ii,type):
    """Overwrite forecast-period VI columns with those of the most similar earlier year.

    For every year in ``years`` the yearly mean VI of the weeks before the
    forecast week is compared (``fastdtw`` distance) with the same weeks of each
    earlier year in ``years``.  The ``Week<n><VI_select2>`` columns from the
    forecast week up to ``harvest_point`` are then replaced by the values of the
    closest earlier year.  The result is written to ``data_<type>.csv`` inside
    ``outpath_S2S`` and returned.

    Parameters
    ----------
    data_S2S_new_all_new : pandas.DataFrame
        Table with ``Week<n><VI_select2>`` columns.  After ``reset_index()`` a
        ``year`` column must exist, i.e. ``year`` is expected in the index.
        The frame is updated in place and also returned.
    VI_select2 : str
        Column-name fragment of the vegetation index (used both to filter the
        columns and to build the ``Week<n>...`` names).
    result : mapping
        ``result[ii]`` is the first forecast week.
    years : iterable of int
        Years to process; only years strictly smaller than the current one are
        analogue candidates.
    start_point, harvest_point : int
        First and last week of the modelled period.
    outpath_S2S : str
        Output directory.
    ii : hashable
        Key into ``result``.
    type : str
        Suffix of the output file name.

    Notes
    -----
    * A year with no earlier candidate (the first year) keeps its own values.
      The original called ``min()`` on an empty dict there and raised
      ``ValueError``, so the function could not complete.
    * Values are copied positionally.  This assumes every year has the same
      number of rows in the same unit order, and that the rows of
      ``data_S2S_new_all_new`` are ordered like ``years`` -- TODO confirm with
      the caller; aligning on ``idJoin`` would be more robust.
    """
    # Keep only the selected vegetation index; bring the index levels back as columns.
    filtered_columns = [col for col in data_S2S_new_all_new.columns if VI_select2 in col]
    data_S2S_VI = data_S2S_new_all_new[filtered_columns].reset_index()

    # Week lists and yearly means do not depend on the year being processed.
    week_forecast = result[ii]
    forecast_weeklist = [f'Week{week}{VI_select2}' for week in range(week_forecast, harvest_point + 1)]
    before_weeklist = [f'Week{week}{VI_select2}' for week in range(start_point, week_forecast)]
    data_S2S_VI_before = data_S2S_VI[before_weeklist + ['year']].groupby('year').mean()

    years = list(years)
    updated_frames = []

    for year in years:
        # Independent copy: assigning into a filtered slice is unreliable.
        current_S2S_VI = data_S2S_VI[data_S2S_VI['year'] == year].copy()
        current_S2S_VI_before = data_S2S_VI_before.loc[year]

        # DTW distance to every earlier year (only previous years may be used).
        dtw_distances = {}
        for year1 in years:
            if year1 < year:
                other_S2S_VI_before = data_S2S_VI_before.loc[year1]
                distance, _path = fastdtw(current_S2S_VI_before, other_S2S_VI_before)
                dtw_distances[year1] = distance

        if dtw_distances:
            most_similar_by_dtw = min(dtw_distances, key=dtw_distances.get)
            dataVI_similaryear = data_S2S_VI[data_S2S_VI['year'] == most_similar_by_dtw]
            # Positional copy of the analogue year's forecast-week values.
            current_S2S_VI[forecast_weeklist] = dataVI_similaryear[forecast_weeklist].values
        # else: no earlier year exists, so the year's own values are kept.

        updated_frames.append(current_S2S_VI)

    if updated_frames:
        update_VI = pd.concat(updated_frames, axis=0)
        # Only the values are used, so the index of update_VI is irrelevant here.
        data_S2S_new_all_new[forecast_weeklist] = update_VI[forecast_weeklist].values

    # Save results
    output_path = os.path.join(outpath_S2S, 'data_'+type+'.csv')
    data_S2S_new_all_new.to_csv(output_path)
    print(f"Updated data saved to {output_path}")

    return data_S2S_new_all_new

In [6]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import sys
import os
root_directory = os.getcwd()[0:3]
sys.path.append(root_directory+'\\SCI\\SCI9_1\\01_code')
sys.path.append(r'C:\ProgramData\anaconda3\Lib\site-packages') 
sys.path.append(r'C:\Users\DELL\.conda\envs\myenv\Lib\site-packages') 
sys.path.append(r'C:\Users\DELL\.conda\envs\rasterio_env\Lib\site-packages') 
from functions import spei,extreme_temperature,aggre_8days,extract_dates
from sklearn.metrics import mean_absolute_percentage_error, accuracy_score, roc_auc_score, roc_curve,r2_score,mean_squared_error
from functions import calculate_rrmse1,calculate_rrmse2,calculate_acc,calculate_nrmse,calculate_mare,extract_selected_variables
from fastdtw import fastdtw


import ast
import warnings
from fastdtw import fastdtw

warnings.filterwarnings("ignore")

# ---- Feature-name groups ------------------------------------------------------
VIs = ['_KNDVI', '_EVI', '_NDVI']
Cilmate = ['_Pre', '_Tmin', '_Solar', '_Tmean', '_Tmax']
Climate_Exogenous = ['_CDD', '_HDD', '_GDD', '_VPD', '_wind_speed', '_SPEI']
soil_feature = ['SAND', 'AWC', 'SILT', 'ORG_CARBON', 'TOTAL_N', 'PH_WATER', 'CEC_SOIL', 'CLAY']
loc_feature = ['elevation', 'lat', 'lon']
Year_feature = ['year']
union_feature = ['idJoin']
dynamic_features = ['_KNDVI', '_EVI', '_NDVI', '_Pre', '_Tmin', '_Solar', '_Tmean', '_VPD', '_wind_speed', '_Tmax']

# ---- Crop and country are taken from the folder layout ------------------------
# The working directory is <...>/<crop>/<countryID>.
current_directory = os.getcwd()
print("Current working directory:", current_directory)

parent_directory, current_folder_name = os.path.split(current_directory)
print("Current folder name:", current_folder_name)

parent_folder_name = os.path.basename(parent_directory)
print("Parent folder name:", parent_folder_name)

crop = parent_folder_name
countryID = current_folder_name
# countryID looks like "<number>_<country>"; keep the part after the first underscore.
country = countryID.split('_')[1]

# ---- Region configuration -----------------------------------------------------
inputpath_base = root_directory + '\\SCI\\SCI9_1\\02_data\\' + crop + '\\' + countryID
inpath_dates_other = inputpath_base + '\\' + '01_data' + '\\' + '07_Information'

# First row of information.txt: start year, end year, shapefile name.
other_infornamtion = pd.read_csv(os.path.join(inpath_dates_other, 'information.txt'), sep=' ', header=None)
startyear = other_infornamtion.iloc[0, 0]
endyear = other_infornamtion.iloc[0, 1]
shp_name = other_infornamtion.iloc[0, 2]

Forecastyear = endyear
years = range(startyear, endyear + 1)
regions = ['I']
Forecastyears = {'I': endyear}

# Temperature thresholds (upper, lower) by crop; any other crop gets (30, 10).
T_upper, T_lower = {
    '02_Wheat': (34, 0),
    '01_Maize': (30, 8),
    '03_Rice': (35, 8),
}.get(crop, (30, 10))

# From here on the base path carries a trailing separator.
inputpath_base = inputpath_base + '\\'
institution = 'ECMWF'
ECMWF_path = os.path.join(inputpath_base, '02_S2S')

# Stripped lines of the common-year week file.
file_path = os.path.join(inputpath_base, '02_S2S', '01_dataori', 'ECMWF', 'CommonYear_Week.txt')
with open(file_path, 'r') as file:
    lines = [line.strip() for line in file]
    

Current working directory: F:\SCI\SCI9_1\01_code\02_Wheat\06_India
Current folder name: 06_India
Parent folder name: 02_Wheat


In [7]:
# Display the first year of the study period (first field of information.txt).
startyear

2001

In [8]:
'''
[20250101]
# 1. Historical data; modify its header
[20250106]
# 1. Fixed minor issues in the case of start_point < harvest_point, mainly incorrect replacement of before data
# 2. Added handling for start_point > harvest_point (i.e., cross-year scenarios)
'''


for region in regions:
    Forecastyear = Forecastyears[region]

    # Input: one GEE export per historical year.  Output: the same tables with
    # their headers re-labelled to the forecast year.
    hist_inputpath = os.path.join(inputpath_base, '01_data', '04_GEEdownloadData', '02_histdata')
    hist_outputpath = os.path.join(inputpath_base, '02_S2S', '03_outputData', '02_histdata', region)
    os.makedirs(hist_outputpath, exist_ok=True)
    pre_name = 'Wheat_' + region + '_'

    # The 1990 export, with the leading "1990" removed from its headers, fixes
    # the reference column set and order applied to every other year.
    data = pd.read_csv(os.path.join(hist_inputpath, pre_name + str(1990) + '.csv'))
    columns_sta = data.columns.str.replace(rf'^{1990}', '', regex=True)
    data.columns = columns_sta

    hist_start_year, hist_end_year = Forecastyear - 31, Forecastyear - 1
    allhist = pd.DataFrame()

    for year_hist in range(hist_start_year, hist_end_year + 1):
        data = pd.read_csv(os.path.join(hist_inputpath, pre_name + str(year_hist) + '.csv'))
        # Strip the leading year so that headers are comparable across years.
        data.columns = data.columns.str.replace(rf'^{year_hist}', '', regex=True)
        data = (
            data[columns_sta]
            # Guarantees that 'idGroup' exists so the drop below cannot fail on it.
            .assign(idGroup=lambda frame: frame['idJoin'])
            .drop(columns=['idGroup', 'iso3', '.geo', 'system:index'])
        )
        # Prefix every header with the forecast year, except the join key.
        data.columns = [name if name == 'idJoin' else str(Forecastyear) + name for name in data.columns]
        data.to_csv(os.path.join(hist_outputpath, 'hist_' + str(year_hist) + '.csv'), index=False)

In [5]:
# Print output


In [9]:
'''
[20250106]
# 1. Fixed minor issues in the case of start_point < harvest_point, mainly incorrect replacement of before data
# 2. Added handling for start_point > harvest_point (i.e., cross-year scenarios)

[20250107]
# 1. Fixed cross-year issues: For cross-year growth, the yield data from the start to week 16 should be from the previous year, not the current year
'''

'''
[20250106]
# 1. Fixed minor issues in the case of start_point < harvest_point, mainly incorrect replacement of before data
# 2. Added handling for start_point > harvest_point (i.e., cross-year scenarios)

[20250107]
# 1. Fixed cross-year issues: For cross-year growth, the yield data from the start to week 16 should be from the previous year, not the current year


[20250316]

Fixed historical data errors. Based on the revised code for Pakistan, currently the US, Pakistan, Europe, Argentina, Australia, Canada, and India have been corrected.
Dual-region countries: Russia and the US

'''


# For every region, historical year and lead week, build a model-input table for the
# forecast year in which
#   * the vegetation-index (VI) columns of the forecast weeks come from the most similar
#     earlier year (fastdtw distance on the yearly mean VI of the preceding weeks), and
#   * the other selected time features of the forecast weeks come from the historical
#     year `year_hist`.
# One CSV is written per (lead week, historical year) under 06_buildmodel/02_hist/VI_Like.
#
# Changes with respect to the previous revision:
#   * `delim_whitespace=True` (deprecated in pandas 2.2) is replaced by `sep=r'\s+'`.
#   * The previous-year climate table is de-duplicated and indexed by idJoin exactly like the
#     current-year table, so the column assignment aligns on idJoin instead of on a positional index.
#   * The previous-year table is built once per historical year instead of once per lead week.
for region in regions:
        # Read selected variables for subsequent variable filtering
        Forecastyear = Forecastyears[region]
        SelFeature_infornamtion = extract_selected_variables(inputpath_base)
        TimeFeatures_sel, Static_sel, regionID = SelFeature_infornamtion[SelFeature_infornamtion['regionID'] == region].iloc[0]
        # Actual modeling weeks
        inpath_dates = os.path.join(inputpath_base, '01_data','05_buildmodel', '02_extractdates','gs_three_periods.txt')
        gs_infornamtion = pd.read_csv(inpath_dates, sep=r'\s+', header=None)
        gs_infornamtion.columns = ['start_point', 'peak', 'harvest_point', 'VI_select2','regionID']
        # NOTE: `region` is re-bound here to the regionID field of the matching row (same value as the loop variable).
        start_point, peak, harvest_point, VI_select2, region = gs_infornamtion[gs_infornamtion['regionID'] == region].iloc[0]
        print(harvest_point)
        # Data reading and index filtering
        data_ori_all = pd.read_csv(os.path.join(inputpath_base, '01_data','05_buildmodel','01_weekdata',region+'_allweekYielddata_VIs.csv'))
        data_ori_all = data_ori_all.drop_duplicates(subset=['year', 'idJoin'],keep='last')
        Static_sel= [col for col in Static_sel if 'year.1' not in col] 
        TimeFeatures_sel_all= [col for col in data_ori_all.columns if any(feature in col for feature in TimeFeatures_sel)]
        TimeFeatures_sel_all= [col for col in TimeFeatures_sel_all if 'Previous_Yield' not in col] # Note: Previous year's yield may be filtered due to 'Pre' in precipitation, need careful verification
        filtered_columns_all = TimeFeatures_sel_all+Static_sel
        data_ori_all = data_ori_all[filtered_columns_all+['idJoin','Yield']] # Filter selected variables for subsequent analysis
    
        
        # Filter VI for subsequent identification
        filtered_columns_VI = [col for col in data_ori_all.columns if VI_select2 in col]
        data_S2S_VI = data_ori_all[filtered_columns_VI + ['year','idJoin']]
        data_S2S_VI_mean = data_S2S_VI[filtered_columns_VI + ['year']].groupby('year').mean()
        # Lead weeks are named 'leadweek_<k>'; k is parsed back below with ii[9:].
        if start_point < harvest_point: # Same-year growth
            hisWeekList = ['leadweek_'+str(week) for week in range(1,harvest_point-start_point+1)] # The set hisWeekList does not seem to include the start_point week
        else:
            # Cross-year: the season spans harvest_point-start_point+46 weeks; the second range is empty when start_point > harvest_point.
            hisWeekList = ['leadweek_'+str(week) for week in range(1,harvest_point-start_point+1+46)]+['leadweek_'+str(week) for week in range(1,harvest_point-start_point+1)]
        hist_inputpath = os.path.join(inputpath_base,'02_S2S','03_outputData','02_histdata',region)
        data_ori_current = data_ori_all[data_ori_all['year']==Forecastyear]
        hist_start_year = Forecastyear-30;hist_end_year = Forecastyear-1;
    
        for year_hist in range(hist_start_year,hist_end_year+1):
            # Process data for the current historical year
            data_his_new_ori = pd.read_csv(os.path.join(hist_inputpath,'hist_'+str(year_hist)+'.csv'))
            data_his_new_ori = data_his_new_ori.drop_duplicates(subset=['idJoin'],keep='last')
            data_his_new_ori.set_index('idJoin', inplace=True)
            data_his_new_ori['year'] = Forecastyear
            data_ori_all = data_ori_all.drop_duplicates(subset=['year', 'idJoin'],keep='last')
            data_his_new = data_his_new_ori.copy()
            data_his_new = process_climate_data(data_his_new.reset_index(), Forecastyear, T_upper, T_lower, dynamic_features)
            data_his_new = data_his_new.dropna(how='all',axis=1) # process_climate_data will introduce all vegetation indices
            # The data calculated by data_his_new has errors (missing values not handled), replace with previously interpolated modeling data
            hist_outputpath1 = os.path.join(inputpath_base,'02_S2S','05_WeekData','02_hist',region)
            os.makedirs(hist_outputpath1,exist_ok=True)
            # Re-attach idJoin as the index so later column assignments align on it.
            data_his_new.index=data_his_new_ori.index
            data_his_new.to_csv(os.path.join(hist_outputpath1,'hist_'+str(year_hist)+'.csv'))
            
            # NOTE: this table is created once per historical year and then updated cumulatively by
            # every lead week below (the replaced window only grows from one lead week to the next).
            data_his_new_update = data_ori_current.copy()
            #data_his_new = data_his_new.merge(data_ori_current[filtered_columns_VI+Static_sel+['idJoin','Yield']],on='idJoin',how='inner')# Update VI, static variables and Yield to maintain consistent data types
            data_his_new['year'] = Forecastyear 

            # Previous-year climate table; built lazily, at most once per historical year.
            data_his_new_lastyear = None

            for ii in hisWeekList:
                # data_his_new = data_his_new[filtered_columns_all+['idJoin']]
                ############################################## Fill with most similar vegetation indices, non-cross-year, start_point < harvest_point####
                if int(ii[9:])>harvest_point: # Only need to distinguish whether the forecast week (week_forecast) is cross-year
                    week_forecast = harvest_point+1-int(ii[9:])+46
                else:
                    week_forecast = harvest_point+1-int(ii[9:])

                if start_point < harvest_point:  
                    # Planting and harvesting in the same year
                    forecast_weeklist1 = range(week_forecast, harvest_point + 1)
                    V1= [f'Week{week}{VI_select2}' for week in range(1, week_forecast)]; # Vegetation indices before the forecast period
                    V2= [f'Week{week}{VI_select2}' for week in forecast_weeklist1];# Indices to be forecasted: week_forecast up to and including harvest_point
            
                    current_S2S_VI_before =data_S2S_VI_mean.loc[Forecastyear][V1]
                    dtw_distances = {}
                    for year1 in range(startyear,Forecastyear):# Candidates: startyear up to the year before Forecastyear
                        other_S2S_VI_before = data_S2S_VI_mean.loc[year1][V1]
                        distance, path = fastdtw(current_S2S_VI_before, other_S2S_VI_before)# Distance on the weeks before the forecast week
                        dtw_distances[year1] = distance
                    most_similar_by_dtw = min(dtw_distances, key=dtw_distances.get) # 
                    data_S2S_VI_forecast2 = data_S2S_VI[data_S2S_VI['year'] == most_similar_by_dtw][V2+['idJoin']]# Only data needed for modeling
                    data_his_new_update = data_his_new_update.drop(V2,axis=1) # Delete original columns corresponding to forecast dates, keep non-forecasted ones
                    data_his_new_update = data_his_new_update.merge(data_S2S_VI_forecast2,on='idJoin',how='inner')
                else:
                    ############################################## Fill with most similar vegetation indices, cross-year, start_point > harvest_point####
                    week_forecast = harvest_point+1-int(ii[9:]) # 
                    if week_forecast<=0: # Forecast period is in the previous year (calculated as negative); add 46
                        week_forecast = harvest_point+1-int(ii[9:])+46 # 
                    else:
                        week_forecast = week_forecast

                    if week_forecast<=harvest_point: # = indicates the first week of the current year
                        # Use full-year data because cross-year growth may start in week 1, requiring similar years from the previous year;
                        # Different from same-year reproductive period which usually has a sequence before the forecast
                        forecast_weeklist1 = range(week_forecast, harvest_point+1)
                        V1_1 = [f'Week{week}{VI_select2}' for week in range(1, 46+1)];
                        V1_2 = [f'Week{week}{VI_select2}' for week in range(1, week_forecast)];
                        V2 = [f'Week{week}{VI_select2}' for week in forecast_weeklist1];
                        # NOTE(review): the full-year block (V1_1) is read from Forecastyear and the partial
                        # block (V1_2) from Forecastyear-1; the header comment suggests the opposite pairing -- confirm.
                        current_S2S_VI_before =pd.concat([data_S2S_VI_mean.loc[Forecastyear][V1_1], data_S2S_VI_mean.loc[Forecastyear-1][V1_2]])
                        dtw_distances = {}

                        for year1 in range(startyear+1,Forecastyear):# Requires two years of data, will not include Forecastyear
                            other_S2S_VI_before = pd.concat([data_S2S_VI_mean.loc[year1][V1_1], data_S2S_VI_mean.loc[year1-1][V1_2]])
                            distance, path = fastdtw(current_S2S_VI_before, other_S2S_VI_before)# Distance on the weeks before the forecast week
                            dtw_distances[year1] = distance

                        most_similar_by_dtw = min(dtw_distances, key=dtw_distances.get) 

                        # Only replace modeling data for the current year

                        data_S2S_VI_forecast2 = data_S2S_VI[data_S2S_VI['year'] == most_similar_by_dtw][V2+['idJoin']]
                        data_his_new_update = data_his_new_update.drop(V2,axis=1) # Delete original columns corresponding to forecast dates, keep non-forecasted ones
                        data_his_new_update = data_his_new_update.merge(data_S2S_VI_forecast2,on='idJoin',how='inner')

                    else:  
                        # Cross-year, replacement covers list(range(week_forecast, 46+1)) + list(range(1, harvest_point + 1))

                        forecast_weeklist1 = list(range(week_forecast, 46+1))+list(range(1,harvest_point + 1))
                        
                        V1_1= [f'Week{week}{VI_select2}' for week in range(1, week_forecast)]; # Previous year
                        V2_1 =  [f'Week{week}{VI_select2}' for week in range(week_forecast, 46+1)]; # Previous year
                        V2_2 =  [f'Week{week}{VI_select2}' for week in range(1,harvest_point + 1)]; # Current year

                        current_S2S_VI_before =data_S2S_VI_mean.loc[Forecastyear-1][V1_1]
                        dtw_distances = {}
                        # NOTE(review): the distance for candidate `year1` is computed on year1-1, but the
                        # replacement values below are taken from year1 and year1+1 -- the two look shifted
                        # by one year relative to each other; confirm which pairing is intended.
                        for year1 in range(startyear+1,Forecastyear-1):# Will not include Forecastyear
                            other_S2S_VI_before = data_S2S_VI_mean.loc[year1-1][V1_1]
                            distance, path = fastdtw(current_S2S_VI_before, other_S2S_VI_before)# Distance on the weeks before the forecast week
                            dtw_distances[year1] = distance
                        most_similar_by_dtw = min(dtw_distances, key=dtw_distances.get) # Find the most similar year (e.g., 2016)
                            
                        data_S2S_VI_forecast1 = data_S2S_VI[data_S2S_VI['year'] == most_similar_by_dtw][V2_1+['idJoin']]# Only data needed for modeling
                        data_S2S_VI_forecast2 = data_S2S_VI[data_S2S_VI['year'] == most_similar_by_dtw+1][V2_2]# Only data needed for modeling
                        # Positional side-by-side join: assumes both years list the same units in the same order -- TODO confirm.
                        data_S2S_VI_forecast2 = pd.concat([data_S2S_VI_forecast1.reset_index(drop=True), data_S2S_VI_forecast2.reset_index(drop=True)], axis=1)# Horizontal concatenation
                        data_his_new_update = data_his_new_update.drop(V2_1+V2_2,axis=1) # Delete original columns corresponding to forecast dates, keep non-forecasted ones
                        data_his_new_update = data_his_new_update.merge(data_S2S_VI_forecast2,on='idJoin',how='inner')               
                data_his_new_update.set_index('idJoin', inplace=True)
                ############################################## Replace forecast weeks in original data with historical data ###################################################################
                if week_forecast<=harvest_point: # Indicates non-cross-year; only use current year data for replacement
                    update_climate = []
                    for feature in [feature for feature in TimeFeatures_sel if feature != VI_select2[1:]]: # Selected meteorological data excluding vegetation indices
                        update_climate += [f'Week{week}_{feature}' for week in forecast_weeklist1] # Note: Cross-year scenarios may exist, requiring attention
                        
                    data_his_new_update[update_climate] = data_his_new[update_climate] # Replace original data with historical data (aligned on idJoin)
                else: # Cross-year requires historical data from the previous year
                    # Read and process data from the previous year for replacement (weeks from week_forecast to 46).
                    # The table depends only on year_hist, so it is built on first use and then reused.
                    if data_his_new_lastyear is None:
                        data_his_new_ori_lastyear = pd.read_csv(os.path.join(hist_inputpath,'hist_'+str(year_hist-1)+'.csv'))
                        # Same duplicate handling as for the current-year table.
                        data_his_new_ori_lastyear = data_his_new_ori_lastyear.drop_duplicates(subset=['idJoin'],keep='last')
                        data_his_new_ori_lastyear.set_index('idJoin', inplace=True)
                        data_his_new_ori_lastyear['year'] = Forecastyear
                        data_his_new_lastyear = data_his_new_ori_lastyear.copy()
                        data_his_new_lastyear = process_climate_data(data_his_new_lastyear.reset_index(), Forecastyear, T_upper, T_lower, dynamic_features)
                        data_his_new_lastyear = data_his_new_lastyear.dropna(how='all',axis=1) # process_climate_data will introduce all vegetation indices
                        # Index by idJoin (as done for data_his_new) so the assignment below aligns on idJoin;
                        # previously the positional index was matched against idJoin labels.
                        data_his_new_lastyear.index = data_his_new_ori_lastyear.index

                    # Previous year: range(week_forecast, 46)
                    # NOTE(review): week 46 is excluded here although the VI replacement above uses
                    # range(week_forecast, 46+1).  Left unchanged because the cross-year branch of the
                    # growing-season step below re-reads weeks start_point..46 from <region>_data_ori.csv anyway.
                    update_climate1 = []
                    for feature in [feature for feature in TimeFeatures_sel if feature != VI_select2[1:]]: # Selected meteorological data excluding vegetation indices
                        update_climate1 += [f'Week{week}_{feature}' for week in list(range(week_forecast, 46))] # 
                    data_his_new_update[update_climate1] = data_his_new_lastyear[update_climate1] # Replace original data with historical data
                    
                    # Current year: 1 to harvest_point
                    update_climate2 = []
                    for feature in [feature for feature in TimeFeatures_sel if feature != VI_select2[1:]]: # Selected meteorological data excluding vegetation indices
                        update_climate2 += [f'Week{week}_{feature}' for week in list(range(1, harvest_point + 1))] # 
                    data_his_new_update[update_climate2] = data_his_new[update_climate2] # Replace original data with historical data
                ############################################## Filter variables for the growing season ############################################################################
                data_his_new_update = data_his_new_update.reset_index()
                weeks = []
                # Determine if cross-year
                if start_point < harvest_point:  # Non-cross-year
                    for feature in TimeFeatures_sel:
                        # Generate combinations of weeks and features using list comprehension
                        weeks += [f'Week{week}_{feature}' for week in range(start_point, harvest_point + 1)]
                    gs_features = weeks + Static_sel+['Yield']+['idJoin']
                    data_his_new_update = data_his_new_update[gs_features]
                else:  # Cross-year
                    # NOTE(review): this first list is discarded by `weeks = []` a few lines below, so no
                    # growing-season column filter is applied in the cross-year case -- confirm that is intended.
                    for feature in TimeFeatures_sel:
                        # Merge two ranges and generate combinations of weeks and features
                        weeks += [f'Week{week}_{feature}' for week in list(range(start_point, 47)) + list(range(1, harvest_point + 1))]
                     # For cross-year growth, the previous part of the yield should be replaced with data from the previous year (correct in _data_ori; but newly generated data is still from the current year)
                    data= pd.read_csv(os.path.join(inputpath_base, '01_data','05_buildmodel','03_modeldata',region+'_data_ori.csv'))
                    data = data.drop_duplicates(subset=['year', 'idJoin'],keep='last')
                    weeks = []
                    for feature in TimeFeatures_sel:
                        weeks += [f'Week{week}_{feature}' for week in list(range(start_point, 47))]     
                    data = data[weeks+['idJoin','year']];
                    # Weeks start_point..46 of every selected feature are replaced by the Forecastyear rows of _data_ori.
                    data_his_new_update = data_his_new_update.drop(weeks,axis=1);
                    data = data[data['year']==Forecastyear]
                    data_his_new_update = data_his_new_update.merge(data,on=['idJoin','year'],how='inner')  
                # Replace with previous year's data
                ##############################################Output ############################################################################
                hist_outputpath = os.path.join(inputpath_base,'02_S2S','06_buildmodel','02_hist','VI_Like',region,ii)
                os.makedirs(hist_outputpath,exist_ok=True)
                data_his_new_update.to_csv(os.path.join(hist_outputpath,'hist_'+str(year_hist)+'.csv'),index=False)

19


In [None]:
      

# Find similar years

In [7]:


# data_S2S_new_all_new['year'] == current_S2S_VI['year']