# Test set

## Library imports

In [None]:
import pandas as pd
import os
import sys
import pickle
import csv

## Path setup

In [None]:
# Notebook kernels do not define __file__, so the kernel's working directory
# stands in for the notebook location (the original obtained the same value by
# resolving the literal string '__file__' against the working directory).
notebook_directory = os.path.abspath(os.getcwd())

# The framework root is the parent of the notebook directory.
framework_directory = os.path.abspath(os.path.join(notebook_directory, '..'))

# Make the framework importable; the guard keeps sys.path free of duplicate
# entries when this cell is executed more than once.
if framework_directory not in sys.path:
    sys.path.append(framework_directory)

print(framework_directory)

## Species data creation

In [None]:
# Columns read from each NPMS export.
columns_occurrences = ['sample_id', 'preferred_taxon', 'domin']
columns_abundance = ['domin', 'midPoint']

occurrences_path = os.path.join(framework_directory, 'Datasets/NPMS/occurrences.csv')
abundance_path = os.path.join(framework_directory, 'Datasets/NPMS/domin_scores.csv')

# Lookup table: one "Cover" value per "Domin scale" score.
npms_abundance = (
    pd.read_csv(abundance_path, usecols=columns_abundance)
    .rename(columns={"domin": "Domin scale", "midPoint": "Cover"})
)

# One row per (plot, taxon) with its cover expressed as "Cover %" (= Cover * 100).
npms_occurrences = (
    pd.read_csv(occurrences_path, usecols=columns_occurrences)
    .rename(columns={"sample_id": "PlotObservationID", "preferred_taxon": "Matched concept", "domin": "Domin scale"})
    .merge(npms_abundance, how='left', on='Domin scale')  # "Domin scale" is the only shared column
    .assign(**{'Cover %': lambda frame: frame['Cover'] * 100})
    .drop(columns=['Domin scale', 'Cover'])
    .dropna()  # Discard rows with any missing value
    .reset_index(drop=True)
)

# Independent copy kept aside for the EUNIS-ESy export built later on.
npms_occurrences_esy = npms_occurrences.copy()

npms_occurrences

## GBIF Normalization

In [None]:
# Imported here rather than at the top because the module only becomes
# importable once framework_directory has been appended to sys.path.
from data.preprocess_data import add_gbif_normalization


def _load_pickle(relative_path):
    """Unpickle and return the object stored at `relative_path` under framework_directory."""
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(os.path.join(framework_directory, relative_path), 'rb') as f:
        return pickle.load(f)


le_species = _load_pickle('Data/le_species.pkl')
eva_to_gbif_species = _load_pickle('Data/eva_to_gbif_species.pkl')

# Number of occurrence rows per plot before and after the normalization.
original_value_counts = npms_occurrences['PlotObservationID'].value_counts()
npms_occurrences, _ = add_gbif_normalization(npms_occurrences, eva_to_gbif_species)
new_value_counts = npms_occurrences['PlotObservationID'].value_counts()

# A plot is discarded when it kept fewer than 75% of its original rows.
# Plots absent from the original counts align to NaN and are never flagged.
counts_before = original_value_counts.reindex(new_value_counts.index)
lost_too_many = new_value_counts < counts_before * 0.75
rows_to_remove = new_value_counts.index[lost_too_many.to_numpy()].tolist()

is_discarded = npms_occurrences['PlotObservationID'].isin(rows_to_remove)
npms_occurrences = npms_occurrences[~is_discarded]

# Keep rows whose plot has at least one row in the post-normalization counts.
has_rows = npms_occurrences['PlotObservationID'].map(new_value_counts) >= 1
npms_occurrences = npms_occurrences[has_rows]

npms_occurrences

## Header data creation

In [None]:
columns_attributes = ['sample_id', 'caption', 'text_value']
columns_localisation = ['id', 'LATITUDE', 'LONGITUDE']

npms_attributes = pd.read_csv(os.path.join(framework_directory, 'Datasets/NPMS/sample_attributes.csv'), usecols=columns_attributes)
npms_localisation = pd.read_csv(os.path.join(framework_directory, 'Datasets/NPMS/sample_info.csv'), usecols=columns_localisation)
npms_habitat_lookup = pd.read_csv(os.path.join(framework_directory, 'Datasets/NPMS/npms_habitat_lookup.csv'))

npms_attributes = npms_attributes.rename(columns={"sample_id": "PlotObservationID", "text_value": "Habitat"})
npms_localisation = npms_localisation.rename(columns={"id": "PlotObservationID", "LONGITUDE": "Longitude", "LATITUDE": "Latitude"})
npms_habitat_lookup = npms_habitat_lookup.rename(columns={"NPMS_broad_habitat": "Broad habitat", "NPMS_fine-scale_habitat": "Fine-scale habitat"})

npms_attributes = npms_attributes[npms_attributes['caption'] == 'NPMS Habitat']  # Keep rows with habitat
npms_attributes = npms_attributes.merge(npms_localisation, how='left', on='PlotObservationID')  # Add location information
npms_attributes = npms_attributes[['PlotObservationID', 'Longitude', 'Latitude', 'Habitat']]  # Change columns order

grouped_habitats = npms_habitat_lookup.groupby("Broad habitat")["Fine-scale habitat"].nunique()  # Count distinct fine-scale habitats per broad habitat
filtered_habitats = grouped_habitats[grouped_habitats == 1].index  # Broad habitats that contain exactly one fine-scale habitat
npms_habitat_lookup = npms_habitat_lookup[npms_habitat_lookup["Broad habitat"].isin(filtered_habitats)]  # Keep only those broad habitats in the lookup

# Replace each such broad habitat by its single fine-scale habitat.
# A one-to-one mapping is used instead of merging the whole lookup table:
# a merge would duplicate plot rows whenever a broad habitat appears on more
# than one lookup row, and would carry any additional lookup column into the
# header, where its missing values would make dropna() discard unmatched plots.
broad_to_fine = (
    npms_habitat_lookup[["Broad habitat", "Fine-scale habitat"]]
    .dropna()
    .drop_duplicates(subset="Broad habitat")
    .set_index("Broad habitat")["Fine-scale habitat"]
)
npms_attributes = npms_attributes.assign(
    Habitat=npms_attributes["Habitat"].map(broad_to_fine).fillna(npms_attributes["Habitat"])
)

npms_attributes = npms_attributes[npms_attributes['Habitat'] != 'Not in scheme']  # Remove rows when the habitat is not in the scheme
npms_attributes = npms_attributes.dropna()  # Remove rows with missing information
npms_attributes = npms_attributes.reset_index(drop=True)  # Reset the DataFrame index

npms_attributes

## EUNIS code conversion

In [None]:
columns_codes = ['EUNIS 2020 code', 'EUNIS 2007 code']

codes = pd.read_excel(os.path.join(framework_directory, 'Datasets/eunis_habitats.xlsx'), usecols=columns_codes)

# Build the mapping "EUNIS 2007 code" -> list of "EUNIS 2020 code" values.
# A 2007 cell may hold several codes separated by ';'; each one is mapped to
# the 2020 code found on the same row. Cells whose text is 'nan' are skipped.
code_dict = {}
for raw_2007, raw_2020 in zip(codes['EUNIS 2007 code'], codes['EUNIS 2020 code']):
    for code_2007 in (part.strip() for part in str(raw_2007).split(';')):
        if code_2007 != 'nan':
            code_dict.setdefault(code_2007, []).append(str(raw_2020))

# Show the resulting mapping.
for code_2007, code_2020 in code_dict.items():
    print(code_2007, "->", code_2020)

## Header data cleaning

In [None]:
columns_habitats = ['NPMS', 'EUNIS']
npms_habitats = pd.read_csv(os.path.join(framework_directory, 'Datasets/NPMS/NPMS_EUNIS.csv'), usecols=columns_habitats)

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(os.path.join(framework_directory, 'Data/le_header.pkl'), 'rb') as f:
    le_header = pickle.load(f)

# Loop-invariant views of the EUNIS 2020 code list used by the helpers below.
eunis_2020_codes = codes['EUNIS 2020 code']
known_2020_codes = eunis_2020_codes.values
has_length_3 = eunis_2020_codes.str.len() == 3
has_length_5 = eunis_2020_codes.str.len() == 5


def _convert_to_eunis_2020(cell):
    """Translate a ', '-separated string of codes through code_dict.

    Every code found in code_dict is replaced by all the codes it maps to;
    any other code becomes the placeholder '~'.
    """
    converted = []
    for old_code in str(cell).split(', '):
        converted.extend(code_dict.get(old_code.strip(), ['~']))
    return ', '.join(converted)


def _expand_to_label_level(cell):
    """Bring the listed EUNIS 2020 codes to a fixed length.

    The target length is 5 for codes starting with 'MA2' and 3 otherwise.
    Codes absent from the EUNIS 2020 list, and codes already at the target
    length, are kept as they are. Shorter codes are replaced by every listed
    code of the target length that starts with them.
    """
    expanded = []
    for code in cell.split(', '):
        if code not in known_2020_codes:
            expanded.append(code)
            continue
        is_ma2 = code.startswith('MA2')
        target_length = 5 if is_ma2 else 3
        has_target_length = has_length_5 if is_ma2 else has_length_3
        if len(code) == target_length:
            expanded.append(code)
        elif 0 < len(code) < target_length:
            expanded += eunis_2020_codes[eunis_2020_codes.str.startswith(code) & has_target_length].tolist()
        # NOTE(review): listed codes longer than the target length contribute
        # nothing (they are dropped silently) - confirm this is intended.
    return ', '.join(expanded)


def _noise_percentage(cell):
    """Percentage of labels that are '~' or not among le_header.classes_."""
    labels = cell.split(', ')
    noisy = sum(label == '~' or label not in le_header.classes_ for label in labels)
    return noisy / len(labels) * 100


def _clean_labels(cell):
    """Deduplicate the labels, keep those in le_header.classes_, and sort them."""
    kept = [code for code in set(cell.split(', ')) if code != '~' and code in le_header.classes_]
    return ', '.join(sorted(kept))


# Keep the NPMS habitat name in its own column and turn "Habitat" into "EUNIS".
npms_attributes['NPMS'] = npms_attributes['Habitat'].copy()
npms_attributes = npms_attributes.rename(columns={"Habitat": "EUNIS"})

# Keep the plots whose habitat is listed in the NPMS/EUNIS table, then look up its codes.
npms_to_eunis = npms_habitats.set_index('NPMS')['EUNIS']
npms_attributes = npms_attributes[npms_attributes['EUNIS'].isin(npms_habitats['NPMS'].values)]
npms_attributes.loc[:, 'EUNIS'] = npms_attributes['EUNIS'].map(npms_to_eunis)

# Convert the codes through code_dict, then normalise their length.
npms_attributes.loc[:, 'EUNIS'] = npms_attributes['EUNIS'].map(_convert_to_eunis_2020)
npms_attributes.loc[:, 'EUNIS'] = npms_attributes['EUNIS'].map(_expand_to_label_level)

# Discard plots where more than 50% of the labels are unusable.
noise_percentage = npms_attributes['EUNIS'].apply(_noise_percentage)
npms_attributes = npms_attributes[noise_percentage <= 50]

# Keep only the usable labels, without duplicates, as a sorted ', '-separated string.
npms_attributes.loc[:, 'EUNIS'] = npms_attributes['EUNIS'].apply(_clean_labels)

# Keep the plots that still have species data.
npms_attributes = npms_attributes[npms_attributes['PlotObservationID'].isin(npms_occurrences['PlotObservationID'])]
npms_attributes = npms_attributes[npms_attributes['EUNIS'].str.count(',') < 5]  # Fewer than 5 commas, i.e. at most 5 EUNIS habitats per plot
npms_attributes = npms_attributes.dropna().reset_index(drop=True)
npms_attributes

## Species data cleaning

In [None]:
# Keep only the occurrences whose plot survived the header cleaning.
has_header = npms_occurrences['PlotObservationID'].isin(npms_attributes['PlotObservationID'])
npms_occurrences = npms_occurrences.loc[has_header].reset_index(drop=True)
npms_occurrences

## EUNIS-ESy test set

In [None]:
# Restrict the copy taken before GBIF matching to the plots kept in the final test set.
in_test_set = npms_occurrences_esy['PlotObservationID'].isin(npms_occurrences['PlotObservationID'])
npms_occurrences_esy = npms_occurrences_esy[in_test_set]

# Called with None in place of the species mapping; what that does is defined
# in data.preprocess_data and is not visible from this notebook.
npms_occurrences_esy, _ = add_gbif_normalization(npms_occurrences_esy, None)

# Column names used by the ESy export.
npms_occurrences_esy = npms_occurrences_esy.rename(columns={'PlotObservationID': 'RELEVE_NR', 'Matched concept': 'TaxonName', 'Cover %': 'Cover_Perc'})

# Round before casting: astype(int) truncates toward zero, so a float product
# such as 0.29 * 100 == 28.999999999999996 would otherwise be stored as 28.
# Exact halves now round to the nearest even integer instead of being truncated.
npms_occurrences_esy['Cover_Perc'] = npms_occurrences_esy['Cover_Perc'].round().astype(int)

npms_occurrences_esy

## Target values retrieval

In [None]:
# Target labels: one ', '-separated string of EUNIS codes per plot.
y = npms_attributes['EUNIS'].values

# The header keeps neither the labels nor the NPMS habitat names.
npms_attributes = npms_attributes.drop(columns=['EUNIS', 'NPMS'])

y

## Data saving

In [None]:
header_path = os.path.join(framework_directory, 'Datasets/test_header.csv')
species_path = os.path.join(framework_directory, 'Datasets/test_species.csv')
species_esy_path = os.path.join(framework_directory, 'Experiments/ESy/data/test_species_esy.csv')
targets_path = os.path.join(framework_directory, 'Data/test_values.txt')

# Header and species tables are written tab-separated, without the index.
npms_attributes.to_csv(header_path, index=False, sep='\t')
npms_occurrences.to_csv(species_path, index=False, sep='\t')

# The ESy table is comma-separated, with every non-numeric field quoted.
npms_occurrences_esy.to_csv(species_esy_path, index=False, sep=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

# Target values: one per line.
with open(targets_path, 'w') as file:
    file.writelines(f"{value}\n" for value in y)