In [2]:
from pathlib import Path

import pandas as pd

In [None]:
df_taxonomia = pd.read_excel('../Datos EIDOS/EIDOS_taxonomia.xlsx')


def _lowered_name(name):
    """Build the simplified, lower-case form of a taxon name used for matching.

    The first character is lower-cased and kept; of the remaining
    whitespace-separated words only those that contain no upper-case letter
    and are not purely numeric are kept, joined by single spaces.

    Parameters
    ----------
    name : str or missing value
        Taxon name with parentheses and '&' already removed.

    Returns
    -------
    str
        The simplified name, or '' when `name` is missing or empty, so that
        such rows can never match an observation instead of raising.
    """
    if not isinstance(name, str) or not name:
        return ''
    kept_words = [
        word for word in name[1:].split()
        if word == word.lower() and not word.isdigit()
    ]
    return name[0].lower() + ' '.join(kept_words)


# Strip parentheses and '&', then drop the words containing upper-case
# letters and the purely numeric words (see _lowered_name).
df_taxonomia['lowered_name'] = (
    df_taxonomia['name']
    .str.replace(r'[()&]', '', regex=True)
    .apply(_lowered_name)
)

In [None]:
def process_file(namefile, taxonomy=None):
    """Annotate an observations CSV with the best-matching taxon id.

    Reads `namefile`, looks up every distinct `scientificName` in the taxonomy
    table, stores the result in a new `idtaxon` column and writes the frame to
    a file called ``processed_<original file name>`` located next to the input
    file.

    Parameters
    ----------
    namefile : str or path-like
        Path of the CSV to process. It must have a `scientificName` column.
    taxonomy : pandas.DataFrame, optional
        Table with the columns `lowered_name` and `taxonid`. Defaults to the
        notebook-level `df_taxonomia`.

    Returns
    -------
    None
        The result is only written to disk.
    """
    if taxonomy is None:
        taxonomy = df_taxonomia
    rank_markers = ('f.', 'subsp.', 'var.')
    # Hoisted: the same column is scanned once per distinct species.
    taxon_names = taxonomy['lowered_name']
    df = pd.read_csv(namefile)

    def find_best_match(species):
        """Return the `taxonid` of the best candidate for `species`, or None."""
        if not isinstance(species, str):
            return None  # missing scientific name
        row_words = species.replace('(', '').replace(')', '').lower().split()
        if len(row_words) < 2:
            return None  # not enough words to search with

        # Candidates are the taxa whose name contains "<genus> <species>".
        # The text is matched literally (regex=False) so characters such as
        # '.' in the observation are not interpreted as a pattern, and rows
        # without a name are treated as non-matching (na=False).
        binomial = ' '.join(row_words[:2])
        candidates = taxonomy[taxon_names.str.contains(binomial, regex=False, na=False)]
        if candidates.empty:
            return None

        if len(row_words) > 3 and row_words[2] in rank_markers:
            # Infraspecific observation: the epithet must appear as a whole
            # word after the first one, including as the last word of the name.
            epithet = row_words[3]
            keep = candidates['lowered_name'].str.split().apply(
                lambda words: epithet in words[1:]
            )
        else:
            # Plain species: discard candidates that carry a rank marker.
            keep = ~candidates['lowered_name'].str.contains(
                r'\b(?: f\. | subsp\. | var\. )\b', regex=True
            )
        # A positional mask avoids any index alignment on the filtered frame.
        mask = keep.to_numpy(dtype=bool)
        if not mask.any():
            return None
        kept = candidates[mask]

        row_words_set = set(row_words)

        def score(candidate_name):
            # 90%: share of the observation's words present in the candidate;
            # 10%: share of the candidate's words present in the observation.
            candidate_words = candidate_name.split()
            shared = len(set(candidate_words) & row_words_set)
            return 0.9 * shared / len(row_words) + 0.1 * shared / len(candidate_words)

        # Scores live in their own Series instead of being written into a
        # slice of the taxonomy; argmax picks the first of any tied maxima.
        scores = kept['lowered_name'].apply(score)
        return kept['taxonid'].iloc[scores.to_numpy().argmax()]

    # Resolve each distinct name once; missing names are skipped here and end
    # up without an id through the `.get` lookup below.
    unique_species = df["scientificName"].dropna().unique()
    species_id = {species: find_best_match(species) for species in unique_species}
    df['idtaxon'] = df["scientificName"].apply(lambda name: species_id.get(name))

    # Prefix only the file name so inputs given with a directory still work.
    source = Path(namefile)
    df.to_csv(source.with_name(f"processed_{source.name}"), index=False)

In [None]:
# Add the `idtaxon` column to the first observations file.
process_file(namefile="observations_con_cuadricula_1.csv")

In [None]:
# Add the `idtaxon` column to the second observations file.
process_file(namefile="observations_con_cuadricula_2.csv")

In [3]:
processed_file_1 = "processed_observations_con_cuadricula_1.csv"
processed_file_2 = "processed_observations_con_cuadricula_2.csv"

# Columns removed from both processed files; kept in one place so the two
# frames cannot drift apart.
COLUMNS_TO_DROP = [
    'modified', 'informationWithheld', 'occurrenceRemarks',
    'identifiedBy', 'identifiedByID', 'eventTime', 'verbatimEventDate',
    'verbatimLocality', 'coordinateUncertaintyInMeters', 'countryCode',
    'stateProvince', 'identificationID', 'dateIdentified', 'identificationRemarks',
    'taxonID', 'kingdom', 'phylum', 'class', 'order', 'family',
    'genus', 'inaturalistLogin', 'publishingCountry', 'projectId',
    'lifeStage', 'reproductiveCondition', 'vitality', 'dynamicProperties', 'geometry',
]

# errors='ignore' keeps this cell re-runnable: the trimmed frames are later
# written back to these same paths, after which the columns are already gone.
df_1 = pd.read_csv(processed_file_1).drop(columns=COLUMNS_TO_DROP, errors='ignore')
df_2 = pd.read_csv(processed_file_2).drop(columns=COLUMNS_TO_DROP, errors='ignore')

print(df_1.columns)

Index(['id', 'occurrenceID', 'references', 'recordedBy', 'recordedByID',
       'eventDate', 'decimalLatitude', 'decimalLongitude', 'scientificName',
       'taxonRank', 'license', 'rightsHolder', 'sex', 'malla_codigo',
       'idtaxon'],
      dtype='object')


In [None]:
# Persist the trimmed frames, overwriting the processed CSVs in place
# (the row index is not written).
df_1.to_csv(
    path_or_buf="processed_observations_con_cuadricula_1.csv",
    index=False,
)
df_2.to_csv(
    path_or_buf="processed_observations_con_cuadricula_2.csv",
    index=False,
)