In [1]:
import pandas as pd
import re 
import numpy as np
import os.path

import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import homogenize_latex_encoding
from bibtexparser.customization import convert_to_unicode

# Define the BibTeX parser used by this notebook.
# Each parsed entry is passed through `convert_to_unicode`
# (presumably turning LaTeX-encoded characters into unicode - see the bibtexparser docs).
bib_parser = BibTexParser()
bib_parser.customization = convert_to_unicode
# Alternative customization, currently disabled:
# bib_parser.customization = homogenize_latex_encoding

from fuzzywuzzy import fuzz

## Copy database

In [2]:
from shutil import copyfile

# Work on a private copy of the merged database: the cells below rewrite
# "bare_final_database.csv" in place, so the file produced by the merge step
# is left untouched.
copyfile(
    src="../5_Merge/database_for_tematic_extraction.csv",
    dst="bare_final_database.csv",
)

'bare_final_database.csv'

## Define the function used to search for topics

In [3]:
# Function definition for topic ("tematic") addition
def add_tematic_column(file, regex, fields_to_search, search_title, overwrite=True, sep=",", index_col=0):
    """
    Flag the rows of a CSV database whose text fields match a regular expression.

    A column named ``t.<search_title>`` (whitespace replaced by underscores) is
    added to the table. Rows where ``regex`` matches, case-insensitively, in at
    least one of ``fields_to_search`` receive the string ``"1"``; every other
    row receives NaN. If the column already exists it is recomputed.

    Parameters
    ----------
    file : str
        Path of the CSV database to read (and to rewrite when ``overwrite``).
    regex : str
        Regular expression searched with ``re.IGNORECASE``.
    fields_to_search : iterable of str
        Column names to search. Non-string cells (e.g. NaN) never match.
        Columns missing from the file are skipped with a warning.
    search_title : str
        Human-readable topic name used to build the new column name.
    overwrite : bool, default True
        When True the database is written back to ``file`` using ``sep``.
    sep : str, default ","
        Field delimiter used both to read and to write the file.
    index_col : int, default 0
        Column used as the DataFrame index when reading.

    Returns
    -------
    str
        A summary message, when ``overwrite`` is True.
    pandas.Series
        The matching rows of the new column, when ``overwrite`` is False.

    Raises
    ------
    re.error
        If ``regex`` is not a valid regular expression.
    """
    import warnings

    # load database
    df = pd.read_csv(file, low_memory = True, delimiter=sep, index_col=index_col)

    # code title
    search_title = f"t.{search_title}"
    search_title = re.sub(r"\s", "_", search_title)

    # compile once; an invalid pattern fails loudly here instead of silently matching nothing
    pattern = re.compile(regex, re.IGNORECASE)

    # positional boolean mask: True where any searched field matches
    matched = np.zeros(len(df), dtype=bool)
    for field in fields_to_search:
        if field not in df.columns:
            warnings.warn(f"Column {field!r} not found in {file}; skipped")
            continue
        # only real strings are searched, so empty (NaN) cells are simply ignored
        matched |= np.array(
            [isinstance(value, str) and pattern.search(value) is not None for value in df[field]],
            dtype=bool,
        )

    # object array so the "1" markers and NaN can coexist without dtype upcasting
    flags = np.full(len(df), np.nan, dtype=object)
    flags[matched] = "1"
    df[search_title] = flags

    if overwrite:
        # write with the same delimiter the file was read with
        df.to_csv(file, sep=sep)
        return f"Found {int(matched.sum())} for {search_title} regex ({regex}) in {fields_to_search}, {file} overwrote!"
    else:
        return df.loc[matched, search_title]

## Definition of some topics

In [4]:
# Regex alternations of marker names used to detect Sanger-era, nuclear and
# chloroplast information. They are searched case-insensitively by
# add_tematic_column, and "." is deliberately left as a wildcard.
nuclear_markers = "Internal transcribed spacer|ITS1|ITS2|ETS|nrITS|18S|59S|5.8s|GBSSI|COSII|25S|Ypr10|pgiC|ETS|ADH2F|ADH3R|Aat|Skdh|Pgi.?2|Tpi.?1|Tpi.?2|AFLP"
chl_markers = "trnL|trnG|trnR|trnT|trnS|trnF|trnH|trnQ|trnD|trnY|trnE|atpB|rubisco|rbcL|psbA|matk|psbJ|petA|ndhF|psaA|ycf3|ycf4|rpl16|trnSfM|trnfM|rps4|cemA|psbM|trnDGUC"



# Search categories: topic name -> regular expression.
# The topic name becomes the column name "t.<name>" (spaces -> underscores).
categories = {
    # Word starting with "mexic"/"mejic"/"méxic"..., unless directly preceded by
    # "new " / "nuevo " (New Mexico). The word boundary lets the match start at
    # the beginning of the text or after punctuation, not only after whitespace.
    "Mexico": r"(?<!new\s)(?<!nuevo\s)\bm.[xj]ic",
    "Morphology": "morpholog|morfolog",
    # "regi.n.*? nucl" covers Spanish "región ... nuclear"
    "First gen.": f"{nuclear_markers}|{chl_markers}| sanger | marker| nuclear region| chloroplast region|regi.n.*? nucl|regi.n.*? cloro| spacer|AFLP|Microsat.ll?ite",
    "Second gen.": "radseq|angiosperm353|hybseq|454|illumina|iontorrent|whole genome|plastome",
    "Third gen.": "pacbio|nanopore",
    "Nuclear info": f"nuclear|{nuclear_markers}",
    "Chloroplast info": f"c.?lorop|{chl_markers}",
    # TODO: add mitochondrion
    "Parsimony": "parsimon",
    "Maximum likelihood": " ML |maximum likelihood|verosimilitud|iqtree|iq-tree",
    "Bayesian": "bayes| beast",
    "MSC": " astral | astrid | bpp |coalescent",
    # Spanish: "nueva especie" / "nuevas especies"
    "New species": "new species|nuevas? especie",
    # Spanish: "nuevo género" / "nuevos géneros" (the feminine form is kept for backward compatibility)
    "New genus": "new genus|nuev[oa]s? g.nero",
}

## Add topic columns for the categories defined above (major topics)

In [5]:
# Run the search for every category in turn. Each call rewrites
# bare_final_database.csv, so previous results for a topic are replaced.
for topic_name, topic_regex in categories.items():
    add_tematic_column(
        "bare_final_database.csv",
        topic_regex,
        ["c.abstract", "c.title"],
        topic_name,
    )

In [6]:
# For an individual topic, call add_tematic_column directly

## Manual families extraction -- Should be replaced by another method

In [7]:
# Get plant families mentioned in abstracts and titles

file = "bare_final_database.csv"
df = pd.read_csv(file, low_memory = True, index_col=0)

fields_to_search = ["c.abstract", "c.title"]

# Minimum fuzzywuzzy token_sort_ratio for a known family name to count as a
# correction of the word found in the text.
FAMILY_MATCH_THRESHOLD = 80


def _read_family_list(path):
    """Return the stripped lines of a text file (one family name per line)."""
    with open(path, "r") as txtfile:
        return [line.strip() for line in txtfile]


# Load list of families by major groups
angiosperms = _read_family_list("../Data_ThePlantList/Angiosperms.txt")
bryophytes = _read_family_list("../Data_ThePlantList/Bryophytes.txt")
gymnosperms = _read_family_list("../Data_ThePlantList/Gymnosperms.txt")
pteridophytes = _read_family_list("../Data_ThePlantList/Pteridophytes.txt")

all_families = angiosperms + bryophytes + gymnosperms + pteridophytes

# Group flag columns, checked in this order; the first group containing the family wins.
group_columns = [
    ("t.angiosperms", angiosperms),
    ("t.bryophytes", bryophytes),
    ("t.gymnosperms", gymnosperms),
    ("t.pteridophytes", pteridophytes),
]

# Any run of letters ending in "ceae" (case-insensitive)
family_pattern = re.compile("[A-Za-z]+(?<=ceae)", re.IGNORECASE)

# word found in the text -> name to record. The fuzzy comparison against every
# known family depends only on the word, so it is done once per distinct word.
_corrected_families = {}


def _correct_family(family):
    """Return the single known family fuzzily matching `family`, else `family` unchanged."""
    if family not in _corrected_families:
        candidates = [s for s in all_families if fuzz.token_sort_ratio(s, family) >= FAMILY_MATCH_THRESHOLD]
        # only an unambiguous (exactly one) candidate is accepted as a correction
        _corrected_families[family] = candidates[0] if len(candidates) == 1 else family
    return _corrected_families[family]


for index, row in df.iterrows():
    families = set()
    for field in fields_to_search:
        text = row.get(field)
        # empty cells are read as NaN (float); only real strings are searched
        if not isinstance(text, str):
            continue

        for family in family_pattern.findall(text):
            # check family and correct
            family_to_add = _correct_family(family)
            families.add(family_to_add.lower())

            # check group
            for column, members in group_columns:
                if family_to_add in members:
                    df.loc[index, column] = 1
                    break

    # add to df; sorted so the output file is identical from run to run
    df.loc[index, "t.families"] = ",".join(sorted(families))

df.to_csv(file)

## Add type of journal (open source, SciELO), country, quartile and impact factor (year based)

In [None]:
# Search each paper's journal in the SCImago yearly tables and add the
# journal information to the main database.

file = "bare_final_database.csv"
df = pd.read_csv(file, low_memory = True, index_col=0)


def _read_scimago(path):
    """Read one semicolon-separated SCImago export, reporting and skipping malformed lines."""
    options = dict(low_memory=True, delimiter=";", index_col=0)
    try:
        # pandas >= 1.3: "warn" reports each malformed line and skips it
        return pd.read_csv(path, on_bad_lines="warn", **options)
    except TypeError:
        # older pandas does not know on_bad_lines; use the legacy keyword
        return pd.read_csv(path, error_bad_lines=False, **options)


def _sjr_as_float(value):
    """Convert an SJR cell to float; text values use a decimal comma (e.g. "1,234")."""
    if isinstance(value, str):
        return float(value.replace(',', '.'))
    return float(value)


def _lowercase_titles(scimago_df):
    """Return the set of lower-cased journal titles of a SCImago table (missing titles dropped)."""
    return set(scimago_df["Title"].str.lower().dropna())


# iterate by year to open only once each big database
for year in range(1999, 2021):

    print(year)

    scimagoFULL_df = _read_scimago(f"../Data_scimago/all_journals/scimagojr {year}.csv")
    scimagoOPEN_df = _read_scimago(f"../Data_scimago/opensource_journals/scimagojr {year}.csv")
    scimagoSCIELO_df = _read_scimago(f"../Data_scimago/scielo_journals/scimagojr {year}.csv")

    # Build the title lookups once per year instead of lower-casing the whole
    # "Title" column again for every paper.
    # lower-cased title -> position of the FIRST row carrying that title
    full_first_row = {}
    for position, title_key in enumerate(scimagoFULL_df["Title"].str.lower()):
        if isinstance(title_key, str) and title_key not in full_first_row:
            full_first_row[title_key] = position
    open_titles = _lowercase_titles(scimagoOPEN_df)
    scielo_titles = _lowercase_titles(scimagoSCIELO_df)

    for index, row in df.iterrows():
        journal = row["c.journal"]
        paper_year = row["c.year"]

        # avoid nulls
        if not isinstance(journal, str):
            continue
        journal_key = journal.lower()

        # check quartile, impact factor and country from general database
        position = full_first_row.get(journal_key)
        if position is not None:
            # information that changes yearly is taken only from the paper's own year
            if paper_year == year:
                df.loc[index, "s.sjr"] = _sjr_as_float(scimagoFULL_df["SJR"].values[position])
                df.loc[index, "s.q"] = scimagoFULL_df["SJR Best Quartile"].values[position]
                df.loc[index, "s.h"] = scimagoFULL_df["H index"].values[position]

            # other static information, just added (a later year overwrites an earlier one)
            df.loc[index, "s.country"] = scimagoFULL_df["Country"].values[position]
            df.loc[index, "s.scimago"] = 1

        # check if exist in opensource databases (THIS COULD BE SENSITIVE TO YEAR TOO, CHECK IF CHANGES)
        if journal_key in open_titles:
            df.loc[index, "s.opensource"] = 1

        # check if exist in scielo database
        if journal_key in scielo_titles:
            df.loc[index, "s.scielo"] = 1

    # saved after every year, so an interrupted run keeps the years already processed
    df.to_csv(file)

1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


2012
2013
2014


b'Skipping line 32711: expected 20 fields, saw 21\nSkipping line 32712: expected 20 fields, saw 21\n'


2015


b'Skipping line 20760: expected 20 fields, saw 21\nSkipping line 22914: expected 20 fields, saw 21\n'


2016


b'Skipping line 19393: expected 20 fields, saw 21\nSkipping line 19394: expected 20 fields, saw 21\n'
b'Skipping line 33757: expected 20 fields, saw 21\n'


2017


b'Skipping line 19053: expected 20 fields, saw 21\nSkipping line 20953: expected 20 fields, saw 21\n'


2018


## Add markers used

TODO: add mitochondrial markers.

In [None]:
#load database
file = "bare_final_database.csv"
df = pd.read_csv(file, low_memory = True, index_col=0)

#define where to search
fields_to_search = ["c.abstract", "c.title"]

# Common markers: name written to "t.markers" -> list of regex synonyms
# (searched case-sensitively). Every name and pattern uses the plain ASCII
# hyphen, because the searched text is normalised to ASCII hyphens below.
markers = {
    "18S-26S": ["18S-26S"],
    "18S": ["[ (]18S"],
    "ADH2F-ADH3R": ["ADH2F-ADH3R"],
    "AFLPs": ["AFLP", "Amplified fragment length polymorphism"],
    "aptF-aptH": ["aptF-aptH"],
    "atpB-rbcL": ["atpB-rbcL"],
    "atpB": ["[ (]atpB"],
    "clpp": ["clpp"],
    "COSII": ["COSII"],
    "ETS": ["ETS"],
    "GBSSI": ["GBSSI"],
    "ITS": ["ITS", "ITS1-5.8s-ITS2", "ITS1", "5.8s", "ITS2", "Internal transcribed spacer", "59S"],
    "matK": ["matK"],
    "ndhA": ["ndhA"],
    "ndhF": ["ndhF"],
    "petB": ["petB"],
    "pgiC": ["pgiC"],
    "psaA-ycf3": ["psaA-ycf3"],
    "psaC-ndhE": ["psaC-ndhE"],
    "psaJ-rpl33": ["psaJ-rpl33"],
    "psbE-petL": ["psbE-petL"],
    "psbJ-petA": ["psbJ-petA"],
    "psbL-trnS": ["psbL-trnS[-AUGC]*"],
    "psbM-trnD": ["psbM-trnD[-AUGC]*"],
    "psbZ-trnG": ["psbZ-trnG[-AUGC]*"],
    "rbcL": ["[ (]rbcL","RuBisCO", "rubisco"],
    "rpl12-clpp": ["rpl12-clpp"],
    "rpl32-trnL": ["rpl32-trnL[-AUGC]*"],
    "rpoB-psbZ": ["rpoB[-AUGC]*-psbZ[-AUGC]*", "BZ"],
    "rpoB-trnC-GCA": ["rpoB-trnC[-AUGC]*"],
    "rpoC2-rpoC1": ["rpoC2-rpoC1"],
    "rps16-trnQ": ["rps16-trnQ[-AUGC]*"],
    "rps19": ["rps19"],
    "rps2-rpoc2": ["rps2-rpoc2"],
    "rps4-trnS": ["rps4-trnS[-AUGC]*"],
    "trnC-ycf6": ["trnC[-AUGC]*-ycf6"],
    "trnD-psbM": ["trnD[-AUGC]*-psbM"],
    "trnD-trnT": ["trnD[-AUGC]*-trnY[-AUGC]*-trnE[-AUGC]*-trnT[-AUGC]*"],
    # single entry: replaces the pattern with the broken "[" and its typographic-hyphen twin
    "trnD-trnY": ["trnD[-AUGC]*-trnY[-AUGC]*"],
    "trnG-trnfM" : ["trnG[-AUGC]*-trnfM[-AUGC]*"],
    "trnH-psbA": ["trnH[-AUGC]*-psbA", "psbA-trnH[-AUGC]*"],
    "trnK-rps16": ["trnK[-AUGC]*-rps16"],
    "trnL-trnF": ["trnL[-AUGC]*-trnF[-AUGC]*", "trnL-F", "trnF-trnL"],
    "trnM-atpE": ["trnM[-AUGC]*-atpE"],
    "trnP-psaJ": ["trnP[-AUGC]*-psaJ"],
    "trnQ-psbK": ["trnQ[-AUGC]*-psbK[-AUGC]*"],
    "trnR-atpA": ["trnR[-AUGC]*-atpA"],
    "trnS-psbZ": ["trnS[-AUGC]*-psbZ[-AUGC]*"],
    "trnS-rps4": ["trnS[-AUGC]*-rps4"],
    "trnS-trnG": ["trnS[-AUGC]*-trnG[-AUGC]*"],
    "trnS-trnfM": ["trnS[-AUGC]*-trnfM[-AUGC]*"],
    "trnT-trnE": ["trnT[-AUGC]*-trnE[-AUGC]*", "trnE[-AUGC]*-trnT[-AUGC]*"],
    "trnT-trnF": ["trnT[-AUGC]*-trnF[-AUGC]*", "trnF[-AUGC]*-trnT[-AUGC]*"],
    "trnT-trnL": ["trnT[-AUGC]*-trnL[-AUGC]*", "trnT-L"],
    "trnT-trntL": ["trnT[-AUGC]*-trntL[-AUGC]*"],
    "trnY-trnE": ["trnY[-AUGC]*-trnE[-AUGC]*"],
    "ycf1": ["ycf1"],
    "ycf15-ycf1": ["ycf15-ycf1"],
    "ycf4-cemA": ["ycf4-cemA"],
    "Ypr10": ["Ypr10"],
    "Microsatellites": ["Microsat.ll?ite"],
          }

# Typographic dashes (hyphen, non-breaking hyphen, figure dash, en dash,
# em dash, minus sign) are all mapped to the ASCII hyphen before searching.
dash_table = {ord(dash): "-" for dash in "\u2010\u2011\u2012\u2013\u2014\u2212"}

# one compiled regex per marker: all its synonyms joined as alternatives
marker_patterns = {marker: re.compile("|".join(synonyms)) for marker, synonyms in markers.items()}

# One value per row, assigned as a whole column at the end so the column can
# hold both strings and NaN.
marker_column = []

#iterate row by row
for index, row in df.iterrows():
    # normalised text of every searched field; empty (NaN) cells are skipped
    texts = [
        value.translate(dash_table)
        for value in (row.get(field) for field in fields_to_search)
        if isinstance(value, str)
    ]

    # markers found in any field, each listed once, in dictionary order
    result_markers = [
        marker
        for marker, pattern in marker_patterns.items()
        if any(pattern.search(text) for text in texts)
    ]

    marker_column.append(",".join(result_markers) if result_markers else np.nan)

df["t.markers"] = marker_column

df.to_csv(file)

## Add species per study and check Mexican endemics

In [None]:
#load database
file = "bare_final_database.csv"
df = pd.read_csv(file, low_memory = True, index_col=0)

#put endemic species into a list
with open("../Data_endemicSpeciesMexico/endemics_uniques.txt", "r") as fendemics:
    data = fendemics.read()
    endemic_mx = data.splitlines()
# set copy for fast membership tests
endemic_lookup = set(endemic_mx)

# Parse the raw taxa file ONCE (instead of once per database row) into
# publication title -> species listed for that publication, in file order.
species_by_publication = {}
with open("../1_Datataxa_extraction/FIRSTTEST.csv", mode="r") as raw:
    next(raw, None)  # skip the header line
    #iterate over each row (aka species)
    for line in raw:
        #manually parse each column
        cells = line.split("\",\"")
        # columns 2 (publications) and 5 (species) are both needed; shorter lines are skipped
        if len(cells) <= 5:
            continue
        # drop the last two characters of the cell
        # (presumably the closing quote and the newline - TODO confirm against the file)
        species = cells[5][:-2]
        # publications are "|"-separated; a title repeated on one line counts once
        for publication in dict.fromkeys(cells[2].split("|")):
            species_by_publication.setdefault(publication, []).append(species)

list_endemic_species = []

for index, row in df.iterrows():
    # species of the raw database whose publications include this paper's old title
    result_species = species_by_publication.get(row["a.oldtitle"], [])
    result_species_endemic = [species for species in result_species if species in endemic_lookup]

    if result_species_endemic:
        list_endemic_species.extend(result_species_endemic)
        df.loc[index, "s.includedEndemicMXSpp"] = 1

    df.loc[index, "t.species"] = ",".join(result_species)
    df.loc[index, "t.endemicMXspecies"] = ",".join(result_species_endemic)

df.to_csv(file)