In [1]:
import pandas as pd
import re 
import numpy as np
import urllib

In [2]:
# load list of titles as one column dataframe
df_raw = pd.read_csv("../Organizer_extraction/conserved_titles.txt", low_memory = True, delimiter="\t", names=["title"])

In [28]:
# list of markers for detec sanger, nuclear, and chloroplast info
nuclear_markers = "Internal transcribed spacer|ITS1|ITS2|ETS|nrITS|18S"
chl_markers = "trn[LGR]|atpB|rubisco|rbcL|psbA|trnH|matk|psbJ|petA"

# set catagories of search
categories = {
    #scope
    "Taxonomy": "taxonom|delimit|clas*ific|circumpscri|new species|nueva.? especie",
    "Systematic": "s[yi]stem.t| comple[xj]|phylog|filog|clade|speciation|monophy|monofi|rela[tc]ion|biogeo|evolu",
    # "Other related": "barcod| barra|h[yi]brid",
    "Mexico": "m*[xj]ico",
    #tematics
    "Morphology": "morpholog|morfolog",
    "First gen.": f"{nuclear_markers}|{chl_markers}| sanger | marker| nuclear region| chloroplast region|regi.n.*? nucr|regi.n.*? cloro",
    "Second gen.": "radseq|angiosperm353|hybseq|454|illumina|iontorrent",
    "Third gen.": "pacbio|nanopore",
    "Nuclear info": f"nuclear|{nuclear_markers}",
    "Chloroplast info": f"c.?lorop|{chl_markers}",
    "Parsimony": "parsimon",
    "Maximum likelihood": " ML |maximum likelihood|verosimilitud",
    "Bayesian": "bayes",
    "MSC": " astral | astrid | bpp |coalescent"
    ""
}

In [29]:
test = "A Molecular Phylogenetic Study of the Palmae (Arecaceae) Based on atpB, rbcL, and 18S nrDNA Sequences"
for k in categories:
    print(f"{k}: {re.search(categories[k], test, re.IGNORECASE)}")

Taxonomy: None
Systematic: <re.Match object; span=(12, 18), match='Phylog'>
Other related: None
Mexico: None
Morphology: None
First gen.: <re.Match object; span=(66, 70), match='atpB'>
Second gen.: None
Third gen.: None
Nuclear info: <re.Match object; span=(82, 85), match='18S'>
Chloroplast info: <re.Match object; span=(66, 70), match='atpB'>
Parsimony: None
Maximum likelihood: None
Bayesian: None
MSC: None


In [30]:
# copy original df
df = df_raw.copy()

# create the main columns for the scope of this search and google link
df["REVISADO (bibtex citation)"] = np.nan
df["Google link"] = np.nan
df["Taxonomy"] = np.nan
df["Systematic"] = np.nan
# df["Other related"] = np.nan
df["Mexico"] = np.nan

# iterate row by row
for index, row in df.iterrows():
    # check every key in category dictionary 
    for k in categories:
        # create empty column
        # search every pattern in title
        if re.search(categories[k], row["title"], re.IGNORECASE):
           #add a helper link to go directly to google scholar
           searchquery = urllib.parse.urlencode({"q": row["title"]})
           df.loc[index, "Google link"] = f"https://scholar.google.com/scholar?&{searchquery}"
           #add Auto in the cell to indicate this part of the search is true
           df.loc[index, k] = "Auto"

In [31]:
df

Unnamed: 0,title,Google link,Taxonomy,Systematic,Other related,Mexico,First gen.,Nuclear info,Chloroplast info,Morphology,Maximum likelihood,Parsimony,Third gen.,Bayesian,Second gen.,MSC
0,'Andean-centred' genera in the short-branch cl...,https://scholar.google.com/scholar?&q=%27Andea...,,Auto,,,,,,,,,,,,
1,'Le Rouge et le Noir': A decline in flavone fo...,,,,,,,,,,,,,,,
2,'Mediterranean' disjunction in Senecio sect. S...,https://scholar.google.com/scholar?&q=%27Medit...,,Auto,,,,,,,,,,,,
3,'Real time' genetic manipulation: a new tool f...,,,,,,,,,,,,,,,
4,'What are natural clades in the Caesalpinia gr...,https://scholar.google.com/scholar?&q=%27What+...,,Auto,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12163,trnL-F and rpl16 Sequence Data and Dense Taxon...,https://scholar.google.com/scholar?&q=trnL-F+a...,,Auto,,,Auto,,Auto,,,,,,,
12164,"ural Delineation, Molecular Phylogeny and Flor...",https://scholar.google.com/scholar?&q=ural+Del...,,Auto,,,,,,,,,,,,
12165,ving relationships within Valerianaceae (Dipsa...,https://scholar.google.com/scholar?&q=ving+rel...,,Auto,,,Auto,Auto,,,,,,,,
12166,"xLindsaeosoria flynnii (Lindsaeaceae), Another...",https://scholar.google.com/scholar?&q=xLindsae...,,,Auto,,,,,,,,,,,


In [32]:
# remove rows that has NaN for all categories (out of the scope of this study)
df_final = df[(df['Taxonomy'].notnull()) | (df['Systematic'].notnull()) | (df['Other related'].notnull()) | (df['Mexico'].notnull())]

In [38]:
df[(df['Mexico'].notnull()) & (df['Systematic'].isnull()) & (df['Taxonomy'].isnull())]

Unnamed: 0,title,Google link,Taxonomy,Systematic,Other related,Mexico,First gen.,Nuclear info,Chloroplast info,Morphology,Maximum likelihood,Parsimony,Third gen.,Bayesian,Second gen.,MSC
278,A historical landscape approach to assessing g...,https://scholar.google.com/scholar?&q=A+histor...,,,,Auto,,,,,,,,,,
780,Aberrant Plant Diversity in the Purgatory Wate...,https://scholar.google.com/scholar?&q=Aberrant...,,,,Auto,,,,,,,,,,
781,Aberrant Plant Diversity in the Purgatory Wate...,https://scholar.google.com/scholar?&q=Aberrant...,,,,Auto,,,,,,,,,,
1558,Characterization of 42 microsatellite markers ...,https://scholar.google.com/scholar?&q=Characte...,,,,Auto,Auto,,,,,,,,,
3169,Direct and indirect estimations of gene flow a...,https://scholar.google.com/scholar?&q=Direct+a...,,,,Auto,,,,,,,,,,
3241,Distribution and morphological characteristics...,https://scholar.google.com/scholar?&q=Distribu...,,,,Auto,,,,Auto,,,,,,
3319,Diversity of agaves used to produce pulque in ...,https://scholar.google.com/scholar?&q=Diversit...,,,,Auto,,,,,,,,,,
3635,Evolution and domestication of Lima bean (Phas...,https://scholar.google.com/scholar?&q=Evolutio...,,,,Auto,,,,,,,,,,
3682,Evolution of Tidestromia (Amaranthaceae) in th...,https://scholar.google.com/scholar?&q=Evolutio...,,,,Auto,,,,,,,,,,
3909,Expression Analysis of Genes Involved in the S...,https://scholar.google.com/scholar?&q=Expressi...,,,,Auto,,,,,,,,,,


In [34]:
df_final.to_csv("final_database.csv")

In [35]:
df_final.shape

(4865, 16)