In [5]:
import pandas as pd
import re 
import numpy as np
import urllib

In [6]:
# load list of titles as one column dataframe
df_raw = pd.read_csv("../2b_Organizer_extraction - SRA/conserved_titles.txt", low_memory = True, delimiter="\t", names=["title"])

In [7]:
# list of markers for detec sanger, nuclear, and chloroplast info
nuclear_markers = "Internal transcribed spacer|ITS1|ITS2|ETS|nrITS|18S"
chl_markers = "trn[LGR]|atpB|rubisco|rbcL|psbA|trnH|matk|psbJ|petA"

# set catagories of search
categories = {
    #scope
    "Taxonomy": "taxonom|delimit|clas*ific|circumpscri|new species|nueva.? especie",
    "Systematic": "s[yi]stem.t| comple[xj]|phylog|filog|clade|speciation|monophy|monofi|rela[tc]ion|biogeo|evolu",
    # "Other related": "barcod| barra|h[yi]brid",
    "Mexico": "m*[xj]ico",
    #tematics
    "Morphology": "morpholog|morfolog",
    "First gen.": f"{nuclear_markers}|{chl_markers}| sanger | marker| nuclear region| chloroplast region|regi.n.*? nucr|regi.n.*? cloro",
    "Second gen.": "radseq|angiosperm353|hybseq|454|illumina|iontorrent",
    "Third gen.": "pacbio|nanopore",
    "Nuclear info": f"nuclear|{nuclear_markers}",
    "Chloroplast info": f"c.?lorop|{chl_markers}",
    "Parsimony": "parsimon",
    "Maximum likelihood": " ML |maximum likelihood|verosimilitud",
    "Bayesian": "bayes",
    "MSC": " astral | astrid | bpp |coalescent"
    ""
}

In [8]:
test = "A Molecular Phylogenetic Study of the Palmae (Arecaceae) Based on atpB, rbcL, and 18S nrDNA Sequences"
for k in categories:
    print(f"{k}: {re.search(categories[k], test, re.IGNORECASE)}")

Taxonomy: None
Systematic: <re.Match object; span=(12, 18), match='Phylog'>
Mexico: None
Morphology: None
First gen.: <re.Match object; span=(66, 70), match='atpB'>
Second gen.: None
Third gen.: None
Nuclear info: <re.Match object; span=(82, 85), match='18S'>
Chloroplast info: <re.Match object; span=(66, 70), match='atpB'>
Parsimony: None
Maximum likelihood: None
Bayesian: None
MSC: None


In [9]:
# copy original df
df = df_raw.copy()

# create the main columns for the scope of this search and google link
df["REVISADO (bibtex citation)"] = np.nan
df["Google link"] = np.nan
df["Taxonomy"] = np.nan
df["Systematic"] = np.nan
# df["Other related"] = np.nan
df["Mexico"] = np.nan

# iterate row by row
for index, row in df.iterrows():
    # check every key in category dictionary 
    for k in categories:
        # create empty column
        # search every pattern in title
        if re.search(categories[k], row["title"], re.IGNORECASE):
           #add a helper link to go directly to google scholar
           searchquery = urllib.parse.urlencode({"q": row["title"]})
           df.loc[index, "Google link"] = f"https://scholar.google.com/scholar?&{searchquery}"
           #add Auto in the cell to indicate this part of the search is true
           df.loc[index, k] = "Auto"

In [10]:
df

Unnamed: 0,title,REVISADO (bibtex citation),Google link,Taxonomy,Systematic,Mexico,Second gen.,Third gen.,Chloroplast info,Morphology,First gen.,Nuclear info
0,cultivar:25e11-NonTol Genome sequencing,,,,,,,,,,,
1,cultivar:25e11-Tol Genome sequencing,,,,,,,,,,,
2,cultivar:Allahabad Safeda Genome sequencing a...,,,,,,,,,,,
3,cultivar:Bridgeton Genome sequencing and asse...,,,,,,,,,,,
4,cultivar:CCC population Genome sequencing,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
4030,sweet potato starch content transcriptome,,,,,,,,,,,
4031,sweetpotato storage root Raw sequence reads,,,,,,,,,,,
4032,transcriptome covering the whole development p...,,,,,,,,,,,
4033,transcriptomic of exogenous GABA in poplar(Pop...,,,,,,,,,,,


In [15]:
# remove rows that has NaN for all categories (out of the scope of this study)
df_final = df[(df['Taxonomy'].notnull()) | (df['Systematic'].notnull()) | (df['Mexico'].notnull())]

In [16]:
df_final

Unnamed: 0,title,REVISADO (bibtex citation),Google link,Taxonomy,Systematic,Mexico,Second gen.,Third gen.,Chloroplast info,Morphology,First gen.,Nuclear info
10,1) 37 diploid Helianthus species with Phoebant...,,https://scholar.google.com/scholar?&q=1%29+37+...,,Auto,,,,,,,
22,3D genome reorganizations in the evolution and...,,https://scholar.google.com/scholar?&q=3D+genom...,,Auto,,,,,,,
24,A New Pipeline for Removing Paralogs in Target...,,https://scholar.google.com/scholar?&q=A+New+Pi...,,Auto,,,,,,,
27,A VIEW OF RHYNCHOSPOREAE (CYPERACEAE) DIVERSIF...,,https://scholar.google.com/scholar?&q=A+VIEW+O...,,Auto,,,,,,,
28,A backbone phylogeny of the American clade oak...,,https://scholar.google.com/scholar?&q=A+backbo...,,Auto,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3906,We included 85 samples of species of Bursera (...,,https://scholar.google.com/scholar?&q=We+inclu...,,Auto,,,,,,,
3923,Whole-Genome Duplication and Acceleration of M...,,https://scholar.google.com/scholar?&q=Whole-Ge...,,Auto,,,,,,,
3931,Widespread adaptive evolution during repeated ...,,https://scholar.google.com/scholar?&q=Widespre...,,Auto,,,,,,,
3992,double digest RAD data from Berberis spp. (Ber...,,https://scholar.google.com/scholar?&q=double+d...,,,Auto,,,,,,


In [17]:
df[(df['Mexico'].notnull()) & (df['Systematic'].isnull()) & (df['Taxonomy'].isnull())]

Unnamed: 0,title,REVISADO (bibtex citation),Google link,Taxonomy,Systematic,Mexico,Second gen.,Third gen.,Chloroplast info,Morphology,First gen.,Nuclear info
3691,Toxicodendron radicans,,https://scholar.google.com/scholar?&q=Toxicode...,,,Auto,,,,,,
3692,Toxicodendron radicans (poison ivy) leaf and r...,,https://scholar.google.com/scholar?&q=Toxicode...,,,Auto,,,,,,
3992,double digest RAD data from Berberis spp. (Ber...,,https://scholar.google.com/scholar?&q=double+d...,,,Auto,,,,,,


In [18]:
df_final.to_csv("final_database_SRA.csv")

In [19]:
df_final.shape

(153, 12)