In [133]:
import re
import os
from collections import namedtuple

In [2]:
import markdown

In [3]:
# Local directory that holds the species profile documents as .md files.
text_dir = "./tmp/especies"

In [4]:
# Path of a single species profile inside text_dir.
text_file = f"{text_dir}/ameerega-flavopicta.md"

# Get text for each species profile

### Google Drive

In [5]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from pydrive.auth import ServiceAccountCredentials

In [6]:
# Key file and OAuth scope handed to ServiceAccountCredentials for Drive access.
credentialsFile = "./credentials.json"
scope = ["https://www.googleapis.com/auth/drive"]

In [7]:
# Create the auth object and attach credentials loaded from the JSON key file
# for the configured scope.
gauth = GoogleAuth()
gauth.credentials = ServiceAccountCredentials.from_json_keyfile_name(
    credentialsFile,
    scope,
)

In [8]:
# Drive client built from `gauth`; every ListFile query in this notebook goes through it.
drive = GoogleDrive(gauth)

In [92]:
# Drive id used as the parent folder when looking up the profiles folder.
PROJECT_ID = '0B0M5IL0AEOXidHZmTjZzMHlLLWc'

# Id of the first entry inside PROJECT_ID whose title contains 'Perfil'.
# Raises IndexError if the query returns nothing.
PROFILES_ROOT_ID = drive.ListFile(
    {'q': f"title contains 'Perfil' and '{PROJECT_ID}' in parents"}
).GetList()[0]['id']

In [130]:
def getFamiliesFoldersIds():
    """Map each family folder title to its Drive id.

    Lists the entries whose parent is ``PROFILES_ROOT_ID`` and keeps only
    those whose title is made of ASCII letters and ends in ``ae``.

    Returns:
        dict: ``{title: id}`` for every matching entry.
    """
    children = drive.ListFile({'q': f"'{PROFILES_ROOT_ID}' in parents"}).GetList()

    ids_by_title = {}
    for child in children:
        title = child['title']
        if re.match('^[a-zA-Z]*ae$', title):
            ids_by_title[title] = child['id']
    return ids_by_title


def getFamilyDocuments(family, family_folder_id):
    """Return the entries of a family folder whose MIME type contains 'document'.

    Args:
        family: Accepted for the caller's convenience; not used in the query.
        family_folder_id: Drive id of the folder to list.

    Returns:
        The list produced by ``GetList()`` for that query.
    """
    query = f"'{family_folder_id}' in parents and mimeType contains 'document'"
    listing = drive.ListFile({'q': query})
    return listing.GetList()


In [151]:
# Fetches all documents for each family

def fetchProfilesDocuments():
    """Download every family's documents from Drive as plain text.

    For each family returned by ``getFamiliesFoldersIds()``, the document
    titled ``familia`` is saved to ``./tmp/familias/<family lower-cased>.md``;
    every other document is saved to ``./tmp/especies/<document title>.md``.
    Progress is printed for each family and document.
    """
    # exist_ok=True so the cell can be re-run once the directories exist;
    # without it the second run fails with FileExistsError.
    os.makedirs('./tmp/especies', exist_ok=True)
    os.makedirs('./tmp/familias', exist_ok=True)

    for family, folder_id in getFamiliesFoldersIds().items():
        print(f"Fetching documents for family {family}")
        for document in getFamilyDocuments(family, folder_id):
            title = document['title']
            print(f"\tFetching document: {title}")
            if title == 'familia':
                target = f"./tmp/familias/{family.lower()}.md"
            else:
                # NOTE(review): the Drive title is used verbatim as the file
                # name; confirm titles never contain path separators.
                target = f"./tmp/especies/{title}.md"
            document.GetContentFile(target, mimetype="text/plain")

In [152]:
# Download all family and species documents into ./tmp (queries Google Drive).
fetchProfilesDocuments()

Fetching documents for family odontophrynidae
	Fetching document: familia
	Fetching document: proceratophrys-rotundipalpebra
	Fetching document: odontophrynus-salvatori
	Fetching document: odontophrynus-cultripes
	Fetching document: proceratophrys-goyana
Fetching documents for family microhylidae
	Fetching document: familia
	Fetching document: elachistocleis-cesarii
Fetching documents for family leptodactylidae
	Fetching document: leptodactylus-latrans
	Fetching document: leptodactylus-fuscus
	Fetching document: leptodactylus-furnarius
	Fetching document: leptodactylus-mystacinus
	Fetching document: leptodactylus-mystaceus
	Fetching document: familia
	Fetching document: pseudopaludicola-giarettai
	Fetching document: physalaemus-marmoratus
	Fetching document: leptodactylus-podicipinus
	Fetching document: leptodactylus-chaquensis
	Fetching document: adenomera-juikitam
	Fetching document: adenomera-cotuba
	Fetching document: pseudopaludicola-ternetzi
	Fetching document: pseudopaludicola-s

# Species profile text

In [261]:
# Result type, defined once so every parsed profile shares the same class.
ProfileText = namedtuple('ProfileText', ['main_desc', 'named_sections'])


def getSpeciesProfileStructure(txt):
    """Split a profile text into its leading description and titled sections.

    Args:
        txt: Profile text in which section headings start with one or more
            ``#`` characters.

    Returns:
        ProfileText: ``main_desc`` is everything before the first ``#``,
        stripped. ``named_sections`` is a list of ``(title, body)`` pairs in
        document order, both stripped.

    A heading is only recognised when its title is 1-30 word/whitespace
    characters followed by a newline and a non-empty body without ``#``;
    other headings are not captured.
    """
    # Everything up to the first '#'. This pattern always matches (possibly empty).
    main_desc = re.match(r'[^#]*', txt).group(0).strip()

    # The title is matched lazily so it stops at the first newline. Because
    # \s also matches '\n', a greedy match could absorb following content
    # lines that contain only word characters and spaces.
    raw_sections = re.findall(r'#+([\w\s]{1,30}?)\n([^#]+)', txt)
    named_sections = [(title.strip(), body.strip()) for title, body in raw_sections]

    return ProfileText(main_desc, named_sections)

def profileText_to_markdown(profile_txt_str):
    """Render a parsed profile as one Markdown/HTML string.

    Args:
        profile_txt_str: Object with ``main_desc`` (str) and
            ``named_sections`` (iterable of ``(name, content)`` pairs).

    Returns:
        str: The main description followed by each section in order. A
        section named 'curiosidades' (case-insensitive) becomes an HTML
        jumbotron block titled 'Você sabia?' whose body is passed through
        ``markdown.markdown``. Any other section becomes a ``####`` heading
        followed by its content.
    """
    chunks = [profile_txt_str.main_desc]

    for title, body in profile_txt_str.named_sections:
        if title.lower() != 'curiosidades':
            chunks.append(f'\n#### {title}\n{body}')
            continue

        chunks.append(f'''
<div class="col-lg-12">
  <div class="jumbotron">
    <h1 class="display-4">Você sabia?</h1>
    <hr class="my-4">
    <p>{markdown.markdown(body)}</p>
  </div>
</div>
''')

    return ''.join(chunks)

In [153]:
import glob

In [262]:
# Convert every downloaded species profile into its page under ../especies.
# Only *.md files are globbed so a stray file in the directory cannot break
# the slug extraction below.
for spfilePath in glob.glob('./tmp/especies/*.md'):
    # Slug = trailing run of word characters and hyphens before the ".md" suffix.
    species_slug = re.search(r'([\w-]*)\.md$', spfilePath).group(1)

    # Explicit UTF-8 on both sides: the generated page can contain non-ASCII
    # text (e.g. 'Você sabia?') and the platform default encoding varies.
    with open(spfilePath, encoding='utf-8') as f:
        txt = f.read()
    profile_data = getSpeciesProfileStructure(txt)

    with open(f'../especies/{species_slug}.md', 'w', encoding='utf-8') as f:
        f.write(profileText_to_markdown(profile_data))
    


In [242]:
# Run the conversion for one species page and write the result beside it
# with a '-1' suffix.
species_slug = 'bokermannohyla-pseudopseudis'
text_file = f'../especies/{species_slug}.md'

# Explicit UTF-8 on both sides: the page can contain non-ASCII text and the
# platform default encoding varies.
with open(text_file, encoding='utf-8') as f:
    txt = f.read()
profile_data = getSpeciesProfileStructure(txt)

with open(f'../especies/{species_slug}-1.md', 'w', encoding='utf-8') as f:
    f.write(profileText_to_markdown(profile_data))