In [21]:
import pandas as pd
import base64
import csv
import requests
from io import StringIO

In [None]:
username = 'brendancapple'
repository_name = 'FloraFrontier'


In [23]:
def github_read_file(username, repository_name, file_path, github_token=None):
    headers = {}
    if github_token:
        headers['Authorization'] = f"token {github_token}"
        
    url = f'https://api.github.com/repos/{username}/{repository_name}/contents/{file_path}'
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    data = r.json() 
    file_content = data['content']
    file_content_encoding = data.get('encoding')
    if file_content_encoding == 'base64':
        file_content = base64.b64decode(file_content).decode()

    return file_content

def load_distributionData(scientific):
    content = github_read_file(username, repository_name, "data/DistributionData_" + scientific + ".csv", github_token)
    output = pd.read_csv(StringIO(content), sep=',', header=1)
    return output

def load_searchData(state):
    content = github_read_file(username, repository_name, "data/SearchResults_" + state + ".csv", github_token)
    output = pd.read_csv(StringIO(content), sep=',', header=1)
    return output

In [24]:
load_list = ['triticumAestivum', 'brassicaRapa', 'glycineMax', 'lactucaSativa', 'solanumLycopersicum', 'solanumTuberosum']
distribution_list = []

for plant in load_list:
    distribution_list.append(load_distributionData(plant))
    print(plant)

# load_distributionData("triticumAestivum")

# distribution_list[0]
len(distribution_list)

triticumAestivum
brassicaRapa
glycineMax
lactucaSativa
solanumLycopersicum
solanumTuberosum


6

In [25]:
results_dictionary = dict()
state_results_dictionary = dict()

agreed_counties = set()
agreed_states = set()

for distribution in distribution_list:
    counties_present = set()
    states_present = set()

    print("---NEW DISTRIBUTION---")
    # distribution.apply(dict_handling)
    for index, row in distribution.iterrows():
        # print(row['County'], row['State'] , row['Country'])
        key = str(row['County']) + " - " + str(row['State']) + " - " + str(row['Country'])
        state_key = str(row['State'])

        counties_present.add(key)
        states_present.add(state_key)

        print(key)
        print(state_key)
        if key in results_dictionary:
            results_dictionary[key] = results_dictionary[key] + 1
            # print(">> key++")
        else:
            results_dictionary[key] = 1
            # print(">> key=1")

        if state_key in state_results_dictionary:
            state_results_dictionary[state_key] = state_results_dictionary[state_key] + 1
            # print(">> state_key++")
        else:
            state_results_dictionary[state_key] = 1
            # print(">> state_key=1")

    if len(agreed_counties) == 0:
        agreed_counties = counties_present
        agreed_states = states_present
    else:
        temp_counties = set()
        temp_states = set()
        for county in agreed_counties:
            if county in counties_present:
                temp_counties.add(county)
        agreed_counties = counties_present

        for state in agreed_states:
            if state in states_present:
                temp_states.add(state)
        agreed_states = temp_states



---NEW DISTRIBUTION---
nan - Alberta - Canada
Alberta
nan - Alabama - United States
Alabama
Baldwin - Alabama - United States
Alabama
Conecuh - Alabama - United States
Alabama
Lee - Alabama - United States
Alabama
Limestone - Alabama - United States
Alabama
Marshall - Alabama - United States
Alabama
Mobile - Alabama - United States
Alabama
Montgomery - Alabama - United States
Alabama
Pickens - Alabama - United States
Alabama
Tuscaloosa - Alabama - United States
Alabama
nan - British Columbia - Canada
British Columbia
nan - Alaska - United States
Alaska
Anchorage - Alaska - United States
Alaska
Kodiak Island - Alaska - United States
Alaska
nan - Manitoba - Canada
Manitoba
nan - New Brunswick - Canada
New Brunswick
nan - Arizona - United States
Arizona
Apache - Arizona - United States
Arizona
Coconino - Arizona - United States
Arizona
Maricopa - Arizona - United States
Arizona
Pima - Arizona - United States
Arizona
Santa Cruz - Arizona - United States
Arizona
nan - Arkansas - United Stat

In [26]:
total_locations = sum(results_dictionary.values())
total_locations

1865

In [27]:
agreed_states

{'Alabama',
 'Illinois',
 'Massachusetts',
 'Michigan',
 'Missouri',
 'New York',
 'Ohio',
 'Ontario',
 'Pennsylvania'}

In [28]:
combined_distribution = pd.DataFrame(results_dictionary.items(), columns = ['County', 'Frequency'])
combined_distribution = combined_distribution.sort_values(by='Frequency', ascending=False)

state_combined_distribution = pd.DataFrame(state_results_dictionary.items(), columns = ['State', 'Frequency'])
state_combined_distribution = state_combined_distribution.sort_values(by='Frequency', ascending=False)

In [29]:
combined_distribution.head(20)

Unnamed: 0,County,Frequency
238,Berkshire - Massachusetts - United States,6
112,nan - Illinois - United States,6
357,nan - New York - United States,6
288,nan - Missouri - United States,6
445,Northampton - Pennsylvania - United States,6
1,nan - Alabama - United States,6
443,Montgomery - Pennsylvania - United States,6
403,nan - Ohio - United States,6
248,nan - Michigan - United States,6
440,Lehigh - Pennsylvania - United States,6


In [30]:
state_combined_distribution.head(20)

Unnamed: 0,State,Frequency
24,Illinois,151
29,Louisiana,146
48,Pennsylvania,106
8,California,91
43,New York,80
56,Virginia,79
52,Tennessee,74
45,Ohio,73
33,Michigan,63
36,Missouri,53


In [52]:
state_search_results = []

for state in list(agreed_states):
    print(state)
    state_results = load_searchData(state)

    if len(state_search_results) == 0:
        state_search_results = state_results['Scientific Name']
        continue
    
    temp_results = []
    for plant in state_results['Scientific Name']:
        if plant in state_search_results:
            temp_results.append(plant)
    state_search_results = temp_results

clean_state_search_results = []
for name in state_search_results:
    short_name = name
    # if ' var.' in short_name:
    #     short_name = short_name[0:short_name.find(' var.')]
    if ' (' in short_name:
        short_name = short_name[:short_name.find(' (')]
    elif ' L.' in short_name:
        short_name = short_name[:short_name.find(' L.')]
    
    clean_state_search_results.append(short_name)

Missouri
Alabama
Ohio
Ontario
Michigan
Massachusetts
Illinois
New York
Pennsylvania


In [53]:
clean_state_search_results

['Abies balsamea',
 'Abies balsamea',
 'Abutilon theophrasti Medik.',
 'Acalypha deamii',
 'Acalypha gracilens A. Gray',
 'Acalypha ostryifolia Riddell',
 'Acalypha rhomboidea Raf.',
 'Acalypha virginica',
 'Acanthospermum australe',
 'Acer campestre',
 'Acer ginnala Maxim.',
 'Acer negundo',
 'Acer negundo',
 'Acer nigrum Michx. f.',
 'Acer palmatum Thunb.',
 'Acer pensylvanicum',
 'Acer platanoides',
 'Acer pseudoplatanus',
 'Acer rubrum',
 'Acer rubrum',
 'Acer rubrum',
 'Acer saccharinum',
 'Acer saccharum Marshall',
 'Acer saccharum Marshall var. saccharum ',
 'Acer spicatum Lam.',
 'Achillea millefolium',
 'Achillea millefolium',
 'Achillea millefolium',
 'Achillea ptarmica',
 'Acicarpha tribuloides Juss.',
 'Acinos arvensis',
 'Aconitum napellus',
 'Aconitum reclinatum A. Gray',
 'Aconitum uncinatum',
 'Aconitum uncinatum L. ssp. muticum',
 'Acorus americanus',
 'Acorus calamus',
 'Actaea pachypoda Elliott',
 'Actaea podocarpa DC.',
 'Actaea racemosa',
 'Actaea racemosa',
 'Acta

In [55]:
with open("C:\\Users\\brend\Downloads\\FinalList.csv", 'w', encoding="UTF-8") as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(clean_state_search_results)