In [2]:
import pandas as pd
import os
import base64
import json
import requests
from io import StringIO

In [None]:
username = 'brendancapple'
repository_name = 'FloraFrontier'


In [4]:
def github_read_file(username, repository_name, file_path, github_token=None):
    headers = {}
    if github_token:
        headers['Authorization'] = f"token {github_token}"
        
    url = f'https://api.github.com/repos/{username}/{repository_name}/contents/{file_path}'
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    data = r.json() 
    file_content = data['content']
    file_content_encoding = data.get('encoding')
    if file_content_encoding == 'base64':
        file_content = base64.b64decode(file_content).decode()

    return file_content

def load_distributionData(scientific):
    content = github_read_file(username, repository_name, "data/DistributionData_" + scientific + ".csv", github_token)
    output = pd.read_csv(StringIO(content), sep=',', header=1)
    return output


In [106]:
load_list = ['triticumAestivum', 'brassicaRapa', 'glycineMax', 'lactucaSativa', 'solanumLycopersicum', 'solanumTuberosum']
distribution_list = []

for plant in load_list:
    distribution_list.append(load_distributionData(plant))
    print(plant)

# load_distributionData("triticumAestivum")

# distribution_list[0]
len(distribution_list)

triticumAestivum
brassicaRapa
glycineMax
lactucaSativa
solanumLycopersicum
solanumTuberosum


6

In [124]:
results_dictionary = dict()
state_results_dictionary = dict()

agreed_counties = set()
agreed_states = set()

for distribution in distribution_list:
    counties_present = set()
    states_present = set()

    print("---NEW DISTRIBUTION---")
    # distribution.apply(dict_handling)
    for index, row in distribution.iterrows():
        # print(row['County'], row['State'] , row['Country'])
        key = str(row['County']) + " - " + str(row['State']) + " - " + str(row['Country'])
        state_key = str(row['State']) + " - " + str(row['Country'])

        counties_present.add(key)
        states_present.add(state_key)

        print(key)
        print(state_key)
        if key in results_dictionary:
            results_dictionary[key] = results_dictionary[key] + 1
            # print(">> key++")
        else:
            results_dictionary[key] = 1
            # print(">> key=1")

        if state_key in state_results_dictionary:
            state_results_dictionary[state_key] = state_results_dictionary[state_key] + 1
            # print(">> state_key++")
        else:
            state_results_dictionary[state_key] = 1
            # print(">> state_key=1")

    if len(agreed_counties) == 0:
        agreed_counties = counties_present
        agreed_states = states_present
    else:
        temp_counties = set()
        temp_states = set()
        for county in agreed_counties:
            if county in counties_present:
                temp_counties.add(county)
        agreed_counties = counties_present

        for state in agreed_states:
            if state in states_present:
                temp_states.add(state)
        agreed_states = temp_states



---NEW DISTRIBUTION---
nan - Alberta - Canada
Alberta - Canada
nan - Alabama - United States
Alabama - United States
Baldwin - Alabama - United States
Alabama - United States
Conecuh - Alabama - United States
Alabama - United States
Lee - Alabama - United States
Alabama - United States
Limestone - Alabama - United States
Alabama - United States
Marshall - Alabama - United States
Alabama - United States
Mobile - Alabama - United States
Alabama - United States
Montgomery - Alabama - United States
Alabama - United States
Pickens - Alabama - United States
Alabama - United States
Tuscaloosa - Alabama - United States
Alabama - United States
nan - British Columbia - Canada
British Columbia - Canada
nan - Alaska - United States
Alaska - United States
Anchorage - Alaska - United States
Alaska - United States
Kodiak Island - Alaska - United States
Alaska - United States
nan - Manitoba - Canada
Manitoba - Canada
nan - New Brunswick - Canada
New Brunswick - Canada
nan - Arizona - United States
Ari

In [125]:
total_locations = sum(results_dictionary.values())
total_locations

1865

In [128]:
agreed_states

{'Alabama - United States',
 'Illinois - United States',
 'Massachusetts - United States',
 'Michigan - United States',
 'Missouri - United States',
 'New York - United States',
 'Ohio - United States',
 'Ontario - Canada',
 'Pennsylvania - United States'}

In [113]:
combined_distribution = pd.DataFrame(results_dictionary.items(), columns = ['County', 'Frequency'])
combined_distribution = combined_distribution.sort_values(by='Frequency', ascending=False)

state_combined_distribution = pd.DataFrame(state_results_dictionary.items(), columns = ['State', 'Frequency'])
state_combined_distribution = state_combined_distribution.sort_values(by='Frequency', ascending=False)

In [114]:
combined_distribution.head(20)

Unnamed: 0,County,Frequency
238,Berkshire - Massachusetts - United States,6
112,nan - Illinois - United States,6
357,nan - New York - United States,6
288,nan - Missouri - United States,6
445,Northampton - Pennsylvania - United States,6
1,nan - Alabama - United States,6
443,Montgomery - Pennsylvania - United States,6
403,nan - Ohio - United States,6
248,nan - Michigan - United States,6
440,Lehigh - Pennsylvania - United States,6


In [115]:
state_combined_distribution.head(20)

Unnamed: 0,State,Frequency
24,Illinois - United States,151
29,Louisiana - United States,146
48,Pennsylvania - United States,106
8,California - United States,91
43,New York - United States,80
56,Virginia - United States,79
52,Tennessee - United States,74
45,Ohio - United States,73
33,Michigan - United States,63
36,Missouri - United States,53
