In [2]:
import pandas as pd
import os
import base64
import json
import requests
from io import StringIO

In [None]:
# Repository that hosts the distribution CSV files.
username = 'brendancapple'
repository_name = 'FloraFrontier'

# Token handed to github_read_file by load_distributionData. It is read from
# the environment so that no secret is stored in the notebook; when the
# variable is unset this is None and requests are sent unauthenticated.
# NOTE(review): the variable name GITHUB_TOKEN is an assumption -- adjust it
# to whatever your environment actually exports.
github_token = os.environ.get('GITHUB_TOKEN')


In [4]:
def github_read_file(username, repository_name, file_path, github_token=None, timeout=30):
    """Fetch a single file from a GitHub repository via the REST "contents" endpoint.

    Parameters
    ----------
    username : str
        Account segment of the repository URL.
    repository_name : str
        Repository segment of the URL.
    file_path : str
        Path of the file inside the repository; interpolated into the URL as-is.
    github_token : str, optional
        When truthy, sent as an ``Authorization: token <github_token>`` header.
        When omitted, the request carries no authorization header.
    timeout : float or tuple, optional
        Seconds passed to ``requests.get`` so that a stalled connection cannot
        block the notebook forever. Defaults to 30.

    Returns
    -------
    str
        The ``content`` field of the JSON response. If the response reports
        ``encoding == 'base64'`` the content is base64-decoded and then decoded
        to text; otherwise it is returned unchanged.

    Raises
    ------
    requests.HTTPError
        If the response status indicates an error (via ``raise_for_status``).
    requests.Timeout
        If the server does not answer within ``timeout``.
    """
    headers = {}
    if github_token:
        headers['Authorization'] = f"token {github_token}"

    url = f'https://api.github.com/repos/{username}/{repository_name}/contents/{file_path}'
    # Without an explicit timeout, requests waits indefinitely on a dead connection.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    data = r.json()

    file_content = data['content']
    file_content_encoding = data.get('encoding')
    if file_content_encoding == 'base64':
        file_content = base64.b64decode(file_content).decode()

    return file_content

def load_distributionData(scientific):
    """Download one species' distribution CSV and parse it into a DataFrame.

    The file is looked up at ``data/DistributionData_<scientific>.csv`` in the
    repository named by the notebook-level ``username`` / ``repository_name``
    variables, using the notebook-level ``github_token``.

    Parameters
    ----------
    scientific : str
        Species identifier that forms the variable part of the file name.

    Returns
    -------
    pandas.DataFrame
        The parsed table. The second line of the file is used as the column
        header (``header=1``), so the first line is skipped.
    """
    csv_path = "".join(("data/DistributionData_", scientific, ".csv"))
    csv_text = github_read_file(username, repository_name, csv_path, github_token)
    return pd.read_csv(StringIO(csv_text), sep=',', header=1)


In [63]:
# Species to load; each entry is the variable part of a
# data/DistributionData_<name>.csv file name.
load_list = [
    'triticumAestivum',
    'brassicaRapa',
    'glycineMax',
    'lactucaSativa',
    'solanumLycopersicum',
    'solanumTuberosum',
]

# One DataFrame per species, in the same order as load_list.
distribution_list = []
for plant in load_list:
    plant_distribution = load_distributionData(plant)
    distribution_list.append(plant_distribution)
    # Echo the name once its table has been fetched, as a simple progress log.
    print(plant)

triticumAestivum
brassicaRapa
glycineMax
lactucaSativa
solanumLycopersicum
solanumTuberosum


In [78]:
# Maps a location key (built by the code below) to the number of times it was seen.
results_dictionary = {}

def dict_handling(a):
    """Print ``a`` and count it in ``results_dictionary``.

    The counter key is ``"<a[0]> - <a[1]>"``, built from the string forms of
    the first two items of ``a``. The matching entry in the notebook-level
    ``results_dictionary`` is incremented, starting at 1 for a new key.
    """
    # Same bytes as printing str(a) + "\n" with the default line ending.
    print(str(a), end="\n\n")

    key = " - ".join((str(a[0]), str(a[1])))
    results_dictionary[key] = results_dictionary.get(key, 0) + 1

# Count, across every loaded table, how many records share the same
# "County - State - Country" key. Missing cells are stringified like any other
# value, so a record without a county is counted under "nan - <State> - <Country>".
for distribution in distribution_list:
    # Walk the three location columns in lockstep instead of using iterrows(),
    # which builds a Series for every row and is much slower.
    locations = zip(distribution['County'], distribution['State'], distribution['Country'])
    for county, state, country in locations:
        key = str(county) + " - " + str(state) + " - " + str(country)
        results_dictionary[key] = results_dictionary.get(key, 0) + 1


In [87]:
# Sum of all per-key counts in results_dictionary, i.e. the number of records
# tallied (not the number of distinct keys).
total_locations = sum(count for count in results_dictionary.values())
total_locations

1865

In [90]:
# Turn the tally into a two-column table ordered from most to least frequent.
# The 'County' column holds the full "County - State - Country" key.
combined_distribution = (
    pd.DataFrame(results_dictionary.items(), columns=['County', 'Frequency'])
    .sort_values(by='Frequency', ascending=False)
)

combined_distribution.head(20)

Unnamed: 0,County,Frequency
238,Berkshire - Massachusetts - United States,6
112,nan - Illinois - United States,6
357,nan - New York - United States,6
288,nan - Missouri - United States,6
445,Northampton - Pennsylvania - United States,6
1,nan - Alabama - United States,6
443,Montgomery - Pennsylvania - United States,6
403,nan - Ohio - United States,6
248,nan - Michigan - United States,6
440,Lehigh - Pennsylvania - United States,6
