In [46]:
import pandas as pd 
import numpy as np 
import scipy.stats as stats
import matplotlib.pyplot as plt 

# Re-read the registry lookup table so that any updates to it are picked up.
# NOTE(review): the original trailing comment named '../data/CI5-XI/cancer_summary.txt'
# — presumably an alternative input file; confirm whether it is still relevant.
df_registry = pd.read_csv(
    '../data/CI5-XI/registry.txt',
    sep="\t",
    header=0,
    encoding='iso-8859-1',
)
# Cleansing: replace the file's header names with the two names used below.
df_registry.columns = ['REGISTRY', 'country_long']


In [47]:

def clean_country_region(country_col):
    '''
    Tidy one raw registry label into a "country, region" string.

    ARGS:
        country_col  String such as "*Ecuador, Quito (2008-2012)"
    RETURN:
        The label with surrounding whitespace and every asterisk removed,
        and everything from the first "(" onwards (e.g. a date range) cut off.
        Country, region and demographic breakdown text before the "(" is kept,
        e.g. "Ecuador, Quito".
    '''
    without_star = country_col.strip().replace('*', '')
    # Keep only the text in front of the first opening parenthesis.
    head, _, _ = without_star.partition('(')
    return head.strip()

def get_country_star(country_col):
    '''
    Report whether a raw registry label carries an asterisk marker.

    ARGS:
        country_col  String value (one cell of the column, not the whole Series)
    RETURN:
        True if the string contains "*" anywhere, otherwise False
    '''
    # str.find returns -1 when the character is absent.
    return country_col.find('*') != -1

def get_country(country_col):
    '''
    Extract only the country name from a registry label.

    ARGS:
        country_col  String such as "*USA, California: Los Angeles (2008-2012)"
    RETURN:
        The text in front of the first ",", "(", ":" or ";", with asterisks
        removed and surrounding whitespace stripped, e.g. "USA".
        Note that a parenthesised qualifier is dropped too, so
        "Korea (South)" becomes "Korea".
    '''
    name = country_col.split(',')[0].replace('*', '')
    for delimiter in ('(', ':', ';'):
        name = name.split(delimiter)[0]
    # Strip last: whitespace left in front of a ":" or ";" would otherwise
    # survive and make "Foo " and "Foo" look like two different countries
    # when the result is used as a grouping/merge key.
    return name.strip()


def set_national(country_col):
    '''
    Flag a label as national-level when it has no region separators.

    ARGS:
        country_col  String label, e.g. "Seychelles" or "Algeria, Batna"
    RETURN:
        1 if the string contains none of ":", ";" or ",", otherwise 0
    '''
    for separator in (':', ';', ','):
        if country_col.find(separator) != -1:
            return 0
    return 1

def set_is_subset(national_exists, is_national):
    '''
    Flag regional rows whose country has no national-level row.

    ARGS:
        national_exists  Integer; 0 means the country has no national row
        is_national      Integer flag, 1 if this row itself is national
    RETURN:
        1 only when both arguments equal 0, otherwise 0
    '''
    if national_exists != 0:
        return 0
    if is_national != 0:
        return 0
    return 1

def std_country_region(country_region_col):
    '''
    Standardize a cleaned country/region label.

    Only exact matches of the whole string are replaced; a label such as
    "UK, Scotland" is therefore returned unchanged.

    ARGS:
        country_region_col  String to transform
    RETURN:
        The standardized label when the input is one of the known special
        cases below, otherwise the input itself.
    '''
    # Named "replacements" rather than "dict" so the builtin is not shadowed.
    replacements = {
        'Australian Capital Territory': 'Australia, Australian Capital Territory',
        'Greater Poland': 'Poland',
        'Iran': 'Iran, Islamic Republic of',  # spelling fixed (was "Replublic")
        'Republic of Korea': 'Korea (South)',
        'South Australia': 'Australia, South Australia',
        'The Netherlands': 'Netherlands',
        'UK': 'United Kingdom',
        'USA': 'United States of America',
        'Western Australia': 'Australia, Western Australia',
    }
    # Fall back to the input itself when there is no replacement.
    return replacements.get(country_region_col, country_region_col)

def munge_registry(df_registry):
    '''
    Derive country/region columns and drop redundant registries.

    Side effect: the columns 'country_region', 'star', 'country_name' and
    'is_national' are added to the df_registry passed in (in place).

    ARGS:
        df_registry  DataFrame with at least a 'country_long' column
    RETURN:
        A new DataFrame holding the input columns plus 'is_national_exists'
        and 'is_subset', restricted to rows that are either national-level
        or belong to a country that has no national-level row. 'REGISTRY'
        remains an ordinary column.
    '''
    df_registry['country_region'] = list(map(clean_country_region, df_registry['country_long']))
    # One-off fix for Australia (and other special cases) so regions roll up
    # under a single country.
    df_registry['country_region'] = list(map(std_country_region, df_registry['country_region']))

    df_registry['star'] = list(map(get_country_star, df_registry['country_long']))
    # Country is taken from the cleansed country_region, not the raw label.
    df_registry['country_name'] = list(map(get_country, df_registry['country_region']))

    df_registry['is_national'] = list(map(set_national, df_registry['country_region']))
    # Some countries have cancer data at the national level and a sub-region by registry.
    # Others have regions that need to be summed. A national registry is detected by its
    # label not containing ; : or , — all non-national rows of such a country are excluded.
    national_counts = df_registry.groupby('country_name')['is_national'].sum().reset_index()
    df_registry2 = pd.merge(df_registry, national_counts, on=['country_name'], suffixes=['', '_exists'])
    df_registry2['is_subset'] = list(map(set_is_subset, df_registry2['is_national_exists'], df_registry2['is_national']))

    # Mask out redundant registries (the original author reports 464 -> 273 rows).
    keep = (df_registry2['is_national'] == 1) | (df_registry2['is_subset'] == 1)
    # The original also called df_reduced.set_index('REGISTRY') and discarded the
    # result, which did nothing; it is dropped so the code no longer suggests the
    # index is REGISTRY. Returning a copy lets callers add columns to the result
    # without pandas' SettingWithCopyWarning.
    return df_registry2[keep].copy()

In [48]:
# Build the reduced registry table. Note that munge_registry also adds its
# derived columns to df_registry itself, so df_registry changes here as well.
df_reg = munge_registry(df_registry)

In [49]:
# Display the reduced registry table returned by munge_registry.
df_reg 

Unnamed: 0,REGISTRY,country_long,country_region,star,country_name,is_national,is_national_exists,is_subset
0,101200399,...,"Algeria, Batna",True,Algeria,0,0,1
1,140400299,...,"Kenya, Nairobi",True,Kenya,0,0,1
2,169000099,...,Seychelles,False,Seychelles,1,1,0
3,171000199,*South ...,"South Africa, Eastern Cape",True,South Africa,0,0,1
4,180000299,*Uga...,"Uganda, Kyadondo County",True,Uganda,0,0,1
...,...,...,...,...,...,...,...,...
447,582602099,...,"UK, Scotland",False,UK,0,0,1
448,582604099,...,"UK, Wales",False,UK,0,0,1
449,603600099,...,Australia,False,Australia,1,1,0
460,625009399,...,New Caledonia,True,New Caledonia,1,1,0


In [50]:
# Let's update country_name directly?
# First check whether any row still has 'South Australia' as its country_name
# (the recorded output below is empty).
df_reg.loc[df_reg['country_name'] == 'South Australia']

Unnamed: 0,REGISTRY,country_long,country_region,star,country_name,is_national,is_national_exists,is_subset


In [51]:
def std_country(country_col):
    '''
    Standardize a country name.

    Only exact matches of the whole string are replaced.

    ARGS:
        country_col  String to transform
    RETURN:
        The standardized name when the input is one of the known special
        cases below, otherwise the input itself.
    '''
    # Named "replacements" rather than "dict" so the builtin is not shadowed.
    replacements = {
        'Australian Capital Territory': 'Australia',
        'Greater Poland': 'Poland',
        'Iran': 'Iran, Islamic Republic of',  # spelling fixed (was "Replublic")
        'Republic of Korea': 'Korea (South)',
        'South Australia': 'Australia',
        'The Netherlands': 'Netherlands',
        'UK': 'United Kingdom',
        'USA': 'United States of America',
        # NOTE(review): 'Australia dupe' looks like a temporary marker rather
        # than a real country name — confirm the intended value.
        'Western Australia': 'Australia dupe',
    }
    # Fall back to the input itself when there is no replacement.
    return replacements.get(country_col, country_col)


std_country('USA')

'United States of America'

In [53]:
# Inspect Canada/Australia rows in the full (unreduced) registry table.
# This relies on munge_registry having already been run: 'country_name' exists
# on df_registry only because that function adds its columns in place.
df_registry[df_registry['country_name'].isin(['Canada','Australia','Australian Capital Territory', 'South Australia', 'Western Australia'])]

Unnamed: 0,REGISTRY,country_long,country_region,star,country_name,is_national
37,312400199,...,"Canada, Alberta",True,Canada,0
38,312400299,Cana...,"Canada, British Columbia",False,Canada,0
39,312400399,...,"Canada, Manitoba",False,Canada,0
40,312400599,*C...,"Canada, New Brunswick",True,Canada,0
41,312400899,"*Canada, Newfo...","Canada, Newfoundland and Labrador",True,Canada,0
42,312400499,"*Canada, N...","Canada, Northwest Territories",True,Canada,0
43,312400699,...,"Canada, Nova Scotia",True,Canada,0
44,312401399,...,"Canada, Nunavut",True,Canada,0
45,312401099,...,"Canada, Ontario",True,Canada,0
46,312400799,"*Canada, ...","Canada, Prince Edward Island",True,Canada,0
