## String-Matching Parish Detection
    Project: Capstone
    Author:  Benedikt Graf
    Version: 11-22-2021

### Import Packages abd Setup Environemnt

In [1]:
### BASIC
import numpy as np # for arrays
import pandas as pd # for panel data
import random # for random number generation
import matplotlib.pyplot as plt

### WORD MATCHING
from fuzzywuzzy import fuzz
import jaro
import unidecode

### MISC
import os # miscellaneous operating system interfaces
from IPython.display import display, HTML # to adjust display preferences
import warnings
warnings.filterwarnings('ignore')

In [2]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

### Importing the Clean Data

In [3]:
full_data = pd.read_csv("./data/tocodeparish/cleaned_data.csv", index_col=0)

In [4]:
# all birthcounty options
full_data.birthcounty.unique()

array(['Uppsala', 'Orebro', 'Malmohus', 'Vasternorrland', 'Ostergotland',
       'Kalmar', nan, 'Varmland', 'Halland', 'Skaraborg', 'Sodermanland',
       'Jonkoping', 'Gavleborg', 'Kristianstad', 'Norrbotten',
       'Goteborg och Bohus', 'Kronoberg', 'Dalarna', 'Gotland',
       'Vasterbotten', 'Vastmanland', 'Jamtland', 'Alvsborg', 'Blekinge',
       'Stockholm', 'Smaland', 'Vastra Gotaland', 'Skane', 'Uppland',
       'Lappland', 'Norrland', 'Oland', 'Norrbotten or Vasterbotten'],
      dtype=object)

Subsetting by birthcounty

### Data Format and Subset

In [5]:
def formater(data):
    '''
    formats data (subset, lowercase, add columns)
    '''
    
    # Formatting all place names to lower case
    data = data[["fsid", "ns_birthplace"]]
    
    data['ns_birthplace'] = [x.lower() for x in data['ns_birthplace']]
    data['word_search'] = "Not identified"
    data["ls_search"] = "Not identified"
    data["ls_ratio"] = np.NaN
    data["jw_search"] = "Not identified"
    data["jw_ratio"] = np.NaN
    
    display(data.head())
    
    print(data.shape)
    
    return(data)

In [6]:
def parish_data_selecter(parish_code, full_data):
    '''
    subsets parish data based on Swedish "parish_code" and 
    formats data
    '''
    
    # Norrbotten (BD) #25 in meta data
    data_bd = full_data[(full_data.birthcounty == 'Norrbotten') |
                        (full_data.birthcounty == 'Norrbotten or Vasterbotten') |
                        (full_data.birthcounty == 'Norrland') | 
                        (full_data.birthcounty == 'Lappland') | 
                        (full_data.birthcounty == np.nan)].reset_index(drop=True)

    # Norrbotten (AC) #24
    data_ac = full_data[(full_data.birthcounty == 'Vasterbotten')].reset_index(drop=True)

    # Jämtland (Z) #23
    data_z = full_data[(full_data.birthcounty == 'Jamtland')].reset_index(drop=True)

    # Västernorrland (Y) #22
    data_y = full_data[(full_data.birthcounty == 'Vasternorrland')].reset_index(drop=True)

    # Gävleborg (X) #21
    data_x = full_data[(full_data.birthcounty == 'Gavleborg')].reset_index(drop=True)

    # Dalarna (W) #20
    data_w = full_data[(full_data.birthcounty == 'Dalarna')].reset_index(drop=True)
    
    
    if parish_code == "bd":
        data = data_bd
    if parish_code == "ac":
        data = data_ac
    if parish_code == "z":
        data = data_z
    if parish_code == "y":
        data = data_y
    if parish_code == "x":
        data = data_x
    if parish_code == "w":
        data = data_w
    
    data = formater(data=data)
    
    return(data)
    

### String-Match Functions

In [7]:
def word_parser(parish_dict, data):
    '''
    naive word parser
    (does not take space between words into consideration)
    '''
    for key in parish_dict:
        for i in parish_dict[key]:
            data["word_search"] = np.where(data['ns_birthplace'].str.contains(i), key, data["word_search"])
          
    return(data)

In [8]:
def word_parser2(data, parish_dict):
    '''
    find exact string-search matches
    '''
    ### ITERATE OVER OBSERVATIONS
    for i in range(0,len(data)):
        string = data['ns_birthplace'].loc[i]
        ## STRING TO LIST
        s_list = string.split()
        
        ## REMOVE USELESS WORDS
        removal_list = ["sweden", "sverige", "gävleborg", "gavleborg", "gavlebg"]
        for r in removal_list:
            if r in s_list: s_list.remove(r)
        
        ## DF FOR RESULTS OF OBERVATION i

        ## ITERATE OVER WORDS IN OBSERVATION i
        for j in range(0,len(s_list)):
            
            for key in parish_dict:
                
                for place in parish_dict[key]:
                
                    ## EXACT STRING SEARCH
                    if place == s_list[j]: 
                        data['word_search'].loc[i] = key
                        
    return(data)

In [9]:
def edit_dist_parser(data,parish_dict,ls_thresh,jw_thresh):
    '''
    computes Levenshtein and Jaro-Winkler ratios for each observation
    appends results above ratio threshold to dataframe
    assigns parishes with max ratio
    '''
    ### ITERATE OVER OBSERVATIONS
    for i in range(0,data.shape[0]):
        string = data['ns_birthplace'].loc[i]
        ## REMOVE ACCCENTS
        unaccented_string = unidecode.unidecode(string)
        ## STRING TO LIST
        s_list = unaccented_string.split()
        
        ## REMOVE USELESS WORDS
        removal_list = ["sweden", "sverige", "gävleborg", "gavleborg", "gavlebg"]
        for r in removal_list:
            if r in s_list: s_list.remove(r)
        
        ## DF FOR RESULTS OF OBERVATION i
        idf = pd.DataFrame(columns=['obs_j','ls_ratio', 'ls_parish', 'jw_ratio', "jw_parish"])

        ## ITERATE OVER WORDS IN OBSERVATION i
        for j in range(0,len(s_list)):
            
            for key in parish_dict:
                
                if parish_dict[key][0] != "DO_NOT_SEARCH":
                    
                    for place in parish_dict[key]:
                
                        ## LEVENSHTEIN DISTANCE
                        ls_ratio = fuzz.ratio(place, s_list[j])/100
                        if ls_ratio > ls_thresh:
                            idf = idf.append({'obs_j': s_list[j],
                                              'ls_ratio': ls_ratio, 
                                              'ls_parish': key}, 
                                             ignore_index=True)

                        ## JARO-WINKLER DISTANCE
                        jw_ratio = jaro.jaro_winkler_metric(place, s_list[j])
                        if (jw_ratio) > jw_thresh:
                            idf = idf.append({'obs_j': s_list[j],
                                              'jw_ratio': jw_ratio, 
                                              'jw_parish': key}, 
                                              ignore_index=True)
            
        ### FIND BEST RESULTS FOR OBSERVATION i
        #display(idf)
        if (idf.ls_ratio.count()) > 0:
            max_ls = idf.ls_ratio.idxmax(axis=0)
            data['ls_search'].loc[i] = idf['ls_parish'].loc[max_ls]
            data['ls_ratio'].loc[i] = idf['ls_ratio'].loc[max_ls]

        if (idf.jw_ratio.count()) > 0:    
            max_jw = idf.jw_ratio.idxmax(axis=0)
            data['jw_search'].loc[i] = idf['jw_parish'].loc[max_jw]  
            data['jw_ratio'].loc[i] = idf['jw_ratio'].loc[max_jw]
            
    return(data)

### Results

In [10]:
def stats(data_parish):
    '''
    calculates and prints statistics for matches
    '''
    
    print("Observations:", data_parish.shape[0], "\n")
    
    for m in ['word_search', 'ls_search', 'jw_search']:
    
        identified = data_parish[data_parish[m] != "Not identified"].shape[0]
        not_identified = data_parish[data_parish[m] == "Not identified"].shape[0]
        percent_identified = round(identified/data_parish.shape[0],3)
        
        print(color.BOLD + str(m) + color.END)
        print("# Identified:",identified)
        print("# Not Identified:",not_identified)
        print("% Identified:",percent_identified, "\n")
    
#     ls_identified = data_parish[data_parish['ls_search'] != "Not identified"].shape[0]
#     ls_not_identified = data_parish[data_parish['ls_search'] == "Not identified"].shape[0]
#     ls_percent_identified = round(ls_identified/data_parish.shape[0],3)

#     jw_identified = data_parish[data_parish['jw_search'] != "Not identified"].shape[0]
#     jw_not_identified = data_parish[data_parish['jw_search'] == "Not identified"].shape[0]
#     jw_percent_identified = round(jw_identified/data_parish.shape[0],3)
    
    
   
#     print(color.BOLD + 'Word-Search' + color.END)
#     print("# Identified:",identified)
#     print("# Not Identified:",not_identified)
#     print("% Identified:",percent_identified, "\n")
    
#     print(color.BOLD + 'Levenshtein' + color.END)
#     print("# Identified:",ls_identified)
#     print("# Not Identified:",ls_not_identified)
#     print("% Identified:",ls_percent_identified, "\n")
    
#     print(color.BOLD + 'Jaro-Winkler' + color.END)
#     print("# Identified:",jw_identified)
#     print("# Not Identified:",jw_not_identified)
#     print("% Identified:",jw_percent_identified, "\n")

In [11]:
def result_list(data):
    '''
    lists results on the parish level
    '''
    
    # collect metrics
    word_res = data.groupby("word_search").count()['fsid'].sort_index()
    ls_res   = data.groupby("ls_search").count()['fsid'].sort_index()
    jw_res   = data.groupby("jw_search").count()['fsid'].sort_index()

    # combine metrics
    res_table = pd.concat((word_res, ls_res, jw_res), axis=1)
    res_table = pd.DataFrame(res_table)
    res_table.columns = ['String-Search', 'LS Dist', 'JW Dist']
  
    #pd.DataFrame(by_parish["ns_birthplace"].sort_values(ascending=False,))

    return(res_table)

### Parish Place Name Dictionaries

In [12]:
def parish_selecter(parish_code):
    '''
    defines and selects parish place name dictionaries
    '''
    
    # Västernorrland
    parish_dict_y = {
                        "ALNÖ"             : ["alnö", "alno"],
                        "ANUNDSJÖ"         : ["anundsjö", "anundsjo"],
                        "ARNÄS"            : ["arnäs", "arnas"],
                        "ATTMAR"           : ["attmar", "attmar"],
                        "BJÄRTRÅ"          : ["bjärtrå", "bjartra"],
                        "BJÖRNA"           : ["björna", "bjorna"],
                        "BODUM"            : ["bodum", "bodum"],
                        "BORGSJÖ"          : ["borgsjö", "borgsjo"],
                        "BOTEÅ"            : ["boteå", "botea"],
                        "DAL"              : ["dal", "dal"],
                        "ED (Y-län)"       : ["ed"],
                        "EDSELE"           : ["edsele", "edsele"],
                        "FJÄLLSJÖ (Z-län)" : ["fjällsjö", "fjallsjo"],
                        "GIDEÅ"            : ["gideå", "gidea"],
                        "GRANINGE"         : ["graninge", "graninge"],
                        "GRUNDSUNDA"       : ["grundsunda", "grundsunda"],
                        "GUDMUNDRÅ"        : ["gudmundrå", "gudmundra"],
                        "HAVERÖ"           : ["haverö", "havero"],
                        "HELGUM"           : ["helgum", "helgum"],
                        "HOLM (Y-län)"     : ["holm", "holm"],
                        "HÄGGDÅNGER"       : ["häggdånger", "haggdanger"],
                        "HÄRNÖSANDS DOMKYRKO" : ["härnösands", "harnosands","domkyrko", "domkyrko", "domkyrka"],
                        "HÄSSJÖ"           : ["hässjö", "hassjo"],
                        "HÖGSJÖ"           : ["högsjö", "hogsjo"],
                        "INDAL"            : ["indal", "indal"],
                        "JUNSELE"          : ["junsele", "junsele"],
                        "LIDEN (Y-län)"    : ["liden", "liden"],
                        "LJUSTORP"         : ["ljustorp", "ljustorp"], 
                        "LÅNGSELE"         : ["långsele", "langsele"],
                        "MULTRÅ"           : ["multrå", "multra"],
                        "NJURUNDA"         : ["njurunda", "njurunda"],
                        "NORA (Y-län)"     : ["nora", "nora"],
                        "NORDINGRÅ"        : ["nordingrå", "nordingra"],
                        "NÄTRA"            : ["nätra", "natra"],
                        "RAMSELE"          : ["ramsele", "ramsele"],
                        "RESELE"           : ["resele", "resele"],
                        "SELÅNGER"         : ["selånger", "selanger"],
                        "SIDENSJÖ"         : ["sidensjö", "sidensjo"],
                        "SJÄLEVAD"         : ["själevad", "sjalevad"],
                        "SKOG (Y-län)"     : ["skog", "skog"],
                        "SKORPED"          : ["skorped", "skorped"],
                        "SKÖN"             : ["skön", "skon"],
                        "SOLLEFTEÅ"        : ["sollefteå", "sollefteå"],
                        "STIGSJÖ"          : ["stigsjö", "stigsjo", "stigso"],
                        "STYRNÄS"          : ["styrnäs", "styrnas"],
                        "STÖDE"            : ["stöde", "stode"],
                        "SUNDSVALLS GUSTAV ADOLF" : ["sundsvall", "gustav", "gustav adolf", "sundsvall", "adolf"],
                        "SÄBRÅ"            : ["säbrå", "sabra"],
                        "SÄTTNA"           : ["sättna", "sattna"],
                        "SÅNGA (Y-län)"    : ["sånga", "sanga"],
                        "TIMRÅ"            : ["timrå", "timra"],
                        "TORP (Y-län)"     : ["torp", "torp"],
                        "TORSÅKER (Y-län)" : ["torsåker", "torsaker"],
                        "TUNA (Y-län"      : ["tuna", "tuna"],
                        "TYNDERÖ"          : ["tynderö", "tyndero"],
                        "TÅSJÖ"            : ["tåsjö", "tasjo"],
                        "ULLÅNGER"         : ["ullånger", "ullanger", "ulanger"],
                        "VIBYGGERÅ"        : ["vibyggerå", "vibyggera"],
                        "VIKSJÖ"           : ["viksjö", "viksjo"],
                        "YTTERLÄNNÄS"      : ["ytterlännäs", "ytterlannas"],
                        "ÅDALS-LIDEN"      : ["ådals-liden", "adals-liden", "ådals-liden", "adals liden"],
                        "ÖVERLÄNNÄS"       : ["överlännäs", "overlannas"]
                    }

    # Gävleborg 
    parish_dict_x = {
                        "ALFTA"            : ["alfta", "alfta"],
                        "ARBRÅ"            : ["arbrå", "arbra"],
                        "BERGSJÖ"          : ["bergsjö", "bergsjo"],
                        "BJURÅKER"         : ["bjuråker", "bjuraker"],
                        "BOLLNÄS"          : ["bollnäs", "bollnas"],
                        "DELSBO"           : ["delsbo", "delsbo"],
                        "ENÅNGER"          : ["enånger", "enanger"],
                        "FORSA (X-län)"    : ["forsa", "forsa"],
                        "FÄRILA"           : ["färila", "farila"],
                        "GNARP"            : ["gnarp", "gnarp"],
                        "GÄVLE"            : ["gävle", "gavle"],
                        "HAMRÅNGE"         : ["hamrånge", "hamrange"],
                        "HANEBO"           : ["hanebo", "hanebo"],
                        "HARMÅNGER"        : ["harmånger", "harmanger"],
                        "HASSELA"          : ["hassela", "hassela"],
                        "HEDESUNDA"        : ["hedesunda", "hedesunda"],
                        "HILLE"            : ["hille", "hille"],
                        "HUDIKSVALL"       : ["hudiksvall", "hudiksvall"],
                        "HÄLSINGTUNA"      : ["hälsingtuna", "halsingtuna"],
                        "HÖG (X-län)"      : ["hög", "hog"],
                        "IDENOR"           : ["idenor", "idenor"],
                        "ILSBO"            : ["ilsbo", "ilsbo"],
                        "JÄRVSÖ"           : ["järvsö", "jarvso"],
                        "JÄTTENDAL"        : ["jättendal", "jattendal"],
                        "LJUSDAL"          : ["ljusdal", "ljusdal"], 
                        "LOS"              : ["los"],
                        "MO (X-län)"       : ["mo"],
                        "NIANFORS"         : ["nianfors", "nianfors"],
                        "NJUTÅNGER"        : ["njutånger", "njutanger"],
                        "NORRALA"          : ["norrala", "norrala"],
                        "NORRBO"           : ["norrbo", "norrbo"],
                        "OCKELBO"          : ["ockelbo", "ockelbo"],
                        "OVANSJÖ"          : ["ovansjö", "ovansjo"],
                        "OVANÅKER"         : ["ovanåker", "ovanaker"],
                        "RENGSJÖ"          : ["rengsjö", "rengsjo"],
                        "ROGSTA"           : ["rogsta", "rogsta"],
                        "SANDVIKEN"        : ["sandviken", "sandviken"],
                        "SEGERSTA"         : ["segersta", "segersta"],
                        "SKOG (X-län)"     : ["skog", "skog"],
                        "SÖDERALA"         : ["söderala", "soderala"],
                        "SÖDERHAMN"        : ["söderhamn", "soderhamn"],
                        "TORSÅKER (X-län)" : ["torsåker", "torsaker"],
                        "TRÖNÖ"            : ["trönö", "trono"],
                        "UNDERSVIK"        : ["undersvik", "undersvik"],
                        "VALBO"            : ["valbo", "valbo"],
                        "VOXNA"            : ["voxna", "voxna"],
                        "YTTERHOGDAL"      : ["ytterhogdal", "ytterhogdal"],
                        "ÅMOT"             : ["åmot", "amot"],
                        "ÅRSUNDA"          : ["årsunda", "arsunda"],
                        "ÖSTERFÄRNEBO"     : ["österfärnebo", "osterfarnebo"]
                    }

    # Dalarna 
    parish_dict_w = {   
                        "BODA"             : ["boda", "boda"],
                        "ASPEBODA"         : ["aspeboda", "aspeboda"],
                        "AVESTA"           : ["avesta", "avesta"],
                        "BJURSÅS"          : ["bjursås", "bjursas"],
                        "BY (W-län)"       : ["by", "by"],
                        "DJURA"            : ["djura", "djura"],
                        "ENVIKEN"          : ["enviken", "enviken"],
                        "FALU KRISTINE"    : ["falu kristine", "falu"],
                        "FALU KRISTINE"    : ["falu cristine", "krsitine","floda","falun"],
                        "FOLKÄRNA"         : ["folkärna", "folkarna"],
                        "GAGNEF"           : ["gagnef", "gagnef"],
                        "GARPENBERG"       : ["garpenberg", "garpenberg"],
                        "GRANGÄRDE"        : ["grangärde", "grangarde"],
                        "GRYTNÄS"          : ["grytnäs", "grytnas"],
                        "GUSTAFS"          : ["gustafs", "gustafs"],
                        "HEDEMORA LANDS"   : ["hedemora", "hedemora"],
                        "HEDEMORA STADS"   : ["DO_NOT_SEARCH", "hedemora stads"],
                        "HOSJÖ"            : ["hosjö", "hosjo"],
                        "HUSBY (W-län)"    : ["husby", "husby"],
                        "IDRE"             : ["idre", "idre"],
                        "JÄRNA (W-län)"    : ["järna", "jarna"],
                        "LEKSAND"          : ["leksand", "leksand"],
                        "LIMA"             : ["lima", "lima"],
                        "LUDVIKA"          : ["ludvika", "ludvika"],
                        "MALINGSBO"        : ["malingsbo", "malingsbo"],
                        "MALUNG"           : ["malung", "malung"],
                        "MOCKFJÄRD"        : ["mockfjärd", "mockfjard"],
                        "MORA"             : ["mora", "mora"],
                        "NORRBÄRKE"        : ["norrbärke", "norrbarke"],
                        "NÅS"              : ["nås", "nas"],
                        "ORE"              : ["ore", "ore"],
                        "ORSA"             : ["orsa", "orsa"],
                        "RÄTTVIK"          : ["rättvik", "rattvik"],
                        "SILVBERG"         : ["silvberg", "silvberg"],
                        "SOLLERÖN"         : ["sollerön", "solleron"],
    #                   "STORA KOPPARBERG" : ["stora kopparberg", "kopparberg"],
                        "STORA SKEDVI"     : ["stora skedvi", "skedvi"],
                        "STORA TUNA"       : ["stora tuna", "tuna"],
                        "SUNDBORN"         : ["sundborn", "sundborn"],
                        "SVARTNÄS"         : ["svartnäs", "svartnas"],
                        "SVÄRDSJÖ"         : ["svärdsjö", "svardsjo"],
                        "SÄFSNÄS"          : ["säfsnäs", "safsnas"],
                        "SÄRNA"            : ["särna", "sarna"],
                        "SÄTERS LANDS"     : ["säters", "saters"],
                        "SÄTERS STADS"     : ["DO_NOT_SEARCH", "säters stads", "saters stads"],
                        "SÖDERBÄRKE"       : ["söderbärke", "soderbarke"],
                        "TORSÅNG"          : ["torsång", "torsang"],
                        "TRANSTRAND"       : ["transtrand", "transtrand"],
                        "VENJAN"           : ["venjan", "venjan"],
                        "VIKA"             : ["vika", "vika"],
                        "VÅMHUS"           : ["våmhus", "vamhus"],
                        "ÄLVDALEN"         : ["älvdalen", "alvdalen","elfdalen"],
                        "ÄPPELBO"          : ["äppelbo", "appelbo"],
                        "ÅL"               : ["ål", "al"]
                    }

    # Norrbotten
    parish_dict_bd = {  
                        "ARJEPLOG"    : ["arjeplog", "mullholm"],
                        "ARVIDSJAUR"  : ["arvidsjaur", "arvidjaur", "malå", "mala"],
                        "GÄLLIVARE"   : ["gällivare", "gallivare", "gellivare", "gllivare", "hakkas"],
                        "JOKKMOKK"    : ["jokkmokk", "jokkmokk", "jockmock", "tjäruträsk", "vuollerim", "kvikkjokk"],
                        "ÖVERKALIX"   : ["överkalix", "overkalix"],
                        "NEDERKALIX"  : ["nederkalix", "månsbyn", "mansbyn", "töre", "tore", "kalix"], 
                        "ÖVERTORNEÅ"  : ["övertorneå", "overtornea", "kuivakangas", "torneå", "tornea"],
                        "HIETANIEMI"  : ["hietaniemi", "hietaniemi", "hietamiemi"],
                        "PAJALA"      : ["pajala", "pajala", "kirnujärvi", "kirnujarvi", 
                                         "tärendö", "tarendo", "junosuando", "junosuando"],
                        "GÄLLIVARE"   : ["gällivare", "gallivare"],
                        "ÄLVSBY"      : ["älvsby", "alvsby", "elfsby", "lvsby"],
                        "LULEÅ STADS" : ["luleå", "lulea", "örnäset", "ornaset", "luleå stads", "lulea stads"],
                        "LULEÅ"       : ["boden", "boden", "alträsk", "nederluleå", "nederlulea",
                                        "överluleå", "overlulea","edefors", "edefors"],
                        "RÅNEÅ"       : ["råneå", "ranea","rne"],
                        "PITEÅ LANDS" : ["piteå", "pitea", "pite", "sjulsmark","hortlax","norrfjärden", "norrfjarden"],
                        "PITEÅ STADS" : ["piteå stads", "pitea stads"],
                        "NEDERTORNEÅ" : ["nedertorneå", "nedertornea", 'nedertorneå', "nedertorenå", "nedertoreneå", "nikkala", "keräsjoki"],
                        "NEDERTORNEÅ-HAPARANDA" : ["haparanda", "haparanda"],
                        "KARL GUSTAV (BD-län)" : ["karl", "gustav", "gustaff", "carl gustaf", "lappträsk", "karungi"],
                        "JUKKASJÄRVI" : ["jukkasjärvi", "jukkasjarvi"],
                        "KARESUANDO"  : ["karesuando", "karesuando"],
                     }    

    # Vasterbotten
    parish_dict_ac = {
                        "ARJEPLOG"   : ["arjeplog", "arjeplog", "mullholm", "silbojokk"],
                        "ARVIDSJAUR" : ["arvidsjaur", "arvidjaur", "malå", "mala"],
                        "BJURHOLM"   : ["bjurholm", "bjurholm"],
                        "BURTRÄSK"   : ["burträsk", "burtrask", "burtrsk", "kalvträsk", "kalvtrask"],
                        "BYGDEÅ"     : ["bygdeå", "bygdea", "bygde"],
                        "DOROTEA"    : ["dorotea","dorotea", "dorothea"],
    #                   "DEGEFORS"   : ["degerfors", "degerfors"],
                        "FREDRIKA"   : ["frederika", "frederika", "fredrika"],
                        "HOLMÖN"     : ["holmön", "holmon", "holmn"],
    #                   "HÖRNEFORS"  : [, "hrnefors"],
                        "LYCKSELE"   : ["lycksele", "lycksele", "örträsk", "ortrask"],
                        "LÖVÅNGER"   : ["lövånger", "lovanger", "lövanger"],
                        "NORDMALING" : ["nordmaling", "nordmaling", "normaling"],
                        "NORSJÖ"     : ["norsjö", "norsjo", "norsiön", "jörns", "jorn"],
                        "NYSÄTRA (AC-län)" : ["nysätra", "nysatra"],
                        "ROBERTSFORS": ["robertsfors", "robertsfors"],
                        "SKELLEFTEÅ" : ["skellefteå", "skelleftea", "skeleftea", "byske", "ytterstfors"],
                        "SÄVAR"      : ["sävar", "savar", "svar"],
                        "SORSELE"    : ["sorsele", "sorsele"],
                        "STENSELE"   : ["stensele", "stensele"],
                        "TÄRNA"      : ["tärna", "tarna"],
                        "UMEÅ LANDS" : ["umeå", "umea", "homlsund", "hörnefors", "hornefors", "vännäs", "vannas", "wännäs"],
                        "UMEÅ STADS" : ["umeå stads", "umea stads"],
  #                     "VÄNNÄS"     : [],
                        "VILHELMINA" : ["vilhelmina", "vilhelmina"],
                        "VINDELN"    : ["vindeln", "vindeln","degerfors"],
                        "ÅSELE"      : ["åsele", "asele", "åmsele"],
                     }
    
    # Jämtland
    parish_dict_z = {   
                        "ÅRE"           : ["åre", "are", "kläppen"],
                        "ÅS (Z-län)"    : ["ås", "as"],
                        "NÄS (Z-län)"   : ["näs ", "nas"],
                        "LIT"           : ["lit ", "lit"],
                        "ALANÄS"        : ["alanäs", "alanas", "alans"],
                        "ALSEN"         : ["alsen", "alsen"],
                        "ASPÅS"         : ["aspås", "aspas"],
                        "BERG (Z-län)"  : ["berg", "berg"],
                        "BODSJÖ"        : ["bodsjö", "bodsjo"],
                        "BORGVATTNET"   : ["borgvattnet", "borgvattnet"],
                        "BRUNFLO"       : ["brunflo", "brunflo"],
                        "BRÄCKE"        : ["bräcke", "bracke"],
                        "FORS (Z-län)"  : ["fors", "fors"],
                        "FRÖSÖ"         : ["frösö", "froso"],
                        "FÖLLINGE"      : ["föllinge", "follinge", "folinge", "folling", "hotagen", 
                                           "frostviken", "frostviken", "laxsjö", "laxsjo"],
                        "HACKÅS"        : ["hackås", "hackas", "gillhov", "gillhof"],
                        "HALLEN"        : ["hallen", "hallen", "sundsbacken"],
                        "HAMMERDAL"     : ["hammerdal", "hamerdal","gåxsjö", "gaxsjo"], #
                        "HEDE (Z-län)"  : ["hede", "hede"],
                        "HÄGGENÅS"      : ["häggenås", "haggenas", "hggens", "hagenas"],
                        "HÄLLESJÖ"      : ["hällesjö", "hallesjo", "hllesj", "halesjo"],
                        "HÅSJÖ"         : ["håsjö", "hasjo", "hsj"],
                        "KALL"          : ["kall", "kall"],
                        "KLÖVSJÖ"       : ["klövsjö","klovsjo"],
                        "KYRKÅS (Z-län)": ["kyrkås", "kyrkas", "östersund", "ostersund"],
                        "LILLHÄRDAL"    : ["lillhärdal", "lillhardal", "lillhrdal", "lilhrdal"],
                        "LINSELL"       : ["linsell", "linsell"],
                        "LJUSNEDAL"     : ["ljusnedal", "ljusnedal"],
                        "LOCKNE"        : ["lockne", "lockne"],
                        "MARBY"         : ["marby", "marby"],
                        "MARIEBY"       : ["marieby", "marieby"],
                        "MATTMAR"       : ["mattmar", "mattmar", "matmar"],
                        "MYSSJÖ"        : ["myssjö", "myssjo", "myssj"],
                        "MÖRSIL"        : ["mörsil", "morsil"],
                        "NORDERÖ"       : ["norderö", "nordero", "norder"],
                        "NÄSKOTT"       : ["näskott", "naskott", "nskott"],
                        "OFFERDAL"      : ["offerdal", "offerdal"],
                        "OVIKEN"        : ["oviken", "oviken"],
                        "RAGUNDA"       : ["ragunda", "ragunda"],
                        "REVSUND"       : ["revsund", "revsund"],
                        "RÄTAN"         : ["rätan", "ratan"],
                        "RÖDÖN"         : ["rödön", "rodon"],
                        "STORSJÖ"       : ["storsjö", "storsjo", "storsj"],
                        "STRÖM"         : ["ström", "strom"], 
                        "STUGUN"        : ["stugun", "stugun"],
                        "SUNDSJÖ"       : ["sundsjö", "sundsjo"],
                        "SUNNE (Z-län)" : ["sunne", "sunne"],
                        "SVEG"          : ["sveg", "sveg"],
                        "TÅSJÖ"         : ["tåsjö", "tasjo"], #
                        "TÄNNÄS"        : ["tännäs", "tannas", "tanas"],
                        "UNDERSÅKER"    : ["undersåker", "undersaker"],
                        "VEMDALEN"      : ["vemdalen", "vemdalen"],
                        "ÄLVROS"        : ["älvros", "alvros"] ,
                        "ÅSARNE"        : ["åsarne", "asarne"],
                        "ÖVERHOGDAL"    : ["överhogdal", "overhogdal", "verhogdal"],
                        "YTTERHOGDAL"   : ["ytterhogdal", "ytterhogdal"]
                     }
    
    if parish_code == "bd":
        parish_dict = parish_dict_bd
    if parish_code == "ac":
        parish_dict = parish_dict_ac
    if parish_code == "z":
        parish_dict = parish_dict_z
    if parish_code == "y":
        parish_dict = parish_dict_y
    if parish_code == "x":
        parish_dict = parish_dict_x
    if parish_code == "w":
        parish_dict = parish_dict_w
    
    
    return(parish_dict)
    
    

In [13]:
data = parish_data_selecter("bd", full_data)
parish_dict = parish_selecter("bd")

Unnamed: 0,fsid,ns_birthplace,word_search,ls_search,ls_ratio,jw_search,jw_ratio
0,218J-6M4,nederluleå norrbotten sweden,Not identified,Not identified,,Not identified,
1,218J-F67,nederkalix norrbotten sweden,Not identified,Not identified,,Not identified,
2,218J-XL6,nordinga norrbotten sweden,Not identified,Not identified,,Not identified,
3,218L-YYQ,kalix norrbotten sweden,Not identified,Not identified,,Not identified,
4,218V-7DH,arnäs norrbotten sweden,Not identified,Not identified,,Not identified,


(4363, 7)


In [14]:
#data_['word_search'] = "Not identified"
data = word_parser2(data, parish_dict)


In [15]:
data = edit_dist_parser(data, parish_dict, ls_thresh=0.9,jw_thresh=0.9)


In [16]:
stats(data_parish=data)


Observations: 4363 

[1mword_search[0m
# Identified: 2867
# Not Identified: 1496
% Identified: 0.657 

[1mls_search[0m
# Identified: 3100
# Not Identified: 1263
% Identified: 0.711 

[1mjw_search[0m
# Identified: 3211
# Not Identified: 1152
% Identified: 0.736 



In [17]:
#data_w.sample(10)
data[data['jw_search'] == "Not identified"].sample(10)
# relplace kopparberg with Dalarna


Unnamed: 0,fsid,ns_birthplace,word_search,ls_search,ls_ratio,jw_search,jw_ratio
413,94ZJ-L2D,sangis norrbotten sweden,Not identified,Not identified,,Not identified,
788,G3GK-L5W,svartbäcken norrbotten sweden,Not identified,Not identified,,Not identified,
624,9NWV-2ZN,sjulnäs norrbotten sweden,Not identified,Not identified,,Not identified,
113,27FF-GY4,sävast norrbotten sweden,Not identified,Not identified,,Not identified,
86,27BR-JKG,erkheikki norrbotten sweden,Not identified,Not identified,,Not identified,
651,9V3V-G9L,vittangi norrbotten sverige,Not identified,Not identified,,Not identified,
4331,MWGY-MXL,miekojärvi norrbotten svezia sweden,Not identified,Not identified,,Not identified,
4,218V-7DH,arnäs norrbotten sweden,Not identified,Not identified,,Not identified,
674,9VCQ-G22,fanbyn stöde vastnorrland sweden,Not identified,Not identified,,Not identified,
3763,LT18-83V,roknäs 1 norrbotten sweden,Not identified,Not identified,,Not identified,


In [18]:
result_list(data).sort_values(by="String-Search", ascending=False)

Unnamed: 0,String-Search,LS Dist,JW Dist
Not identified,1496,1263,1152
PITEÅ LANDS,696,705,724
LULEÅ,400,447,467
NEDERKALIX,278,284,290
ARVIDSJAUR,210,215,220
RÅNEÅ,165,169,179
PAJALA,151,156,159
ÖVERTORNEÅ,151,158,162
ÖVERKALIX,119,145,150
LULEÅ STADS,117,115,124


In [19]:
#data.to_csv("./data/ac/data_ac.csv", index=False)
