In [2]:
from ast import literal_eval
import numpy as np
import math
import re
import pandas as pd

import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

from difflib import SequenceMatcher
from collections import Counter
from itertools import groupby

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support
from fuzzywuzzy import fuzz

pd.set_option('display.max_columns', 1000)

In [12]:
#read in data
# df = pd.read_csv('bo_attr_lookup_data_new.csv', encoding='latin1')
df = pd.read_csv('bo_attr_lookup_new.csv', encoding='latin1')

#create spare child col
df['children_org_name'] = df['children_org_name_list'].agg(list)

#conver child col to list
df['children_org_name_list'] = df['children_org_name_list'].agg(list)

#fill missing values
df = df.fillna('missingmissingmissingmissingmissing')

#trigram function
def split_str(s, N):
    rmax = np.clip(len(s)-N, a_min=0, a_max=None)
    return [s[0+i:N+i] for i in range(0, rmax+1)]

#trigrams
df['parent_trigram'] = df['parent_org_name'].apply(lambda x: split_str(x, 3))
df['child_trigram'] = df['children_org_name_list'].apply(lambda x: split_str(x, 3))

#convert to list
df['children_org_name_list'] = df.children_org_name_list.map(lambda x: [x])

# #jaro score
# df['jaro_func_score'] = df[['parent_org_name', 'children_org_name_list']].agg(lambda x: get_top_matches(*x), axis=1)
# df['jaro_func_score'] = df.jaro_func_score.apply(lambda x: [val[1] for val in x])
# df['jaro_func_score'] = df.jaro_func_score.apply(lambda x: ', '.join([str(i) for i in x])).astype(float)

# #parent to child
# df['jaro_func_score_trigram_parent'] = df[['parent_org_name', 'child_trigram']].agg(lambda x: get_top_matches(*x), axis=1)
# df['jaro_func_score_trigram_parent'] = df.jaro_func_score_trigram_parent.apply(lambda x: [val[1] for val in x])

# #child to parent
# df['jaro_func_score_trigram_child'] = df[['children_org_name', 'parent_trigram']].agg(lambda x: get_top_matches(*x), axis=1)
# df['jaro_func_score_trigram_child'] = df.jaro_func_score_trigram_child.apply(lambda x: [val[1] for val in x])

df.head(5)

Unnamed: 0,parent_org_name,children_org_name_list,match_confidence_score,match_confidence,children_org_name,parent_trigram,child_trigram
0,"""NEXARK INC""",[NEWKIRK PRODUCTS],0.533333,LOW,NEWKIRK PRODUCTS,"[""NE, NEX, EXA, XAR, ARK, RK , K I, IN, INC, ...","[NEW, EWK, WKI, KIR, IRK, RK , K P, PR, PRO, ..."
1,+DESCONOCIDO,[DEGENKOLB],0.666667,LOW,DEGENKOLB,"[+DE, DES, ESC, SCO, CON, ONO, NOC, OCI, CID, ...","[DEG, EGE, GEN, ENK, NKO, KOL, OLB]"
2,+EMBOL LTDA,[EMPLOYBRIDGE],0.6,LOW,EMPLOYBRIDGE,"[+EM, EMB, MBO, BOL, OL , L L, LT, LTD, TDA]","[EMP, MPL, PLO, LOY, OYB, YBR, BRI, RID, IDG, ..."
3,+MARTINS,[SAN MARINA],0.75,MEDIUM,SAN MARINA,"[+MA, MAR, ART, RTI, TIN, INS]","[SAN, AN , N M, MA, MAR, ARI, RIN, INA]"
4,+RED COMUNICACAO & MARKETING LTDA,[RED ENVELOPE ENTERTAINMENT],0.625,LOW,RED ENVELOPE ENTERTAINMENT,"[+RE, RED, ED , D C, CO, COM, OMU, MUN, UNI, ...","[RED, ED , D E, EN, ENV, NVE, VEL, ELO, LOP, ..."


In [13]:
all_names = df['children_org_name'].unique()
names_freq = Counter()
for name in all_names:
    names_freq.update(str(name).split(" "))
key_words = [word for (word,_) in names_freq.most_common(30)]
print(len(all_names))
print(key_words)

9110
['OF', 'UNIVERSITY', 'COLLEGE', '&', 'GROUP', 'SERVICES', 'AND', 'DE', 'DOI', 'CORPORATION', 'HEALTH', 'COMMUNITY', 'TECHNOLOGY', 'THE', 'INSURANCE', 'MEDIA', 'INTERNATIONAL', 'INSTITUTE', 'STATE', 'COMPANY', 'SYSTEMS', 'ENERGY', 'NATIONAL', 'US', 'CENTER', 'TECHNOLOGIES', 'BANK', 'HOLDINGS', 'FINANCIAL', 'MEDICAL']


In [14]:
all_main_name = pd.DataFrame(columns=['sort_gp','names','alias','score'])
all_names.sort()
all_main_name['names'] = all_names
all_main_name['sort_gp'] = all_main_name['names'].apply(lambda x: x[0])
all_main_name

Unnamed: 0,sort_gp,names,alias,score
0,1,1&1 IONOS,,
1,1,1-800 SERVICE PARTNERS,,
2,1,171263 CANADA,,
3,2,20 MINUTES,,
4,2,21ST CENTURY CASUALTY COMPANY,,
...,...,...,...,...
9105,Z,ZONEPERFECT,,
9106,Z,ZOOMBAK,,
9107,Z,ZURICH SEGUROS,,
9108,Z,ZWECKVERBAND KDN - DACHVERBAND KOMMUNALER IT-D...,,


In [22]:
all_sort_gp = all_main_name['sort_gp'].unique()

def no_key_word(name):
    """check if the name contain the keywords in travel company"""
    output = True
    for key in key_words:
        if key in name:
            output = False
    return output

for sortgp in all_sort_gp:
    this_gp = all_main_name.groupby(['sort_gp']).get_group(sortgp)
    gp_start = this_gp.index.min()
    gp_end = this_gp.index.max()
    for i in range(gp_start,gp_end+1):
    
        # if self has not got alias, asign to be alias of itself
        if pd.isna(all_main_name['alias'].iloc[i]):
            all_main_name['alias'].iloc[i] = all_main_name['names'].iloc[i]
            all_main_name['score'].iloc[i] = 100
        
        # if the following has not got alias and fuzzy match, asign to be alias of this one
        for j in range(i+1,gp_end+1):
            if pd.isna(all_main_name['alias'].iloc[j]):
                fuzz_socre = fuzz.token_sort_ratio(all_main_name['names'].iloc[i],all_main_name['names'].iloc[j])
                if not no_key_word(all_main_name['names'].iloc[j]):
                    fuzz_socre -= 10
                if (fuzz_socre > 85):
                    all_main_name['alias'].iloc[j] = all_main_name['alias'].iloc[i]
                    all_main_name['score'].iloc[j] = fuzz_socre
                    
        if i % (len(all_names)//10) == 0:
            print("progress: %.2f" % (100*i/len(all_names)) + "%")
                
all_main_name.to_csv('org_name_results_fuzzywuzzy.csv')

progress: 0.00%
progress: 10.00%
progress: 20.00%
progress: 30.00%
progress: 40.00%
progress: 50.00%
progress: 60.00%
progress: 70.00%
progress: 80.00%
progress: 90.00%


In [24]:
all_main_name[(all_main_name['names']!=all_main_name['alias']) & (all_main_name['alias'].notna())]

Unnamed: 0,sort_gp,names,alias,score
185,A,AFD INCORPORATED,ACS INCORPORATED,88
280,A,ALD AUTOMOTIVE,ADT AUTOMOTIVE,93
504,A,APHC INCORPORATED,ACS INCORPORATED,91
551,A,ARD INCORPORATED,ACS INCORPORATED,88
554,A,ARENA PHARMACEUTICALS,ANACOR PHARMACEUTICALS,88
583,A,ARROW PHARMACEUTICALS,ARAGON PHARMACEUTICALS,88
636,A,ASPECTA VERSICHERUNG,ASPECTA LEBENSVERSICHERUNG,87
746,A,AVEO PHARMACEUTICALS,ANACOR PHARMACEUTICALS,86
1212,B,BT AMERICAS,BOAT AMERICA,87
1756,C,COGEMAG,COGEMA,92


In [6]:
#jaro version
def sort_token_alphabetically(word):
    token = re.split('[,. ]', word)
    sorted_token = sorted(token)
    return ' '.join(sorted_token)

def get_jaro_distance(first, second, winkler=True, winkler_ajustment=True,
                      scaling=0.1, sort_tokens=True):
    """
    :param first: word to calculate distance for
    :param second: word to calculate distance with
    :param winkler: same as winkler_ajustment
    :param winkler_ajustment: add an adjustment factor to the Jaro of the distance
    :param scaling: scaling factor for the Winkler adjustment
    :return: Jaro distance adjusted (or not)
    """
    if sort_tokens:
        first = sort_token_alphabetically(first)
        second = sort_token_alphabetically(second)

    if not first or not second:
        raise JaroDistanceException(
            "Cannot calculate distance from NoneType ({0}, {1})".format(
                first.__class__.__name__,
                second.__class__.__name__))

    jaro = _score(first, second)
    cl = min(len(_get_prefix(first, second)), 4)

    if all([winkler, winkler_ajustment]):  # 0.1 as scaling factor
        return round((jaro + (scaling * cl * (1.0 - jaro))) * 100.0) / 100.0

    return jaro

def _score(first, second):
    shorter, longer = first.lower(), second.lower()

    if len(first) > len(second):
        longer, shorter = shorter, longer

    m1 = _get_matching_characters(shorter, longer)
    m2 = _get_matching_characters(longer, shorter)

    if len(m1) == 0 or len(m2) == 0:
        return 0.0

    return (float(len(m1)) / len(shorter) +
            float(len(m2)) / len(longer) +
            float(len(m1) - _transpositions(m1, m2)) / len(m1)) / 3.0

def _get_diff_index(first, second):
    if first == second:
        pass

    if not first or not second:
        return 0

    max_len = min(len(first), len(second))
    for i in range(0, max_len):
        if not first[i] == second[i]:
            return i

    return max_len

def _get_prefix(first, second):
    if not first or not second:
        return ""

    index = _get_diff_index(first, second)
    if index == -1:
        return first

    elif index == 0:
        return ""

    else:
        return first[0:index]

def _get_matching_characters(first, second):
    common = []
    limit = math.floor(min(len(first), len(second)) / 2)

    for i, l in enumerate(first):
        left, right = int(max(0, i - limit)), int(
            min(i + limit + 1, len(second)))
        if l in second[left:right]:
            common.append(l)
            second = second[0:second.index(l)] + '*' + second[
                                                       second.index(l) + 1:]

    return ''.join(common)

def _transpositions(first, second):
    return math.floor(
        len([(f, s) for f, s in zip(first, second) if not f == s]) / 2.0)

def get_top_matches(reference, value_list, max_results=None):
    scores = []
    if not max_results:
        max_results = len(value_list)
    for val in value_list:
        score_sorted = get_jaro_distance(reference, val)
        score_unsorted = get_jaro_distance(reference, val, sort_tokens=False)
        scores.append((val, max(score_sorted, score_unsorted)))
    scores.sort(key=lambda x: x[1], reverse=True)

    return scores[:max_results]

class JaroDistanceException(Exception):
    def __init__(self, message):
        super(Exception, self).__init__(message)

In [19]:
def clean_special_characters(txt):
    seps = [" ",";",":",".",",","*","#","@","|","/","\\","-","_","?","%","!","^","(",")"]
    default_sep = seps[0]

def clean_stopword(txt):
    temp_list = txt.split(" ")
    temp_list = [i for i in temp_list if i not in stopwords]
    return " ".join(temp_list)

def data_cleaning(data, nameCol='children_org_name', dropForeign = True):
    data.dropna(subset=[nameCol], inplace=True)
    data = data.rename_axis('CompanyID').reset_index()
    data['nonAscii_count'] = data[nameCol].apply(lambda x: sum([not c.isascii() for c in x]))
    if dropForeign:
        data = data[data.nonAscii_count==0]
    else:
        pass
    data.drop('nonAscii_count', axis=1, inplace=True)
    data_clean = data.copy()
    data_clean['CompanyName_clean'] = data_clean[nameCol].apply(lambda x: x.lower())
    data_clean['CompanyName_clean'] = data_clean['CompanyName_clean'].apply(clean_special_characters)
    data_clean['CompanyName_clean'] = data_clean['CompanyName_clean'].apply(clean_stopword)
    return data_clean


In [20]:
def fuzz_similarity(comp_names):
    similarity_array = np.ones((len(comp_names), (len(comp_names))))*100
    
    for i in range(1,len(comp_names)):
        for j in range(i):
            s1 = fuzz.token_set_ratio(comp_names[i],comp_names[j]) + 0.000000000001
            s2 = fuzz.partial_ratio(comp_names[i],comp_names[j]) + 0.000000000001
            similarity_array[i][j] = 2*s1*s2 / (s1+s2)
    for i in range(len(comp_names)):
        for j in range(i+1,len(comp_names)):
            similarity_array[i][j] = similarity_array[j][i]
            
    np.fill_diagonal(similarity_array, 100)
    return similarity_array

def company_clusters(data, nameCol='children_org_name', dropForeign=True):
    data_clean = data_cleaning(data, nameCol=nameCol, dropForeign=dropForeign)
    comp_names = data_clean.CompanyName_clean.to_list()
    cust_ids = data_clean.CompanyID.to_list()
    
    similarity_array = fuzz_similarity(comp_names)
    clusters = cluster.AffinityPropagation(affinity='precomputed').fit_predict(similarity_array)
    df_clusters = pd.DataFrame(list(zip(cust_ids, clusters)), columns=['CompanyID','clusters'])
    
    df_eval = df_clusters.merge(data_clean, on='CompanyID',how='left')
    return df_eval

def standard_name(df_eval):
    d_standard_name = {}
    for cluster in df_eval.cluster.unique():
        names = df_eval[df_eval['cluster']==cluster].CompanyName_clean.to_list()
        l_common_substring = []
        if len(names)>1:
            for i in range(0,len(names)):
                for j in range(i+1,len(names)):
                    seqMatch = SequenceMatcher(None,names[i],names[j])
                    match = seqMatch.find_longest_match(0, len(names[i]), 0, len(names[j]))
                    if (match.size!=0):
                        l_common_substring.append(names[i][match.a: match.a + match.size].strip())
            n= len(l_common_substring)
            counts = Counter(l_common_substring)
            get_mode = dict(counts)
            mode = [k for k, v in get_mode.items() if v == max(list(counts.values()))]
            d_standard_name[cluster] = ";".join(mode)
            
        else:
            d_standard_name[cluster] = names[0]

    df_standard_names = pd.DataFrame(list(d_standard_name.items()), columns=['clusters','StandardName'])
    df_eval = df_eval.merge(df_standard_names, on='cluster', how = 'left')
    df_eval['standard_name_withoutSpaces'] = df_eval.StandardName.apply(lambda x: x.replace(" ",""))
    for name in df_eval.standard_name_withoutSpaces.unique():
        if len(df_eval[df_eval.standard_name_withoutSpaces==name].cluster.unique()) >1:
            df_eval.loc[df_eval.standard_name_withoutSpsaces==name,'StandardName'] = name
            
    return df_eval.drop('standard_name_withoutSpaces', axis=1)


In [21]:
standard_name(df)

AttributeError: 'DataFrame' object has no attribute 'cluster'