In [1]:
import pandas as pd
import unidecode

# !pip install fuzzywuzzy
from fuzzywuzzy import fuzz
import re
import unidecode





## 1. Merge past donors dataset (K4K sources) with donor organizations (PDF source)
We will use the company name in both sources to try and match companies. We will use fuzzy matching so we can detect cases with similar strings (its likely names will have different versions in the sources)

In [81]:
df_donors = pd.read_pickle('../Data/past_donors_clean.pickle')
df_orgs = pd.read_excel('../Data/Organizations_stg.xlsx')

In [2]:
def clean_col_names(df_):

    df = df_.copy()
    cols = df.columns.to_list()
    new_cols=[]
    for col in cols:
        
        col = re.sub(' +', ' ', col)
        col = col.replace(' ','_')
        col = col.replace('/','')
        col = col.replace('$','dollar')
        col = col.upper()
        new_cols.append(col)

    df.columns = new_cols

    return df

In [83]:
df_donors = clean_col_names(df_donors)
df_orgs = clean_col_names(df_orgs)

In [3]:

def stdr_names(series_original):
    '''Clean company names. To be used in the different files so names are likelier to be matched'''
    series = series_original.copy()
    series = series.astype(str)
    
    series = series.str.upper()
    series = series.str.strip()

    series = series.replace(r'\s+', ' ', regex=True)
    series = series.str.replace(r'[^\w\s]+', '', regex=True)


    series = series.apply(lambda x: unidecode.unidecode(x))

    series = series.str.replace(' QUEBEC ', '')
    series = series.str.replace('CANADA', '')
    series = series.str.replace(' MONTREAL ', '')
    series = series.str.replace('MONTREAL ', '')
    series = series.str.replace(' MONTREAL', '')
    series = series.str.replace(' INC', '')
    series = series.str.replace(' INC ', '')
    series = series.str.replace(' CIE ', '')
    series = series.str.replace(' CIE', '')
    series = series.str.replace(' LTEE', '')
    series = series.str.replace('CORPORATION', '')
    series = series.str.replace('INTERNATIONAL', '')


    return series


In [86]:
df_donors = df_donors[ df_donors.COMPANY.notna() ]

In [87]:
df_orgs.NAME = stdr_names(df_orgs.NAME)

df_donors.COMPANY = df_donors.COMPANY.astype(str)
df_donors.COMPANY  = stdr_names(df_donors.COMPANY)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [88]:
pd.merge(df_orgs.NAME.drop_duplicates(), df_donors.COMPANY.drop_duplicates(), left_on='NAME', right_on='COMPANY')

# 18 matches by exact matching

Unnamed: 0,NAME,COMPANY
0,ARCHAMBAULT,ARCHAMBAULT
1,AVERNA,AVERNA
2,CANADIAN TIRE,CANADIAN TIRE
3,CENTRES DENTAIRES LAPOINTE,CENTRES DENTAIRES LAPOINTE
4,CGI,CGI
5,CORBEC,CORBEC
6,DORFIN,DORFIN
7,ENERGIE CARDIO,ENERGIE CARDIO
8,FIERA CAPITAL,FIERA CAPITAL
9,FUTURE SHOP,FUTURE SHOP


In [10]:
# https://www.datacamp.com/tutorial/fuzzy-string-python


# this for-loop calculates 4 kinds of similarity score, which we will then use to sort the table and manually review the most similar matches to confirm (based on K4K feedback)

def match_comp_names(orgs, donors):

    # orgs = orgs.NAME.drop_duplicates().to_list()
    # donors = donors.company.drop_duplicates().to_list()

    d = 0
    j = 0
    rows = []

    for org in orgs:
        for donor in donors:
            Str1 = org
            Str2 = donor
            Ratio = fuzz.ratio(Str1.lower(),Str2.lower())
            Partial_Ratio = fuzz.partial_ratio(Str1.lower(),Str2.lower())
            Token_Sort_Ratio = fuzz.token_sort_ratio(Str1,Str2)
            Token_Set_Ratio = fuzz.token_set_ratio(Str1,Str2)

            row = [org, donor, Ratio, Partial_Ratio, Token_Sort_Ratio, Token_Set_Ratio]

            rows.append(row)
    
    return rows

In [11]:
rows = match_comp_names(df_orgs.NAME.drop_duplicates().to_list(), df_donors.COMPANY.drop_duplicates().to_list())
df_sim = pd.DataFrame(rows, columns = ['org', 'donor', 'Ratio', 'Partial_Ratio', 'Token_Sort_Ratio', 'Token_Set_Ratio'] )
df_sim.to_pickle('similarity_score.pkl')


KeyboardInterrupt: 

In [10]:
score = 'Ratio'
df_ratios = df_sim.groupby('org', as_index=False)[score].max()
#df_ratios.merge(df_sim[['org', 'donor', score]], how = 'inner').sort_values(score, ascending = False).head(30)

In [11]:
score = 'Token_Set_Ratio'

df_ratios = df_sim.groupby('donor', as_index=False)[score].max()
#df_token_set_ratio = df_ratios.merge(df_sim[['org', 'donor', score]], how = 'inner').sort_values(score, ascending = False)

In [12]:
score = 'Partial_Ratio'
#df_ratios = df_sim.groupby('org', as_index=False)[score].max()
#df_ratios.merge(df_sim[['org', 'donor', score]], how = 'inner').sort_values(score, ascending = False).head(20)

In [13]:


score = 'Token_Sort_Ratio'

df_ratios = df_sim.groupby('org', as_index=False)[score].max()
df_ratios.merge(df_sim[['org', 'donor', score]], how = 'inner').sort_values(score, ascending = False).head(20)

Unnamed: 0,org,Token_Sort_Ratio,donor
1317,KRUGER,100,KRUGER
846,CORBEC,100,CORBEC
1772,ROYAL LEPAGE,100,ROYAL LEPAGE
1329,LABORATOIRES CHARLES RIVER,100,LABORATOIRES CHARLES RIVER
881,DELMAR,100,DELMAR
1061,FUTURE SHOP,100,FUTURE SHOP
1278,JEAN COUTU,100,JEAN COUTU
613,BDO,100,BDO
733,CANADIAN TIRE,100,CANADIAN TIRE
1832,SNC LAVALIN,100,SNC LAVALIN


In [14]:
# we sort the values and export them to CSV so we can review in excel

th = 70
df_filt = df_sim.query(f''' Ratio > {th} |  Token_Sort_Ratio > {th}  | Token_Set_Ratio > {th}  ''' ).reset_index(drop=True)
df_filt.shape
df_filt.sort_values(['Token_Set_Ratio', 'Ratio'], ascending=False).to_clipboard(index=None)

## 2. Confirm matches based on reviewed excel
After manually confirming the fuzzy matching scores, we will produces the final merged dataset

In [4]:
import pandas as pd
import unidecode
import re

def clean_col_names(df_):

    df = df_.copy()
    cols = df.columns.to_list()
    new_cols=[]
    for col in cols:
        
        col = re.sub(' +', ' ', col)
        col = col.replace(' ','_')
        col = col.replace('/','')
        col = col.replace('$','dollar')
        col = col.upper()
        new_cols.append(col)

    df.columns = new_cols

    return df
    
def stdr_names(series_original):
    '''Clean company names. To be used in the different files so names are likelier to be matched'''
    series = series_original.copy()
    series = series.astype(str)
    
    series = series.str.upper()
    series = series.str.strip()

    series = series.replace(r'\s+', ' ', regex=True)
    series = series.str.replace(r'[^\w\s]+', '', regex=True)


    series = series.apply(lambda x: unidecode.unidecode(x))

    series = series.str.replace(' QUEBEC ', '')
    series = series.str.replace('CANADA', '')
    series = series.str.replace(' MONTREAL ', '')
    series = series.str.replace('MONTREAL ', '')
    series = series.str.replace(' MONTREAL', '')
    series = series.str.replace(' INC', '')
    series = series.str.replace(' INC ', '')
    series = series.str.replace(' CIE ', '')
    series = series.str.replace(' CIE', '')
    series = series.str.replace(' LTEE', '')
    series = series.str.replace('CORPORATION', '')
    series = series.str.replace('INTERNATIONAL', '')


    return series


In [5]:
df_donors = pd.read_csv('../Data/past_donors_clean.csv', parse_dates=[1])
df_orgs = pd.read_excel('../Data/Organizations_stg.xlsx')

df_donors=clean_col_names(df_donors)
df_orgs=clean_col_names(df_orgs)

# now we read the matches after reviews them and this will become our key to match both sources
df_matches = pd.read_excel('../Data/matched_orgs_k4kreview.xlsx')

df_donors.COMPANY = stdr_names(df_donors.COMPANY)
df_orgs.NAME = stdr_names(df_orgs.NAME)

In [6]:
df_matches = df_matches[df_matches.Match == 1] # keeping only matches
df_matches = df_matches.iloc[:, 0:2]
df_matches.columns = ['ORGANIZATION', 'DONOR']

In [7]:
# some past donors have been matched with more than 1 different company in the PDF source, so we need to select and keep only one so the analysis is consistent.
donor_q = df_matches.DONOR.value_counts()
organization_q = df_matches.ORGANIZATION.value_counts()
df1 = df_matches[df_matches.DONOR.isin(donor_q[donor_q > 1].index)].sort_values('DONOR').reset_index(drop=True)

In [8]:

ixs = [1, 2,5,6,10,11,15,16,18, 21,22] 
df1 = df1.iloc[ixs,:].reset_index(drop=True)

In [9]:
# now we concatenate the matches into a single df
df_matches = pd.concat([ df_matches[df_matches.DONOR.isin(donor_q[donor_q <= 1].index)].sort_values('DONOR').reset_index(drop=True),
            df1 ] )

In [10]:
df_donors_match = df_donors.merge(df_matches, how = 'inner', left_on = 'COMPANY', right_on = 'DONOR')

In [11]:
df_donors_match = df_donors_match.merge(df_orgs, how = 'inner', left_on = 'ORGANIZATION', right_on = 'NAME')

In [12]:
df_donors_match = df_donors_match.drop_duplicates().reset_index(drop=True)


In [13]:
df_donors_match.shape

# after matching past donors and organizations PDF, we have 177 rows of data and 32 columns. We should add additiional attributes of the companies that have foundations

(177, 60)

## 3. Adding foundation attributes

In [14]:
df_fonds = pd.read_excel('../Data/Foundations_stg.xlsx')

df_fonds = clean_col_names(df_fonds)

## add foundations dataset attributes

In [15]:
df = df_donors_match.merge(df_fonds, how='left', on='ID', suffixes=('', '_FND'))
df.shape

(177, 85)

obviously, most companies will have null values in their foundation attributes, but we can still keep the info for the ones that do

In [18]:
df.to_pickle('../Data/df_matches.pickle')

In [17]:
df.columns

Index(['COMPANY', 'ADDED_MODIFIED', 'DOLLAR_AMOUNT_FLOAT',
       'MONEYPRIZE_RECEIVED', 'DONATION_DETAILS', 'YEAR', 'DON_DETAIL_AMOUNT',
       'DOLLAR_EQUIVALENT_AMOUNT', 'DON_DETAIL_TXT', 'ORGANIZATION', 'DONOR',
       'ID', '2E_CONTACT_POUR', 'ADDRESS', 'AVIS', 'CONTACT', 'CONTRIBUTION',
       'COURRIEL', 'DDD', 'DOMAINE_DINTERET', 'FAF', 'FAX', 'FILIALE_DE',
       'ISFOUNDATION', 'LANGUE', 'LIMITES_GEOG', 'N_DE_TEL', 'NAME',
       'NBRE_DE_SUCC', 'NOMBRE_DEMPLOYES', 'NOTE', 'POSTE', 'PRINCIP_FILIALES',
       'SECTEUR_INDUSTRIEL', 'SITE_WEB', 'MUNICIPALITY', 'PROVINCE',
       'POSTALCODE', 'STREET', 'SECTOR_FABRICATION', 'SECTOR_FINANCIERS',
       'SECTOR_COMMERCE', 'SECTOR_CONSTRUCTION', 'SECTOR_TRANSPORT',
       'SECTOR_GESTION', 'SECTOR_SYSTEMS', 'SECTOR_ALIMENTATION',
       'SECTOR_SANTE_ASSURANCE', 'SECTOR_OTHER', 'GENERAL_CHARITY',
       'HEALTH_ARTS_CULTURE', 'COMMUNITY_ENVIRONMENTAL',
       'YOUTH_HEALTH_EDUCATION', 'REGIONAL_SPECIFIC',
       'CANCER_DISEASE_RES