In [51]:
import pandas as pd
import re
import numpy as np
import gc
from fuzzywuzzy import fuzz

In [52]:
# Contribution records; parsed with read_csv's default separator.
df_contribuion = pd.read_csv('data/contribution_processed.csv')

# Company listing (LinkedIn export, per the file name); this file is
# semicolon-delimited, hence the explicit separator.
df_companies = pd.read_csv(
    'data/ma-companies-on-linkedin.csv',
    sep=';',
)

In [53]:
# Distinct raw values of the 'Company name' column.
companies_list = df_companies.loc[:, 'Company name'].unique()

In [54]:
# Distinct raw values of the 'Employer' column.
employer_list = df_contribuion.loc[:, 'Employer'].unique()

In [55]:
employer_list.shape

(17338,)

In [56]:
companies_list.shape

(67306,)

In [57]:
def clean_text(text):
    '''
    Normalise a value into a lowercase, letters-only string.

    The input is coerced with str() and lower-cased, then every run of
    characters outside A-Z/a-z (digits, whitespace, punctuation, non-ASCII
    letters) is removed.

    Because of the str() coercion, non-string inputs are stringified before
    filtering: a float NaN comes back as 'nan' and None as 'none'.
    '''
    lowered = str(text).lower()
    letters_only = re.sub('[^A-Za-z]+', '', lowered)
    return letters_only

In [58]:
# Cleaned (letters-only, lowercase) version of every distinct company name.
companies_list_clean = np.array(list(map(clean_text, companies_list)))

In [59]:
# Cleaned (letters-only, lowercase) version of every distinct employer name.
employer_list_clean = np.array(list(map(clean_text, employer_list)))

In [37]:
# Matching: fuzzy-match cleaned employer names against cleaned company names

In [81]:
# Cleaned employer names that fuzzy-match at least one cleaned company name.
match = []

# Convert both arrays to object dtype before the pairwise comparison
# (presumably so fuzz.ratio receives plain Python str values -- TODO confirm).
companies_list_clean = companies_list_clean.astype('O')
employer_list_clean = employer_list_clean.astype('O')

for employer in employer_list_clean:
    # An employer counts as matched when fuzz.ratio reaches 85 against any
    # company. any() short-circuits on the first hit, so the remaining
    # companies are not scored and the employer is appended exactly once
    # (previously it was appended once per matching company).
    if any(fuzz.ratio(employer, company) >= 85 for company in companies_list_clean):
        match.append(employer)

In [83]:
# Collapse the match list so each cleaned employer name appears once.
match_set = set()
match_set.update(match)

In [85]:
len(match_set)

5337

In [87]:
match_set

{'obrienassociates',
 'fatherbillsmainspringhouse',
 'schawbeltechnologiesllc',
 'westfieldbank',
 'messingerinsurance',
 'bristolcountysavingsbank',
 'unitedmethodistchurch',
 'harvardbuisnessschool',
 'jrichsolutions',
 'bostonportfolioadvisers',
 'commwealthofmass',
 'finepointcapital',
 'bioenergyinternational',
 'corcoranandjennison',
 'realestate',
 'maaps',
 'aacconsulting',
 'vercpartnership',
 'bluehillcemetery',
 'bertucis',
 'masslegalassistancecorporation',
 'stbernadetteschool',
 'gbelectricalservices',
 'tarlowbreedhartrodgers',
 'lawrencecommunityworks',
 'confluentsurgical',
 'greaterhaverhillchamberofcommerce',
 'wayfair',
 'independentcontractor',
 'tremblaybus',
 'harvarduniversity',
 'nixonpeabody',
 'careerpoint',
 'prodevelopmentgroup',
 'borislowinsurance',
 'agency',
 'berkshirehealthsystems',
 'kevinpmartinassociates',
 'vesonnautical',
 'dearbornacademy',
 'haaphousing',
 'eastforkgroupllc',
 'brocktonneighborhoodhealthcenter',
 'uaspire',
 'takedapharmaceutic

In [88]:
# Per-row cleaned employer name, used as the lookup key for the matches.
df_contribuion['Employer_clean'] = df_contribuion['Employer'].map(clean_text)

In [93]:
# Per-row cleaned company name, comparable with the cleaned employer names.
df_companies['Company_clean'] = df_companies['Company name'].map(clean_text)

In [89]:
def company_search(name):
    '''Return True when `name` is a member of the module-level match_set, else False.'''
    is_matched = name in match_set
    return is_matched

In [90]:
# Boolean flag per row: True where the cleaned employer name is in match_set.
df_contribuion['Industry_new'] = df_contribuion['Employer_clean'].map(company_search)

In [92]:
df_contribuion['Industry_new'].value_counts()

False    95910
True     22551
Name: Industry_new, dtype: int64

In [97]:
# New trial: map each matched employer to a company, then to that company's industry

In [98]:
match_set

{'obrienassociates',
 'fatherbillsmainspringhouse',
 'schawbeltechnologiesllc',
 'westfieldbank',
 'messingerinsurance',
 'bristolcountysavingsbank',
 'unitedmethodistchurch',
 'harvardbuisnessschool',
 'jrichsolutions',
 'bostonportfolioadvisers',
 'commwealthofmass',
 'finepointcapital',
 'bioenergyinternational',
 'corcoranandjennison',
 'realestate',
 'maaps',
 'aacconsulting',
 'vercpartnership',
 'bluehillcemetery',
 'bertucis',
 'masslegalassistancecorporation',
 'stbernadetteschool',
 'gbelectricalservices',
 'tarlowbreedhartrodgers',
 'lawrencecommunityworks',
 'confluentsurgical',
 'greaterhaverhillchamberofcommerce',
 'wayfair',
 'independentcontractor',
 'tremblaybus',
 'harvarduniversity',
 'nixonpeabody',
 'careerpoint',
 'prodevelopmentgroup',
 'borislowinsurance',
 'agency',
 'berkshirehealthsystems',
 'kevinpmartinassociates',
 'vesonnautical',
 'dearbornacademy',
 'haaphousing',
 'eastforkgroupllc',
 'brocktonneighborhoodhealthcenter',
 'uaspire',
 'takedapharmaceutic

In [102]:
len(match_set)

5337

In [116]:
# Build a cleaned-company-name -> Industry lookup in a single pass instead of
# re-filtering the whole df_companies frame for every matched name.
# setdefault keeps the first row seen for each cleaned name, which is the same
# row the previous `.values[0]` lookup selected.
industry_by_company_clean = {}
for company_clean, company_industry in zip(df_companies['Company_clean'],
                                           df_companies['Industry']):
    industry_by_company_clean.setdefault(company_clean, company_industry)

# Industry for every matched employer name that is *exactly* equal to some
# cleaned company name; names that only matched fuzzily are skipped.
match_ind = {
    com: industry_by_company_clean[com]
    for com in match_set
    if com in industry_by_company_clean
}

In [118]:
len(match_ind)

2088

In [119]:
match_ind

{'obrienassociates': 'financial services',
 'westfieldbank': 'banking',
 'bristolcountysavingsbank': 'banking',
 'bostonportfolioadvisers': 'investment management',
 'realestate': 'real estate',
 'bluehillcemetery': 'individual & family services',
 'confluentsurgical': 'medical devices',
 'greaterhaverhillchamberofcommerce': 'civic & social organization',
 'wayfair': 'internet',
 'independentcontractor': 'non-profit organization management',
 'harvarduniversity': 'higher education',
 'careerpoint': 'staffing and recruiting',
 'borislowinsurance': 'insurance',
 'agency': 'marketing and advertising',
 'berkshirehealthsystems': 'hospital & health care',
 'vesonnautical': 'computer software',
 'dearbornacademy': 'primary/secondary education',
 'brocktonneighborhoodhealthcenter': 'medical practice',
 'uaspire': 'higher education',
 'rismaninsurance': 'insurance',
 'projectrightinc': 'individual & family services',
 'bostonseniorhomecare': 'hospital & health care',
 'publicconsultinggroup': 

In [121]:
companies_list_clean.shape

(67306,)

In [124]:
# Map each matched employer name to its best-scoring cleaned company name.
# Previously every company scoring >= 85 overwrote the one before it, so the
# employer ended up paired with the *last* qualifying company rather than the
# closest one.
employer_dict = {}
for employer in match_set:
    best_score = 0
    best_company = None
    for company in companies_list_clean:
        score = fuzz.ratio(employer, company)
        # Keep only candidates at or above the 85 threshold; on equal scores
        # the first company encountered wins.
        if score >= 85 and score > best_score:
            best_score = score
            best_company = company
            if score == 100:
                # 100 is the maximum ratio; nothing later can beat it.
                break
    if best_company is not None:
        employer_dict[employer] = best_company

In [131]:
# Keys and values of employer_dict as two lists in matching order.
employer_dict_list = [employer for employer in employer_dict]
company_dict_list = [employer_dict[employer] for employer in employer_dict_list]

In [133]:
len(employer_dict_list)

5337

In [134]:
len(company_dict_list)

5337

In [145]:
def com_search(name):
    '''
    Return the industry of the company matched to a cleaned employer name.

    `name` is looked up in the module-level employer_dict to get the matched
    cleaned company name; the Industry value of the first df_companies row
    whose 'Company_clean' equals that company is returned.

    Returns None when `name` has no entry in employer_dict, or when no
    df_companies row carries the mapped company name.
    '''
    company = employer_dict.get(name)
    if company is None:
        return None

    industries = df_companies.loc[
        df_companies['Company_clean'] == company, 'Industry'
    ].values
    # Guard the empty case so a stale mapping yields None instead of an
    # IndexError from indexing position 0.
    if len(industries) == 0:
        return None
    return industries[0]

In [146]:
# Replace Industry_new with the matched company's industry (None where the
# cleaned employer name has no entry in employer_dict).
df_contribuion['Industry_new'] = df_contribuion['Employer_clean'].map(com_search)

In [149]:
df_contribuion['Industry_new'].value_counts().sum()

22383

In [158]:
# Remove the helper key column before saving. errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError once the column
# is already gone.
df_contribuion.drop(columns=['Employer_clean'], inplace=True, errors='ignore')

In [160]:
df_contribuion.shape

(118461, 24)

In [161]:
df_contribuion.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Address,Amount,City,Contributor_ID,Contributor_Type,Date,Employer,First_Name,...,Occupation,Principal_Officer,Report_ID,State,Zip,Donor_Name,Industry,PAC,Lobbyist,Industry_new
0,0,7373,270 S. Common Street,500.0,Lynn,,Individual,2015-01-01,McGrath Enterprises,Patrick,...,Developer,,217037,MA,1905,PATRICK MCGRATH,,,False,
1,1,7374,186 Perkins Row,250.0,Topsfield,,Individual,2015-01-15,Self,Thomas,...,Attorney,,217037,MA,1983,THOMAS DEMAKIS,Healthcare,,False,professional training & coaching
2,2,7634,11 Lake Ave.,25.0,Lynn,,Individual,2015-03-02,,Loretta,...,,,217124,MA,10904,LORETTA O'DONNELL,,,False,
3,3,7635,7 William St.,50.0,Lynn,,Individual,2015-03-02,,Stephen,...,,,217124,MA,1904,STEPHEN SPENCER,,,False,
4,4,7636,20 Bulfinch Ter.,50.0,Lynn,,Individual,2015-03-02,,Ann Marie,...,,,217124,MA,1902,ANN MARIE LEONARD,,,False,


In [162]:
# Write without the row index. The frame already carries 'Unnamed: 0',
# 'Unnamed: 0.1' and 'Unnamed: 0.2' columns, which is what saving the index
# and reading the file back produces; index=False stops adding another one.
df_contribuion.to_csv('data/contribution_experiment.csv', index=False)