# Import Libraries

In [831]:
import pandas as pd
pd.set_option('display.max_columns', None)

# Importind DataFrames

In [832]:
%store -r all_verticals_deals_df
%store -r contacts_df
%store -r events_df

# Mapping to 'Contact' Target Data Model

In [833]:
# Define the Contacts columns
columns = [
    "Id",
    "ContactName",
    "CompanyId",
    "JobTitle",
    "Email",
    "PrimaryPhoneNumber",
    "SecondaryPhoneNumber",
    "Region",
    "VerticalId",
    "SubVerticalId",
    "Birthday",
    "CoveragePersonId",
    "PreferredContactMethodId",
    "AdditionalDescription",
    "CreatedAt",
    "CreatedBy",
    "UpdatedAt",
    "UpdatedBy",
    "File"
]

# Create an empty DataFrame with the columns
contact_df = pd.DataFrame(columns=columns)


In [834]:
# Populate the Contacts DataFrame with Contact Information
transform_contacts_df = pd.DataFrame()

transform_contacts_df['ContactName'] = contacts_df['Name']
transform_contacts_df['CompanyId'] = contacts_df['Firm']
transform_contacts_df['JobTitle'] = contacts_df['Title']
transform_contacts_df['Email'] = contacts_df['E-mail']
transform_contacts_df['PrimaryPhoneNumber'] = contacts_df['Phone']
transform_contacts_df['SecondaryPhoneNumber'] = contacts_df['Secondary Phone']
transform_contacts_df['Region'] = contacts_df['City']
transform_contacts_df['VerticalId'] = contacts_df['Group']
transform_contacts_df['SubVerticalId'] = contacts_df['Sub-Vertical']
transform_contacts_df['Birthday'] = contacts_df['Birthday']
transform_contacts_df['CoveragePersonId'] = contacts_df['Coverage Person']
transform_contacts_df['PreferredContactMethodId'] = contacts_df['Preferred Contact Method']
transform_contacts_df['File'] = "contacts"

In [835]:
# Populate the Contacts DataFrame with Contact Information
transform_events_df = pd.DataFrame()

transform_events_df['ContactName'] = events_df['Name']
transform_events_df['Email'] = events_df['E-mail']
transform_events_df['File'] = "events"

In [836]:
# Populate the Contacts DataFrame with Deal Vertical Information
transform_dealContact_df = pd.DataFrame()

transform_dealContact_df['ContactName'] = all_verticals_deals_df['Banker']
transform_contacts_df['CompanyId'] = all_verticals_deals_df['Invest. Bank']
transform_dealContact_df['Email'] = all_verticals_deals_df['Banker Email']
transform_dealContact_df['PrimaryPhoneNumber'] = all_verticals_deals_df['Banker Phone Number']
transform_dealContact_df['File'] = "all_deal_verticals"

In [837]:
# Concatenate all transformed DataFrames into a single DataFrame
all_contact_transformed_df = pd.concat([
    transform_contacts_df,
    transform_events_df,
    transform_dealContact_df
], ignore_index=True)

In [838]:
all_contact_transformed_df = all_contact_transformed_df[~(all_contact_transformed_df['ContactName'].isna() | (all_contact_transformed_df['ContactName'] == ''))]
all_contact_transformed_df = all_contact_transformed_df.reset_index(drop=True)

In [839]:
all_contact_transformed_df = all_contact_transformed_df.fillna('')

In [840]:
#Display the first few rows of the transformed DataFrame
all_contact_transformed_df

Unnamed: 0,ContactName,CompanyId,JobTitle,Email,PrimaryPhoneNumber,SecondaryPhoneNumber,Region,VerticalId,SubVerticalId,Birthday,CoveragePersonId,PreferredContactMethodId,File
0,Robert Baltimore,Harris Williams,Managing Director,BBaltimore@harriswilliams.com,(804) 648-0072,,"Richmond, VA",Business Services,Business Services,2/25/1966,Hannah Jumper,Email,contacts
1,Brian Lucas,,Managing Director,blucas@harriswilliams.com,(804) 648-0072,,"Richmond, VA",Business Services,Business Services,9/3/1953,Kripa Shah,Business Phone,contacts
2,Luke Semple,,Managing Director,lsemple@harriswilliams.com,(804) 648-0072,,"Richmond, VA",Business Services,Business Services,3/27/1962,Emily Royal,Cell Phone,contacts
3,Drew Spitzer,Barclays,Managing Director,aspitzer@harriswilliams.com,(804) 648-0072,,"Richmond, VA",Business Services,Business Services,4/28/1964,Russ Barner,Business Phone,contacts
4,Derek Lewis,"Baird, Jefferies",Managing Director,dlewis@harriswilliams.com,(804) 648-0072,,"Richmond, VA",Business Services,Business Services,4/24/1971,Daniel Ding,Cell Phone,contacts
...,...,...,...,...,...,...,...,...,...,...,...,...,...
482,Paul Jevnick,,,Paul Jevnick@BMO .com,793-260-4783,,,,,,,,all_deal_verticals
483,Dan Grabos,,,Dan Grabos@JP Morgan .com,309-141-5134,,,,,,,,all_deal_verticals
484,Karen Martin,,,Karen Martin@BMO .com,861-530-1192,,,,,,,,all_deal_verticals
485,Richard Agabs,,,Richard Agabs@Jefferies .com,434-955-4824,,,,,,,,all_deal_verticals


In [841]:
#Check for each column in the transformed DataFrame is part of the original Company DataFrame
if not set(all_contact_transformed_df.columns).issubset(contact_df.columns):
    raise ValueError(f"all_contact_transformed_df has columns not in contact_df: {set(all_company_transformed_df.columns) - set(contact_df.columns)}")
else:
    print("All columns matched successfully.")

All columns matched successfully.


# Data Cleansing

In [842]:
all_contact_transformed_df['ContactName'].value_counts()

Andrew Verdasca     3
Mark Pinsky         3
Cristiano Lima      3
Rex Green           3
Mauro Souza         3
                   ..
Thomas Crowley      1
Jack Rabun          1
Rahul Singla        1
Tommaso Zanobini    1
Geoff Smith         1
Name: ContactName, Length: 389, dtype: int64

In [843]:
# Group by 'all_company_transformed_df' and concatenate the results
all_contact_df = all_contact_transformed_df.groupby(['ContactName','Email']).agg(lambda x: ' '.join(x.astype(str))).reset_index()
all_contact_df

Unnamed: 0,ContactName,Email,CompanyId,JobTitle,PrimaryPhoneNumber,SecondaryPhoneNumber,Region,VerticalId,SubVerticalId,Birthday,CoveragePersonId,PreferredContactMethodId,File
0,Aaron Engen,Aaron.Engen@BMO.com,William Blair,"Managing Director, Global Head",403-515-1560,,"Toronto, CA",Business Services,Business Services,5/15/1957,Russ Barner,Cell Phone,contacts
1,Aaron Kaplan,aaron.kaplan@credit-suisse.com,William Blair,Director,312 750-2955,,"Chicago, IL",Business Services,Industrial & Environmental Services,7/3/1970,Daniel Ding,Business Phone,contacts events
2,Adam Dzaia,aczaia@rwbaird.com,JP Morgan,Managing Director,414-298-7358,,"Milwaukee, WI",Business Services,Business Services,5/18/1969,Jeannie Blackwood,Email,contacts events
3,Adam Filkin,Adam Filkin@William Blair.com,,,808-881-5600,,,,,,,,all_deal_verticals
4,Adam Nordin,adam.nordin@barclays.com,Morgan Stanley,"Managing Director, Head of Business Services",312-609-7223,,"Chicago, IL",Technology,Data & Information Services,10/28/1957,Kripa Shah,Email,contacts
...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,Wes Harrington,Wes.Harrington@BMO.com,Direct to FIS,Director,312-461-6531,,"Chicago, IL",Business Services,Industrial & Environmental Services,6/3/1966,Emily Royal,Business Phone,contacts
392,Whitney Horne,WHorne@jefferies.com,Goldman Sachs,Managing Director,,,"New York, NY",Business Services,Business Services,7/22/1972,Daniel Ding,Cell Phone,contacts events events
393,Will Koo,Will.Koo@BMO.com,Piper Jaffray,Director,212-702-1198,,"New York, NY",Financial Services,Insurance,1/11/1951,Hannah Jumper,Cell Phone,contacts
394,Xavier Loriferne,xavier.c.loriferne@jpmorgan.com,Houlihan Lokey,Managing Director,,,"New York, NY",Financial Services,Product Group,4/9/1958,Daniel Ding,Email,contacts


In [844]:
all_contact_df[['ContactName','Email']].value_counts()

ContactName       Email                        
Aaron Engen       Aaron.Engen@BMO.com              1
Melissa Wasser    Melissa.Wasser@ftpartners.com    1
Mauro Souza       MSouza@jefferies.com             1
Matthias Kirstol  Matthias.Kirstol@jpmorgan.com    1
Matthew Stopnik   Matthew.Stopnik@rbbcm.com        1
                                                  ..
Dick Burke        rburke@jefferies.com             1
Dhiren Shah       dhiren.shah@credit-suisse.com    1
Devanshu Dhyani   devanshu.dhyani@gs.com           1
Derek Lewis       dlewis@harriswilliams.com        1
Zach Pfanstiel    zach.pfanstiel@jpmorgan.com      1
Length: 396, dtype: int64

In [845]:
# Fill NaN values with empty strings
all_contact_df = all_contact_df.fillna('')

In [846]:
all_contact_df['Id'] = ['CT_{:03d}'.format(i) for i in range(1, len(all_contact_df) + 1)]

In [847]:
all_contact_df

Unnamed: 0,ContactName,Email,CompanyId,JobTitle,PrimaryPhoneNumber,SecondaryPhoneNumber,Region,VerticalId,SubVerticalId,Birthday,CoveragePersonId,PreferredContactMethodId,File,Id
0,Aaron Engen,Aaron.Engen@BMO.com,William Blair,"Managing Director, Global Head",403-515-1560,,"Toronto, CA",Business Services,Business Services,5/15/1957,Russ Barner,Cell Phone,contacts,CT_001
1,Aaron Kaplan,aaron.kaplan@credit-suisse.com,William Blair,Director,312 750-2955,,"Chicago, IL",Business Services,Industrial & Environmental Services,7/3/1970,Daniel Ding,Business Phone,contacts events,CT_002
2,Adam Dzaia,aczaia@rwbaird.com,JP Morgan,Managing Director,414-298-7358,,"Milwaukee, WI",Business Services,Business Services,5/18/1969,Jeannie Blackwood,Email,contacts events,CT_003
3,Adam Filkin,Adam Filkin@William Blair.com,,,808-881-5600,,,,,,,,all_deal_verticals,CT_004
4,Adam Nordin,adam.nordin@barclays.com,Morgan Stanley,"Managing Director, Head of Business Services",312-609-7223,,"Chicago, IL",Technology,Data & Information Services,10/28/1957,Kripa Shah,Email,contacts,CT_005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,Wes Harrington,Wes.Harrington@BMO.com,Direct to FIS,Director,312-461-6531,,"Chicago, IL",Business Services,Industrial & Environmental Services,6/3/1966,Emily Royal,Business Phone,contacts,CT_392
392,Whitney Horne,WHorne@jefferies.com,Goldman Sachs,Managing Director,,,"New York, NY",Business Services,Business Services,7/22/1972,Daniel Ding,Cell Phone,contacts events events,CT_393
393,Will Koo,Will.Koo@BMO.com,Piper Jaffray,Director,212-702-1198,,"New York, NY",Financial Services,Insurance,1/11/1951,Hannah Jumper,Cell Phone,contacts,CT_394
394,Xavier Loriferne,xavier.c.loriferne@jpmorgan.com,Houlihan Lokey,Managing Director,,,"New York, NY",Financial Services,Product Group,4/9/1958,Daniel Ding,Email,contacts,CT_395


In [848]:
# Concatenate result DataFrames with company_df to bring all columns together
contact_transformed= pd.concat([
    contact_df,
    all_contact_df
], ignore_index=True)

In [849]:
# Fill NaN values with empty strings
contact_transformed = contact_transformed.fillna('')

In [850]:
# Trim all string values
contact_transformed = contact_transformed.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Validate Contact Data Model

In [851]:
#Check for each column in the transformed DataFrame is part of the original Company DataFrame
if not set(contact_df.columns).issubset(contact_transformed.columns):
    raise ValueError(f"contact_df has columns not in contact_transformed: {set(contact_df.columns) - set(contact_transformed.columns)}")
else:
    print("All columns matched successfully.")

All columns matched successfully.


In [852]:
contact_transformed = contact_transformed.drop('File', axis=1)

In [853]:
contact_transformed

Unnamed: 0,Id,ContactName,CompanyId,JobTitle,Email,PrimaryPhoneNumber,SecondaryPhoneNumber,Region,VerticalId,SubVerticalId,Birthday,CoveragePersonId,PreferredContactMethodId,AdditionalDescription,CreatedAt,CreatedBy,UpdatedAt,UpdatedBy
0,CT_001,Aaron Engen,William Blair,"Managing Director, Global Head",Aaron.Engen@BMO.com,403-515-1560,,"Toronto, CA",Business Services,Business Services,5/15/1957,Russ Barner,Cell Phone,,,,,
1,CT_002,Aaron Kaplan,William Blair,Director,aaron.kaplan@credit-suisse.com,312 750-2955,,"Chicago, IL",Business Services,Industrial & Environmental Services,7/3/1970,Daniel Ding,Business Phone,,,,,
2,CT_003,Adam Dzaia,JP Morgan,Managing Director,aczaia@rwbaird.com,414-298-7358,,"Milwaukee, WI",Business Services,Business Services,5/18/1969,Jeannie Blackwood,Email,,,,,
3,CT_004,Adam Filkin,,,Adam Filkin@William Blair.com,808-881-5600,,,,,,,,,,,,
4,CT_005,Adam Nordin,Morgan Stanley,"Managing Director, Head of Business Services",adam.nordin@barclays.com,312-609-7223,,"Chicago, IL",Technology,Data & Information Services,10/28/1957,Kripa Shah,Email,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,CT_392,Wes Harrington,Direct to FIS,Director,Wes.Harrington@BMO.com,312-461-6531,,"Chicago, IL",Business Services,Industrial & Environmental Services,6/3/1966,Emily Royal,Business Phone,,,,,
392,CT_393,Whitney Horne,Goldman Sachs,Managing Director,WHorne@jefferies.com,,,"New York, NY",Business Services,Business Services,7/22/1972,Daniel Ding,Cell Phone,,,,,
393,CT_394,Will Koo,Piper Jaffray,Director,Will.Koo@BMO.com,212-702-1198,,"New York, NY",Financial Services,Insurance,1/11/1951,Hannah Jumper,Cell Phone,,,,,
394,CT_395,Xavier Loriferne,Houlihan Lokey,Managing Director,xavier.c.loriferne@jpmorgan.com,,,"New York, NY",Financial Services,Product Group,4/9/1958,Daniel Ding,Email,,,,,


# Export Companies Data

In [854]:
%store contact_transformed

Stored 'contact_transformed' (DataFrame)
