# Dropcontact adapter
Adapts a given CSV file by cleaning its values and correcting its column names so that it can be used with "Dropcontact". Any default field missing from the file is added to the dataframe as an empty column.

In [121]:
import pandas
from marketing_data_cleaning import BASE_COLUMNS, DATA_FOLDER_PATH
from urllib.parse import urlparse, urlunparse



In [122]:
# Columns to retain from the input file; leave empty to keep every column.
COLUMNS_TO_KEEP = []

# Base name (without extension) of the CSV read from the `inputs` folder.
FILENAME = 'adapt_to_dropcontact'

# The input file is read as semicolon-separated, UTF-8 encoded text.
df = pandas.read_csv(
    DATA_FOLDER_PATH / f'inputs/{FILENAME}.csv',
    sep=';',
    encoding='utf-8',
)

# An empty selection means "no filtering".
df = df[COLUMNS_TO_KEEP] if COLUMNS_TO_KEEP else df

In [123]:
def clean_urls(value):
    """Reduce a URL to its scheme, host and path.

    The URL parameters, query string and fragment are dropped.

    Args:
        value: Raw cell value. It is converted with ``str`` before parsing,
            so missing values (``None`` / ``NaN``) are accepted.

    Returns:
        The rebuilt URL as a string, or ``None`` when the value has no
        scheme (e.g. ``www.example.com/page`` or a missing value) or
        cannot be parsed at all.
    """
    try:
        # Surrounding whitespace is never part of a URL; strip it so that it
        # neither hides the scheme nor ends up in the rebuilt path.
        url = urlparse(str(value).strip())
    except ValueError:
        # urlparse rejects malformed bracketed hosts (e.g. "http://[bad").
        # Treat them like any other unusable value instead of aborting the
        # clean-up of the whole column.
        return None
    if not url.scheme:
        return None
    return urlunparse((url.scheme, url.netloc, url.path, '', '', ''))

# Normalize every LinkedIn URL; values for which clean_urls returns None are blanked out.
df['linkedin'] = df['linkedin'].apply(clean_urls)

In [124]:
# Rename the 'site' and 'entreprise' columns to 'website' and 'company'.
# With its default settings, rename() silently skips keys that are not present.
df = df.rename(columns={'site': 'website', 'entreprise': 'company'})

In [125]:
# Sanity check: summary statistics of the LinkedIn column after cleaning.
df['linkedin'].describe()

count       0
unique      0
top       NaN
freq      NaN
Name: linkedin, dtype: object

## Add missing columns
Add the missing columns to the dataframe.

In [126]:
# Work out which of the columns listed in BASE_COLUMNS the dataframe lacks.
current_columns = set(df.columns)
missing_columns = BASE_COLUMNS.difference(current_columns)

# Iterate in sorted order: the iteration order of a set of strings can change
# between interpreter runs, which would make the column order of the exported
# file non-reproducible.
for column in sorted(missing_columns):
    df[column] = None

# Display the columns that were added.
missing_columns

{'company_linkedin', 'enriched', 'position'}

## Clean values

__firstname__ and __lastname__ should be title cased in the same way as __position__.

In [127]:
def set_firstname(value):
    """Extract the title-cased first word of a full name.

    Args:
        value: Raw full-name cell value.

    Returns:
        The first whitespace-separated word, title cased, or ``None`` when
        the value is missing (``None`` / ``NaN``) or contains no word.
    """
    # Empty CSV cells are loaded as NaN, not None; without this check they
    # would be stringified and come out as the literal name 'Nan'.
    if pandas.isna(value):
        return None
    # split() without an argument ignores leading/trailing/repeated whitespace,
    # so a leading space no longer produces an empty first name.
    parts = str(value).split()
    if not parts:
        return None
    return parts[0].lower().title()


def set_lastname(value):
    """Extract the title-cased last word of a full name.

    Args:
        value: Raw full-name cell value.

    Returns:
        The last whitespace-separated word, title cased, or ``None`` when
        the value is missing (``None`` / ``NaN``) or contains no word.
    """
    # Empty CSV cells are loaded as NaN, not None; without this check they
    # would be stringified and come out as the literal name 'Nan'.
    if pandas.isna(value):
        return None
    # split() without an argument ignores leading/trailing/repeated whitespace,
    # so a trailing space no longer produces an empty last name.
    parts = str(value).split()
    if not parts:
        return None
    return parts[-1].lower().title()


# Derive the first-name and last-name columns from the full-name column.
df['first_name'] = df['full_name'].apply(set_firstname)
df['last_name'] = df['full_name'].apply(set_lastname)

Make sure that the __fullname__ is also title cased and stripped

In [128]:
def clean_position(value):
    """Title-case a position (job title) value.

    Args:
        value: Raw position cell value.

    Returns:
        The value as a title-cased string, or ``None`` when the value is
        missing (``None`` / ``NaN``).
    """
    # Empty CSV cells are loaded as NaN, not None; without this check they
    # would be stringified and exported as the literal text 'Nan'.
    if pandas.isna(value):
        return None
    return str(value).lower().title()

def clean_fullname(value):
    """Strip and title-case a full name.

    Args:
        value: Raw full-name cell value.

    Returns:
        The value with surrounding whitespace removed and title cased, or
        ``None`` when the value is missing (``None`` / ``NaN``).
    """
    # Empty CSV cells are loaded as NaN, not None; without this check they
    # would be stringified and exported as the literal name 'Nan'.
    if pandas.isna(value):
        return None
    return str(value).strip().lower().title()

# Normalize the casing of the position and full-name columns.
df['position'] = df['position'].apply(clean_position)
df['full_name'] = df['full_name'].apply(clean_fullname)

In [129]:
# Preview the first rows of the cleaned dataframe.
# NOTE(review): the rendered preview shows contact names and e-mail addresses —
# consider clearing this output before sharing the notebook.
df.head()

Unnamed: 0,first_name,last_name,full_name,email,company,website,linkedin,enriched,company_linkedin,position
0,,,,4.bi@orange.fr,,,,,,
1,Aude,Guilbon,Aude Guilbon,a-guilbon@neoform.fr,,,,,,
2,Aline,Aledo,Aline Aledo,a.aledo@domespharma.com,,,,,,
3,Anne,Badea,Anne Badea,a.badea@dargaud.fr,,,,,,
4,Arnaud,Ballet,Arnaud Ballet,a.ballet@cr-auvergne.fr,,,,,,


In [130]:
# Export the cleaned dataframe as a UTF-8 CSV without the index column.
# NOTE(review): the output file name is hard-coded and is not derived from
# FILENAME — confirm this is the intended target.
# NOTE(review): the input was read with sep=';' whereas this export uses the
# default ',' separator — confirm this is what the consumer expects.
df.to_csv(
    DATA_FOLDER_PATH / 'clean_linkedin_preview_profiles.csv',
    encoding='utf-8', 
    index=False
)
