# Dropcontact adapter
Adapts a given CSV file by cleaning and correcting the column names so that it can be used with "Dropcontact". All the default fields are added to the dataframe. Run this notebook first on files that were not originally prepared for Dropcontact, so that the data from the file can then be enriched.

In [1]:
import pandas
from marketing_data_cleaning import BASE_COLUMNS, DATA_FOLDER_PATH
from urllib.parse import urlparse, urlunparse



In [2]:
# Base name (without extension) of the CSV read from the `inputs` folder.
FILENAME = 'adapt_to_dropcontact'

# Optional subset of columns to retain; an empty list keeps every column.
COLUMNS_TO_KEEP = []

input_path = DATA_FOLDER_PATH / f'inputs/{FILENAME}.csv'
df = pandas.read_csv(input_path, encoding='utf-8')

# Only narrow the dataframe when a subset was explicitly requested.
if COLUMNS_TO_KEEP:
    df = df.loc[:, COLUMNS_TO_KEEP]

In [3]:
def clean_urls(value):
    """Return *value* reduced to ``scheme://netloc/path``, or ``None``.

    The params, query string and fragment of the URL are dropped.

    Args:
        value: Any cell value; it is converted with ``str`` before parsing.

    Returns:
        The cleaned URL as a string, or ``None`` when the value has no URL
        scheme (which includes missing values, whose string form has none).
    """
    # Strip first: trailing whitespace would otherwise end up in the path.
    url = urlparse(str(value).strip())
    if url.scheme == '':
        return None
    # Use empty strings for the dropped parts so every component is a str.
    return urlunparse((url.scheme, url.netloc, url.path, '', '', ''))

# Run every value of the `linkedin` column through clean_urls and store the
# result back in the same column.
linkedin_urls = df['linkedin']
df['linkedin'] = linkedin_urls.apply(clean_urls)

KeyError: 'linkedin'

In [37]:
# Source header -> header used in the rest of this notebook.
column_aliases = {'site': 'website', 'entreprise': 'company'}
df = df.rename(columns=column_aliases)

In [38]:
# Summary statistics of the `linkedin` column (shown as the cell output).
linkedin_column = df['linkedin']
linkedin_column.describe()

count                                          786
unique                                         729
top       https://fr.linkedin.com/in/clementcachot
freq                                             4
Name: linkedin, dtype: object

## Add missing columns
Add the missing columns to the dataframe.

In [39]:
# Work out which names in BASE_COLUMNS the dataframe does not have yet.
current_columns = set(df.columns)
missing_columns = BASE_COLUMNS - current_columns

# Create each absent column, filled with None.
for missing_name in missing_columns:
    df[missing_name] = None

# Left as the last expression so the cell displays what was added.
missing_columns

{'company_linkedin', 'enriched', 'first_name', 'last_name', 'website'}

## Clean values

__firstname__ and __lastname__ should be title cased in the same way as __position__.

In [40]:
def set_firstname(value):
    """Return the first word of *value*, title-cased.

    Args:
        value: A full name, or a missing value (``None`` / ``NaN``).

    Returns:
        The first whitespace-separated token in title case, ``''`` when the
        value contains no token, or ``None`` when the value is missing.
    """
    # pandas stores missing cells as NaN, which `is None` alone does not
    # catch; without this check they would be turned into the text 'Nan'.
    if value is None or pandas.isna(value):
        return None
    # split() without argument handles tabs, non-breaking and repeated spaces.
    tokens = str(value).split()
    if not tokens:
        return ''
    return tokens[0].lower().title()


def set_lastname(value):
    """Return the last word of *value*, title-cased.

    A single-word value is returned as that word, so it equals the result of
    taking the first word of the same value.

    Args:
        value: A full name, or a missing value (``None`` / ``NaN``).

    Returns:
        The last whitespace-separated token in title case, ``''`` when the
        value contains no token, or ``None`` when the value is missing.
    """
    # pandas stores missing cells as NaN, which `is None` alone does not
    # catch; without this check they would be turned into the text 'Nan'.
    if value is None or pandas.isna(value):
        return None
    # split() without argument handles tabs, non-breaking and repeated spaces.
    tokens = str(value).split()
    if not tokens:
        return ''
    return tokens[-1].lower().title()


# Both name parts are derived from the same `full_name` column.
full_names = df['full_name']
df['first_name'] = full_names.apply(set_firstname)
df['last_name'] = full_names.apply(set_lastname)

Make sure that the __fullname__ is also title cased and stripped

In [41]:
def clean_position(value):
    """Return *value* stripped of surrounding whitespace and title-cased.

    Args:
        value: A position / job title, or a missing value (``None`` / ``NaN``).

    Returns:
        The cleaned string, or ``None`` when the value is missing.
    """
    # pandas stores missing cells as NaN, which `is None` alone does not
    # catch; without this check they would be turned into the text 'Nan'.
    if value is None or pandas.isna(value):
        return None
    return str(value).strip().lower().title()

def clean_fullname(value):
    """Return *value* stripped of surrounding whitespace and title-cased.

    Args:
        value: A full name, or a missing value (``None`` / ``NaN``).

    Returns:
        The cleaned string, or ``None`` when the value is missing.
    """
    # pandas stores missing cells as NaN, which `is None` alone does not
    # catch; without this check they would be turned into the text 'Nan'.
    if value is None or pandas.isna(value):
        return None
    return str(value).strip().lower().title()

# Each column is rewritten in place with its own cleaning function.
for column, cleaner in (
    ('position', clean_position),
    ('full_name', clean_fullname),
):
    df[column] = df[column].apply(cleaner)

In [42]:
# Keep only the columns named in BASE_COLUMNS. BASE_COLUMNS is used as a set
# in this notebook, and iterating a set of strings gives no guaranteed order,
# so sort the names to keep the exported column order stable across runs.
df = df[sorted(BASE_COLUMNS)]


In [43]:
# Write the dataframe to the data folder as UTF-8, without the index column.
output_path = DATA_FOLDER_PATH / 'clean_linkedin_preview_profiles.csv'
df.to_csv(output_path, index=False, encoding='utf-8')
