# Dropcontact adapter
Adapts a given file by cleaning its values and correcting its column names so that it can be used with "Dropcontact". All the default fields are added to the dataframe.

In [69]:
import pandas
from marketing_data_cleaning import BASE_COLUMNS, DATA_FOLDER_PATH
from urllib.parse import urlparse, urlunparse



In [70]:
# Only these columns of the source CSV are carried into the adapted file.
COLUMNS_TO_KEEP = ['linkedin', 'noms', 'poste']

source_path = DATA_FOLDER_PATH / 'cto_linkedin_profiles.csv'
df = pandas.read_csv(source_path, encoding='utf-8')[COLUMNS_TO_KEEP]


In [71]:
def clean_urls(value):
    """Reduce a URL to its scheme, host and path.

    The params, query string and fragment of the URL are discarded.

    Args:
        value: Raw cell value holding the URL. Non-string values are
            converted with ``str()`` before parsing.

    Returns:
        The cleaned URL as a string, or ``None`` when ``value`` is missing
        (``None`` / NaN).
    """
    # A missing cell would otherwise be stringified into the bogus URL
    # 'None' or 'nan' and written to the output as if it were real data.
    if pandas.isna(value):
        return None

    # Surrounding whitespace is not part of the URL; without the strip a
    # trailing blank would end up inside the path component.
    url = urlparse(str(value).strip())
    return urlunparse((url.scheme, url.netloc, url.path, '', '', ''))

# Normalise every profile URL in place, one value at a time.
df['linkedin'] = df['linkedin'].map(clean_urls)

In [72]:
# Map the source headers to the column names used by the rest of the notebook.
renamed_headers = {'noms': 'full_name', 'poste': 'position'}
df = df.rename(columns=renamed_headers)

In [73]:
# Sanity check on the cleaned URLs: `unique` lower than `count` means that
# some profile URLs appear more than once.
linkedin_urls = df['linkedin']
linkedin_urls.describe()

count                                                  2127
unique                                                 2126
top       https://www.linkedin.com/in/philippe-laurent-7...
freq                                                      2
Name: linkedin, dtype: object

## Add missing columns
Add the missing columns to the dataframe.

In [74]:
# BASE_COLUMNS is imported from marketing_data_cleaning; it is presumably a
# set of column names (it exposes `.difference`) -- TODO confirm.
current_columns = set(df.columns)
missing_columns = BASE_COLUMNS.difference(current_columns)

# Iterate in sorted order: the iteration order of a set is unspecified, so
# without sorting the position of the added columns in the dataframe (and in
# the exported CSV) could differ from one run to the next.
for column in sorted(missing_columns):
    # Placeholder value; some of these columns are filled in by later cells.
    df[column] = None

## Clean values

__firstname__ and __lastname__ should be title cased in the same way as __position__.

In [75]:
def set_firstname(value):
    """Extract the title-cased first name from a full name.

    The first whitespace-separated token of ``value`` is taken as the first
    name.

    Args:
        value: Raw full name. Non-string values are converted with ``str()``.

    Returns:
        The first token in title case, an empty string when the name is
        blank, or ``None`` when ``value`` is missing (``None`` / NaN).
    """
    # Without this guard a missing name is stringified and comes back as the
    # fake first name 'Nan' / 'None'.
    if pandas.isna(value):
        return None

    # split() with no argument ignores leading, trailing and repeated
    # whitespace; split(' ') would yield an empty first token for ' John Doe'.
    tokens = str(value).split()
    if not tokens:
        return ''
    return tokens[0].lower().title()


def set_lastname(value):
    """Extract the title-cased last name from a full name.

    The last whitespace-separated token of ``value`` is taken as the last
    name, so a single-word name is returned unchanged apart from its casing.

    Args:
        value: Raw full name. Non-string values are converted with ``str()``.

    Returns:
        The last token in title case, an empty string when the name is
        blank, or ``None`` when ``value`` is missing (``None`` / NaN).
    """
    # Without this guard a missing name is stringified and comes back as the
    # fake last name 'Nan' / 'None'.
    if pandas.isna(value):
        return None

    # split() with no argument ignores leading, trailing and repeated
    # whitespace; split(' ') would yield an empty last token for 'John Doe '.
    tokens = str(value).split()
    if not tokens:
        return ''
    return tokens[-1].lower().title()


# Both name parts are derived from the same full-name column.
full_names = df['full_name']
df['first_name'] = full_names.map(set_firstname)
df['last_name'] = full_names.map(set_lastname)

Make sure that the __fullname__ is also title cased and stripped

In [76]:
def clean_position(value):
    """Return a job title stripped of surrounding whitespace and title-cased.

    Note that title-casing also affects acronyms: 'CTO' becomes 'Cto'.

    Args:
        value: Raw position. Non-string values are converted with ``str()``.

    Returns:
        The cleaned position, or ``None`` when ``value`` is missing
        (``None`` / NaN).
    """
    # Without this guard a missing position is stringified and exported as
    # the literal text 'Nan' / 'None'.
    if pandas.isna(value):
        return None
    return str(value).strip().lower().title()

def clean_fullname(value):
    """Return a full name stripped of surrounding whitespace and title-cased.

    Args:
        value: Raw full name. Non-string values are converted with ``str()``.

    Returns:
        The cleaned full name, or ``None`` when ``value`` is missing
        (``None`` / NaN).
    """
    # Without this guard a missing name is stringified and exported as the
    # literal text 'Nan' / 'None'.
    if pandas.isna(value):
        return None
    # The strip removes leading/trailing blanks, which title() alone keeps.
    return str(value).strip().lower().title()

# Normalise the casing of the two free-text columns, value by value.
df['position'] = df['position'].map(clean_position)
df['full_name'] = df['full_name'].map(clean_fullname)

In [77]:
# Write the adapted dataframe into the data folder; the index is left out of the file.
output_path = DATA_FOLDER_PATH / 'cto.csv'
df.to_csv(output_path, index=False, encoding='utf-8')