# Clean a JSON LinkedIn profile file
Takes a JSON file that was scraped from LinkedIn and makes it ready to be used. Converts the _position_ titles and normalizes the case of all the text. This should be run on a raw scraped JSON file as the first step, before adapting it to the Dropcontact format.

In [95]:
import re
import json
import pandas
from marketing_data_cleaning import DATA_FOLDER_PATH
from itertools import chain
import pathlib
import secrets
import datetime


In [96]:

def chain_for_dataframe(filename, write_file=True):
    """
    Flatten a JSON list of lists into a single dataframe.

    The file at ``DATA_FOLDER_PATH / filename`` is expected to hold
    groups of dictionaries, e.g. ``[[{a}], [{b}, {c}]]``; the groups are
    merged one level deep into ``[{a}, {b}, {c}]`` and loaded into a
    ``pandas.DataFrame``, which is returned.

    When ``write_file`` is true, the flattened records are also written
    to ``chained_output.json`` inside ``DATA_FOLDER_PATH``.
    """
    source_path = DATA_FOLDER_PATH / filename
    with open(source_path, mode='r', encoding='utf-8') as handle:
        nested_records = json.load(handle)

    # Merge the sub-lists one level deep, keeping their order
    flat_records = list(chain.from_iterable(nested_records))
    frame = pandas.DataFrame(flat_records)

    if not write_file:
        return frame

    output_path = DATA_FOLDER_PATH / 'chained_output.json'
    frame.to_json(output_path, force_ascii=False, orient='records')
    return frame


# chain_for_dataframe() already joins its argument onto DATA_FOLDER_PATH, so
# the path is given relative to the data folder. Prefixing it here as well
# only worked while DATA_FOLDER_PATH was absolute (pathlib then discards the
# left-hand side of the join); with a relative data folder it was doubled.
df = chain_for_dataframe('inputs/v8.json')


In [97]:
# Summary of the raw 'company' column (count, number of unique values,
# most frequent value and its frequency) before any cleaning is applied.
df['company'].describe()


count                                      152
unique                                       5
top       (13) Glastint : personnes | LinkedIn
freq                                        56
Name: company, dtype: object

## Clean main data
Convert the name fields to title case (especially first_name and last_name).

In [98]:
def normalize_names(value):
    """
    Return ``value`` as a title-cased string.

    Missing values (``None`` or a float NaN) are returned unchanged so
    that they stay missing instead of becoming the literal names
    ``'None'`` / ``'Nan'``. Any other value is converted with ``str()``
    before being title-cased.

    >>> normalize_names('jOHN')
    'John'
    """
    # NaN is the only float that is not equal to itself
    if value is None or (isinstance(value, float) and value != value):
        return value
    return str(value).lower().title()


columns_to_normalize = ['last_name', 'first_name', 'full_name']

# Replace every name column with its normalized version, one column at a time
for name_column in columns_to_normalize:
    df[name_column] = df[name_column].map(normalize_names)


In [99]:
def extract_company(value):
    """
    Extract the company name from a scraped LinkedIn page title.

    >>> extract_company('(13) Glastint : personnes | LinkedIn')
    'Glastint'
    >>> extract_company('Glastint : personnes | LinkedIn')
    'Glastint'

    The leading ``(13) `` counter is optional, so titles scraped without
    it are cleaned the same way. A value with no colon is not treated as
    a page title and is returned unchanged, with its original type.
    """
    # The greedy (.*) runs up to the LAST colon of the title, so a company
    # name that itself contains a colon is kept whole.
    result = re.match(r'^(?:\(\d+\)\s*)?(.*)\s?\:', str(value))
    if result:
        return result.group(1).strip()
    return value


# Reduce each scraped page title in 'company' to the bare company name
df['company'] = df['company'].map(extract_company)


In [100]:
# for item in df.itertuples():
#     regexes = [
#         r'chez\s?\w+',
#         rf'{item.company}'
#     ]

#     new_value = None
#     for regex in regexes:
#         result = re.search(regex, item.position)
#         if result is None:
#             continue
#         new_value = re.sub(regex, '', item.position)
#         df.loc[item.Index, 'position'] = new_value or item.position


In [101]:
# Keep only the first row seen for each (first_name, last_name) pair
df = df[~df.duplicated(subset=['first_name', 'last_name'], keep='first')]


In [102]:
# Order the profiles alphabetically by last name
df = df.sort_values(by='last_name', ascending=True)


# Airtable adapter
Adapts a Dropcontact-adapted CSV file by cleaning and correcting the column names so that it can eventually be uploaded to an Airtable base.

In [103]:
# Maps the dataframe's internal column names (keys) to the column headers
# used in the exported file (values, mostly French labels).
COLUMNS_TO_ADAPT = {
    'linkedin': 'LinkedIn',
    'courtesy_title': 'Civilité',
    'first_name': 'Prénom',
    'last_name': 'Nom',
    'full_name': 'Nom complet',
    'position': 'Poste',
    'company': 'Entreprise',
    'company_linkedin': 'Company LinkedIn',
    'enriched': 'Statut enrichissement',
    'email': 'Email',
    'website': 'Site entreprise',
    'company_metadata': 'Company metadata',
    'company_members': 'Company members',
    'company_description': 'Description'
}
# rename() returns a new dataframe; `df` keeps its original column names.
# NOTE(review): with pandas' default errors='ignore', keys that are not
# columns of `df` are skipped silently — confirm that is the intent.
airtable_df = df.rename(columns=COLUMNS_TO_ADAPT)


In [104]:
# Timestamp of this run with minute precision, e.g. "2024-01-31 09:05"
d = f'{datetime.datetime.now():%Y-%m-%d %H:%M}'
# Same timestamp made file-name friendly, e.g. "v_2024-01-31_09-05"
filename = 'v_' + d.replace(' ', '_').replace(':', '-')


In [105]:
# Write the renamed dataframe to <DATA_FOLDER_PATH>/db/<filename>.csv;
# index=False keeps the dataframe index out of the exported file.
airtable_df.to_csv(DATA_FOLDER_PATH / f'db/{filename}.csv', index=False)
