# Clean a JSON LinkedIn profile file
Takes a JSON file of data scraped from LinkedIn and makes it ready to be used. Converts the _position_ titles and normalizes the letter case of all text. This should be run on a raw scraped JSON file as the first step, before adapting it to the Dropcontact format.

In [1]:
import os
import re
import pandas
import airtable
from marketing_data_cleaning import DATA_FOLDER_PATH

In [2]:
# Load the raw scraped file (read as a JSON list of records) into a DataFrame.
raw_profiles_path = DATA_FOLDER_PATH / 'chained_output.json'
df = pandas.read_json(raw_profiles_path, orient='records', encoding='utf-8')


In [3]:
# Mark every row as not yet enriched ('Non enrichi' is French for "not enriched").
df['enriched'] = 'Non enrichi'

In [4]:
# Summarise the raw `company` column (count, distinct values, most frequent
# value and its frequency) to see what the scraped titles look like.
df['company'].describe()

count                                                 3854
unique                                                  48
top       (24) Worldpanel by Kantar : personnes | LinkedIn
freq                                                   824
Name: company, dtype: object

## Clean main data
Convert the name fields to title case (especially first_name and last_name).

In [29]:
def normalize_names(value):
    """Return *value* as a title-cased string, leaving missing values alone.

    Args:
        value: A cell value from a name column. Non-string values are
            converted with ``str()`` before being title-cased.

    Returns:
        The title-cased text (e.g. ``"JEAN DUPONT"`` -> ``"Jean Dupont"``),
        or *value* itself when it is ``None`` or a float NaN.
    """
    # Without this guard, str() would turn a missing value into the literal
    # name "None" or "Nan". NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return value
    return str(value).lower().title()


# Name columns whose values are rewritten with normalize_names.
columns_to_normalize = ['last_name', 'first_name', 'full_name']

# Replace each listed column with its normalized values.
for name_column in columns_to_normalize:
    normalized_values = df[name_column].apply(normalize_names)
    df[name_column] = normalized_values


In [30]:
def get_company_title(value):
    """Extract the company name from a scraped LinkedIn page title.

    A title shaped like ``(24) Company name : rest`` (a number in
    parentheses, one whitespace character, then text containing a colon) is
    reduced to the text before the last colon on that line, with surrounding
    whitespace stripped.

    Args:
        value: The raw cell value; it is matched against its ``str()`` form.

    Returns:
        The extracted company name, or *value* unchanged (not converted to
        a string) when it does not have the expected shape.
    """
    title_match = re.match(r'^\(\d+\)\s(.*)\s?\:', str(value))
    if title_match is None:
        return value
    company_name = title_match.group(1)
    return company_name.strip()

# Replace each scraped page title in `company` with the extracted company name.
scraped_titles = df['company']
df['company'] = scraped_titles.apply(get_company_title)

In [31]:
# Remove rows that are identical across every column, keeping the first
# occurrence (pandas defaults).
df = df.drop_duplicates()

In [32]:
# Write the cleaned rows to CSV, leaving out the DataFrame index.
df.to_csv(DATA_FOLDER_PATH / 'clean_linkedin_preview_profiles.csv', index=False)
