# NAF / ROME Link Factory
* Utiliser les diverses données pour calculer une table de correspondance rome / naf

## Lancement
Les cellules peuvent être lancées automatiquement ( _run all cells_ ), l'exécution s'arrêtera automatiquement après avoir crée le fichier output: ../outputs/andi_rome2naf_|date|.csv


## Autres idées non exploitées:
- pousser certains domaines naf manuellement reliés aux romes
- exploiter les codes OGR (données insuffisantes)
- obtenir des données plus larges, sur une pluse grande durée (cf DPAE)
- intégrer nombre d'entreprises et leur taille moyenne (sur tout le territoire) (cf. sirene/insee)
- plus dynamique: intégrer variance, écart type, ...
- ...

## A vérifier:
- Anomalie des fromagers: NAF-1051C et ROME-A1412, peu de recrutement mais ils y vont en masse

### Obtention des labels et autres données

In [30]:
## Chargement des labels naf, rome, ogr
from IPython.display import display, HTML
import pandas as pd
import numpy as np

# NAF
naf_labels = pd.read_csv('/ressources/list_NAF_LBB.csv', sep='|', encoding="utf-8")
naf_labels.columns = ['nafdot', 'naf', 'label']
print(f"Obtained {len(naf_labels)} NAF labels")
# display(HTML(naf_labels.head(5).to_html()))

# ROME
rome_labels = pd.read_csv('/ressources/liste_rome_LBB.csv', sep=',', encoding="utf-8")
rome_labels.columns = ['rome', 'rome_1', 'rome_2', 'rome_3', 'label', 'slug']
print(f"Obtained {len(rome_labels)} ROME labels")
# display(HTML(rome_labels.head(5).to_html()))

# Chargement des statistiques d'emploi
emploi_rome_naf = pd.read_csv('/ressources/contrats_30j.csv', sep=',', encoding="utf-8")[['ROME', 'APE700', 'nb_embauches']]
emploi_rome_naf.columns = ['rome', 'naf', 'embauches']
# display(HTML(emploi_rome_naf.head(5).to_html()))
        
    
# Calcul des ratios
naf_embauches = emploi_rome_naf[['naf', 'embauches']].groupby('naf').agg(
    embauches_total_n=pd.NamedAgg(column='embauches', aggfunc=sum)
)
rome_embauches = emploi_rome_naf[['rome', 'embauches']].groupby('rome').agg(
    embauches_total_r=pd.NamedAgg(column='embauches', aggfunc=sum)
)


naf_i = (
emploi_rome_naf
    .merge(naf_embauches, on='naf')
    .merge(rome_embauches, on='rome')
)
naf_i['ratio_naf'] = ( naf_i.embauches / naf_i.embauches_total_n ) * 100
naf_i['ratio_rome'] = ( naf_i.embauches / naf_i.embauches_total_r ) * 100

naf_i = naf_i.sort_values(by=['ratio_naf'], ascending=False)
naf_i.columns


Obtained 732 NAF labels
Obtained 531 ROME labels


Index(['rome', 'naf', 'embauches', 'embauches_total_n', 'embauches_total_r',
       'ratio_naf', 'ratio_rome'],
      dtype='object')

### Fonction de calcul du tableau de correspondance ROME vers NAF

In [106]:
# Rome=>Naf in function form
# Deprecated
def rome2nafz_v1(rome, df_in, output_size=15):
    """
    Generate naf list using input rome code and specific dataframe:
    ['rome', 'naf', 'embauches', 'embauches_total_n', 'embauches_total_r', 'ratio_naf', 'ratio_rome']
    
    - First a table is generated listing the naf codes who employ the highest percentage
    (from their total recruitments) of the specified rome code, a score is specified with
    a binning technique
    - Then the NAF codes who also appear in the list of where the specified rome code tends
    to work at are boosted
    - Finaly the result is filtered keeping only those with a score of 1 or higher, and from those
    only the first 15 are kept (can be changed by parameter)
    """
    df_nf = (
    df_in.loc[df_in.rome == rome]
         .sort_values(by=['ratio_naf'], ascending=False)
    )
    df_nf['cumsum'] =  100 - df_nf['ratio_rome'].cumsum()
    df_nf = df_nf.loc[df_nf['cumsum'] > 25]
    df_nf['score'] = pd.to_numeric(pd.cut(
        df_nf['ratio_naf'],
        bins=[1, 2, 5, 10, 50, 100],
        labels=[0, 1, 2, 3, 4]
    ))
    df_nf['score_rome'] = pd.to_numeric(pd.cut(
        df_nf['ratio_rome'],
        bins=[1, 2, 5, 10, 50, 100],
        labels=[0, 1, 2, 3, 4]
    ))
    
    df_rs = (
    df_in.loc[naf_i.rome == rome ]
         .sort_values(by=['ratio_rome'], ascending=False)
    )
    df_rs['cumsum'] =  100 - df_rs['ratio_rome'].cumsum()
    df_rs = df_rs.head(10)
     
    df_nf.loc[df_nf['naf'].isin(df_rs['naf']), 'score'] += 1
    df_nf = df_nf.loc[df_nf['score'].ge(1)]

    
    return (
        df_nf.sort_values(by=['score'], ascending=False)
             .head(output_size)
    )[['rome', 'naf', 'score']]

In [134]:
# Rome=>Naf in function form
# Latest
def rome2nafz_v2(rome, df_in, output_size=15):
    """
    Generate naf list using input rome code and specific dataframe:
    ['rome', 'naf', 'embauches', 'embauches_total_n', 'embauches_total_r', 'ratio_naf', 'ratio_rome']
    
    - First a table is generated listing the naf codes who employ the highest percentage
    (from their total recruitments) of the specified rome code, a score is specified with
    a binning technique
    - Then the NAF codes who also appear in the list of where the specified rome code tends
    to work at are boosted
    - Finaly the result is filtered keeping only those with a score of 1 or higher, and from those
    only the first 15 are kept (can be changed by parameter)
    """
    df_nf = (
    df_in.loc[df_in.rome == rome]
         .sort_values(by=['ratio_naf'], ascending=False)
    )
    df_nf['cumsum'] =  100 - df_nf['ratio_rome'].cumsum()
    #df_nf = df_nf.loc[df_nf['cumsum'] > 25]
    df_nf['score_naf'] = pd.to_numeric(pd.cut(
        df_nf['ratio_naf'],
        bins=[0, 2, 5, 10, 50, 100],
        labels=[0, 1, 2, 3, 4]
    ))
    df_nf['score_rome'] = pd.to_numeric(pd.cut(
        df_nf['ratio_rome'],
        bins=[0, 2, 5, 10, 50, 100],
        labels=[0, 1, 2, 3, 4]
    ))
    
    df_nf = df_nf.loc[df_nf['score_naf'].ge(1) | df_nf['score_rome'].ge(1)]
    df_nf['score'] = df_nf['score_naf'] * 4 + df_nf['score_rome']
    df_nf['score'] = pd.cut(
        df_nf['score_naf'] * 3 + df_nf['score_rome'],
        bins=5,
        labels=[1,2,3,4,5]
    )
    
    return (
        df_nf.sort_values(by=['score'], ascending=False)
             .head(output_size)
    )[['rome', 'naf', 'score']]

### Vérification de la fonction

In [130]:
# Function V1
ROME = 'D1101'
print(f"Table ROME=>NAF pour \"{rome_labels.loc[(rome_labels['rome'] == ROME, 'label')].iloc[0]}\"")
df  = rome2nafz_v1(ROME, naf_i)
df.merge(naf_labels, on='naf')[['naf', 'label','score']]
assert(len(df) == 15)
df.head(5)

Table ROME=>NAF pour "Boucherie"


Unnamed: 0,rome,naf,score
12720,D1101,4722Z,4.0
12883,D1101,1013B,3.0
12562,D1101,1011Z,3.0
12696,D1101,4632A,3.0
12716,D1101,4711D,2.0


In [136]:
# Function Test
df  = rome2nafz_v2(ROME, naf_i)
df.merge(naf_labels, on='naf')[['naf', 'label','score']]
assert(len(df) == 15)
df.head(5)

Unnamed: 0,rome,naf,score
12720,D1101,4722Z,5
12562,D1101,1011Z,4
12696,D1101,4632A,4
12883,D1101,1013B,3
12716,D1101,4711D,3


### Calcul de pour l'ensemble des codes ROME

In [137]:
result_table = {}
for index, row in rome_labels.iterrows():
    result_table[row.rome] = rome2nafz_v2(str(row.rome), naf_i).to_dict('r')
    print(f"row {index} : {len(result_table[row.rome])} Naf codes for Rome {row.rome} \"{rome_labels.loc[(rome_labels['rome'] == row.rome, 'label')].iloc[0]}\" ")

row 0 : 15 Naf codes for Rome A1101 "Conduite d'engins agricoles et forestiers" 
row 1 : 11 Naf codes for Rome A1201 "Bûcheronnage et élagage" 
row 2 : 7 Naf codes for Rome A1202 "Entretien des espaces naturels" 
row 3 : 15 Naf codes for Rome A1203 "Entretien des espaces verts" 
row 4 : 8 Naf codes for Rome A1204 "Protection du patrimoine naturel" 
row 5 : 8 Naf codes for Rome A1205 "Sylviculture" 
row 6 : 13 Naf codes for Rome A1301 "Conseil et assistance technique en agriculture" 
row 7 : 9 Naf codes for Rome A1302 "Contrôle et diagnostic technique en agriculture" 
row 8 : 15 Naf codes for Rome A1303 "Ingénierie en agriculture et environnement naturel" 
row 9 : 15 Naf codes for Rome A1401 "Aide agricole de production fruitière ou viticole" 
row 10 : 15 Naf codes for Rome A1402 "Aide agricole de production légumière ou végétale" 
row 11 : 15 Naf codes for Rome A1403 "Aide d'élevage agricole et aquacole" 
row 12 : 11 Naf codes for Rome A1404 "Aquaculture" 
row 13 : 13 Naf codes for Rom

row 111 : 9 Naf codes for Rome D1508 "Encadrement du personnel de caisses" 
row 112 : 6 Naf codes for Rome D1509 "Management de département en grande distribution" 
row 113 : 9 Naf codes for Rome E1101 "Animation de site multimédia" 
row 114 : 10 Naf codes for Rome E1102 "Ecriture d'ouvrages, de livres" 
row 115 : 15 Naf codes for Rome E1103 "Communication" 
row 116 : 13 Naf codes for Rome E1104 "Conception de contenus multimédias" 
row 117 : 9 Naf codes for Rome E1105 "Coordination d'édition" 
row 118 : 15 Naf codes for Rome E1106 "Journalisme et information média" 
row 119 : 14 Naf codes for Rome E1107 "Organisation d'évènementiel" 
row 120 : 11 Naf codes for Rome E1108 "Traduction, interprétariat" 
row 121 : 9 Naf codes for Rome E1201 "Photographie" 
row 122 : 8 Naf codes for Rome E1202 "Production en laboratoire cinématographique" 
row 123 : 8 Naf codes for Rome E1203 "Production en laboratoire photographique" 
row 124 : 6 Naf codes for Rome E1204 "Projection cinéma" 
row 125 : 15 

row 224 : 15 Naf codes for Rome H1502 "Management et ingénierie qualité industrielle" 
row 225 : 15 Naf codes for Rome H1503 "Intervention technique en laboratoire d'analyse industrielle" 
row 226 : 8 Naf codes for Rome H1504 "Intervention technique en contrôle essai qualité en électricité et électronique" 
row 227 : 9 Naf codes for Rome H1505 "Intervention technique en formulation et analyse sensorielle" 
row 228 : 14 Naf codes for Rome H1506 "Intervention technique qualité en mécanique et travail des métaux" 
row 229 : 4 Naf codes for Rome H2101 "Abattage et découpe des viandes" 
row 230 : 15 Naf codes for Rome H2102 "Conduite d'équipement de production alimentaire" 
row 231 : 7 Naf codes for Rome H2201 "Assemblage d'ouvrages en bois" 
row 232 : 10 Naf codes for Rome H2202 "Conduite d'équipement de fabrication de l'ameublement et du bois" 
row 233 : 11 Naf codes for Rome H2203 "Conduite d'installation de production de panneaux bois" 
row 234 : 7 Naf codes for Rome H2204 "Encadrement 

row 317 : 8 Naf codes for Rome I1502 "Intervention en milieu subaquatique" 
row 318 : 10 Naf codes for Rome I1503 "Intervention en milieux et produits nocifs" 
row 319 : 9 Naf codes for Rome I1601 "Installation et maintenance en nautisme" 
row 320 : 9 Naf codes for Rome I1602 "Maintenance d'aéronefs" 
row 321 : 15 Naf codes for Rome I1603 "Maintenance d'engins de chantier, levage, manutention et de machines agricoles" 
row 322 : 15 Naf codes for Rome I1604 "Mécanique automobile et entretien de véhicules" 
row 323 : 10 Naf codes for Rome I1605 "Mécanique de marine" 
row 324 : 14 Naf codes for Rome I1606 "Réparation de carrosserie" 
row 325 : 12 Naf codes for Rome I1607 "Réparation de cycles, motocycles et motoculteurs de loisirs" 
row 326 : 11 Naf codes for Rome J1101 "Médecine de prévention" 
row 327 : 5 Naf codes for Rome J1102 "Médecine généraliste et spécialisée" 
row 328 : 7 Naf codes for Rome J1103 "Médecine dentaire" 
row 329 : 2 Naf codes for Rome J1104 "Suivi de la grossesse et

row 433 : 10 Naf codes for Rome L1202 "Musique et chant" 
row 434 : 12 Naf codes for Rome L1203 "Art dramatique" 
row 435 : 9 Naf codes for Rome L1204 "Arts du cirque et arts visuels" 
row 436 : 8 Naf codes for Rome L1301 "Mise en scène de spectacles vivants" 
row 437 : 15 Naf codes for Rome L1302 "Production et administration spectacle, cinéma et audiovisuel" 
row 438 : 8 Naf codes for Rome L1303 "Promotion d'artistes et de spectacles" 
row 439 : 10 Naf codes for Rome L1304 "Réalisation cinématographique et audiovisuelle" 
row 440 : 9 Naf codes for Rome L1401 "Sportif professionnel" 
row 441 : 9 Naf codes for Rome L1501 "Coiffure et maquillage spectacle" 
row 442 : 9 Naf codes for Rome L1502 "Costume et habillage spectacle" 
row 443 : 6 Naf codes for Rome L1503 "Décor et accessoires spectacle" 
row 444 : 8 Naf codes for Rome L1504 "Éclairage spectacle" 
row 445 : 11 Naf codes for Rome L1505 "Image cinématographique et télévisuelle" 
row 446 : 6 Naf codes for Rome L1506 "Machinerie spe

### Écriture du fichier CSV et fin du traitement

In [138]:
# Write CSV File
import csv
import time
output = f"/outputs/andi_rome2naf_{time.strftime('%Y%m%d')}.csv"
i = 0
with open(output, 'w') as file:
    writer = csv.DictWriter(
        file,
        delimiter=',',
        quotechar='"',
        quoting=csv.QUOTE_MINIMAL,
        fieldnames=['rome', 'rome_label', 'naf', 'naf_label', 'score']
    )
    writer.writeheader()
    for rome, naflist in result_table.items():
        for data in naflist:
            data['rome_label'] = rome_labels.loc[(rome_labels['rome'] == rome, 'label')].iloc[0]
            data['naf_label'] = naf_labels.loc[(naf_labels['naf'] == data['naf'], 'label')].iloc[0]
            writer.writerow(data)
            i += 1
print(f'Wrote {i} rows to {output}')

# Crash to end automatic cell execution
raise SystemExit("Fin traitement automatique")  
    

Wrote 5342 rows to /outputs/andi_rome2naf_20200130.csv


SystemExit: Fin traitement automatique

### Code historique - documentation

In [None]:
# Legacy Code

# df_ok['score'] = 0
# df_ok.loc[df_cumsum < 15, 'score'] = 5
# df_ok['score'] = pd.cut(
 #   df_ok['cumsum'],
 #   bins=[0, 15, 30, 45, 60, 100],
 #   labels=[5, 4, 3, 2, 1]
#)
#df_ok

#df_base['score'] = 0
#df_base.loc[df_base['cumsum'].ge(0), 'score'] = 0
#df_base.loc[df_base['cumsum'].ge(15).lt(30), 'score'] = 1
#df_base.loc[df_base['cumsum'].ge(30).lt(45), 'score'] = 2
#df_base.loc[df_base['cumsum'].ge(45).lt(60), 'score'] = 3
#df_base.loc[df_base['cumsum'].ge(60), 'score'] = 4
#df_base

In [56]:
def display_result(rome, table, naf_labels):
    print(f"Résultat(s) pour {rome} - \"{rome_labels.loc[(rome_labels['rome'] == rome, 'label')].iloc[0]}\"")
    df = pd.DataFrame(table[rome])
    df = df.merge(naf_labels, on='naf')[['naf', 'label','score']]
    display(HTML(df.head(15).to_html()))


# display_result('A1303', result_table, naf_labels)
display_result('N4104', result_table, naf_labels)
# rome2nafz(t1, naf_i)

Résultat(s) pour N4104 - "Courses et livraisons express"


KeyError: 'naf'

In [139]:
## Vérification de la couverture des rome: génère une liste de rome sans corresponces naf,
## avec un résumé des données brutes
romes = []
labels = []
embauches = []
naf_total = []
naf_liste = []

data = []

emploi_rome_naf['naf2'] = emploi_rome_naf['naf']
ren = emploi_rome_naf.groupby('rome').agg(
    embauches_total_r=pd.NamedAgg(column='embauches', aggfunc=sum),
    naf_list=pd.NamedAgg(column='naf', aggfunc=lambda x: ' '.join(x)),
    naf_count = pd.NamedAgg(column='naf2', aggfunc=lambda x: len(x))
)

for index, row in rome_labels.iterrows():
    test_rome = row['rome']
    df = pd.DataFrame(result_table[test_rome])
    if len(df) == 0:
        data.append([
            test_rome,
            row['label'],
            ren.loc[(ren.index == test_rome, 'embauches_total_r')].iloc[0],
            ren.loc[(ren.index == test_rome, 'naf_count')].iloc[0],
            ren.loc[(ren.index == test_rome, 'naf_list')].iloc[0],
        ])

bilan = pd.DataFrame(
    data,
    columns=['romes','labels','embauches','naf_total','naf_list']
)
pd.set_option('display.max_rows', None)
bilan.head(100)
    

Unnamed: 0,romes,labels,embauches,naf_total,naf_list


In [51]:
emploi_rome_naf['naf2'] = emploi_rome_naf['naf']
rome_embauches_naf = emploi_rome_naf.groupby('rome').agg(
    embauches_total_r=pd.NamedAgg(column='embauches', aggfunc=sum),
    naf=pd.NamedAgg(column='naf', aggfunc=lambda x: ' '.join(x)),
    naf_count = pd.NamedAgg(column='naf2', aggfunc=lambda x: len(x))
)

rome_embauches_naf.loc[rome_embauches_naf.index == 'A1101']

Unnamed: 0_level_0,embauches_total_r,naf,naf_count
rome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A1101,4653,0111Z 0113Z 0114Z 0119Z 0121Z 0122Z 0123Z 0124...,365


In [22]:
## Andi Matching API Naf criteria build testing
import os

def get_andidata_naflist(romes, include=None, exclude=None):
    # current_dir = os.path.dirname(os.path.abspath(__file__))
    # filename = f"{current_dir}/data_files/{ANDIDATA_FILE}.csv"
    filename="/outputs/andi_rome2naf_20200130.csv"
    
    include_list = []
    exclude_list = []
    with open(filename) as csvfile:
        rdr = csv.DictReader(
            csvfile,
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL,
        )
        for row in rdr:
            if row['rome'] in romes:
                include_list.append(row)
            if include and row['rome'] in include:
                include_list.append(row)
            if exclude and row['rome'] in exclude:
                exclude_list.append(row['naf'])

    include_list[:] = [el for el in include_list if el['naf'] not in exclude_list]
    out_list = {x['naf']: x['score'] for x in include_list}

    return [out_list, {}]

def get_naf_sql(rules):
    '''
    Get sql rules to select naf code
    '''
    codes, domains = rules
    sql = []
    for naf, value in codes.items():
        sql.append(f'WHEN \'{naf}\' THEN {value}')

    if domains:
        sql.append('ELSE CASE substring(e.naf, 0, 3)')
        for naf, value in domains.items():
            value -= 1
            sql.append(f'WHEN \'{naf}\' THEN {value}')

    sql.append('ELSE 1')

    if domains:
        sql.append('END')

    return "\n".join(sql)

rules = get_andidata_naflist('K1707')
print(rules)
print(get_naf_sql(rules))


[{}, {}]
ELSE 1


In [128]:
# Compute best nafs for rome
pd.set_option('display.max_rows', 20)
def display_df(df):
    display(HTML(df.to_html()))
    
ROME = 'K1707' # Policier
ROME = 'M1809' # Information météorologique
ROME = 'D1101' # Boucher
ROME = 'J1202'

print(f"Table ROME=>NAF pour \"{rome_labels.loc[(rome_labels['rome'] == ROME, 'label')].iloc[0]}\"")

# Basic order based on employment percentage
# Use rome recruitment % by naf ratio, stop @ 75% of total rome pool
df_base = (
naf_i.loc[naf_i.rome == ROME]
    .sort_values(by=['ratio_naf'], ascending=False)
)
print('Step 1')
display_df(df_base[df_base.rome == ROME].head(20))

df_base['cumsum'] =  100 - df_base['ratio_rome'].cumsum()

print('Step 1bis: cumsum')
display_df(df_base[df_base.rome == ROME].head(20))

#df_base = df_base.loc[df_base['cumsum'] > 15]
df_base['score_naf'] = pd.to_numeric(pd.cut(
    df_base['ratio_naf'],
    bins=[0, 2, 5, 10, 50, 100],
    labels=[1, 2, 3, 4, 5]
))
df_base['score_rome'] = pd.to_numeric(pd.cut(
    df_base['ratio_rome'],
    bins=[0, 2, 5, 10, 50, 100],
    labels=[1, 2, 3, 4, 5]
))


print('Step 2: prior to boost')
display_df(df_base[df_base.rome == ROME].head(20))

# Boosting scores with rome orientations
# If naf is in top 10 of rome destinations, increment score
#df_rs = (
#naf_i.loc[naf_i.rome == ROME ]
#    .sort_values(by=['ratio_rome'], ascending=False)
#)
#df_rs['cumsum'] =  100 - df_rs['ratio_rome'].cumsum()
#df_rs = df_rs.head(10)
#df_base.loc[df_base['naf'].isin(df_rs['naf']), 'score'] += 1

print('Step 3')
display_df(df_base[df_base.rome == ROME].head(20))

df_base = df_base.loc[df_base['score_naf'].ge(1) | df_base['score_rome'].ge(1)]
df_base['score'] = pd.cut(
    df_base['score_naf'] * 3 + df_base['score_rome'],
    bins=5,
    labels=[1,2,3,4,5]
)
# Display
(
df_base
    .merge(naf_labels, on='naf')
    .sort_values(by=['score'], ascending=False)
    [['rome', 'naf', 'label','score', 'score_naf', 'score_rome']]
).head(15)

Table ROME=>NAF pour "Pharmacie"
Step 1


Unnamed: 0,rome,naf,embauches,embauches_total_n,embauches_total_r,ratio_naf,ratio_rome
166157,J1202,4773Z,6066,18976,7410,31.966695,81.862348
166174,J1202,9492Z,3,155,7410,1.935484,0.040486
166132,J1202,4774Z,34,3894,7410,0.873138,0.458839
166043,J1202,4646Z,65,8524,7410,0.762553,0.877193
166080,J1202,7731Z,1,177,7410,0.564972,0.013495
166079,J1202,7729Z,53,10541,7410,0.502799,0.71525
166170,J1202,8720B,3,709,7410,0.423131,0.040486
166101,J1202,8610Z,453,131011,7410,0.345772,6.11336
166129,J1202,2120Z,42,13627,7410,0.308212,0.566802
166172,J1202,3523Z,1,373,7410,0.268097,0.013495


Step 1bis: cumsum


Unnamed: 0,rome,naf,embauches,embauches_total_n,embauches_total_r,ratio_naf,ratio_rome,cumsum
166157,J1202,4773Z,6066,18976,7410,31.966695,81.862348,18.137652
166174,J1202,9492Z,3,155,7410,1.935484,0.040486,18.097166
166132,J1202,4774Z,34,3894,7410,0.873138,0.458839,17.638327
166043,J1202,4646Z,65,8524,7410,0.762553,0.877193,16.761134
166080,J1202,7731Z,1,177,7410,0.564972,0.013495,16.747638
166079,J1202,7729Z,53,10541,7410,0.502799,0.71525,16.032389
166170,J1202,8720B,3,709,7410,0.423131,0.040486,15.991903
166101,J1202,8610Z,453,131011,7410,0.345772,6.11336,9.878543
166129,J1202,2120Z,42,13627,7410,0.308212,0.566802,9.311741
166172,J1202,3523Z,1,373,7410,0.268097,0.013495,9.298246


Step 2: prior to boost


Unnamed: 0,rome,naf,embauches,embauches_total_n,embauches_total_r,ratio_naf,ratio_rome,cumsum,score_naf,score_rome
166157,J1202,4773Z,6066,18976,7410,31.966695,81.862348,18.137652,4,5
166174,J1202,9492Z,3,155,7410,1.935484,0.040486,18.097166,1,1
166132,J1202,4774Z,34,3894,7410,0.873138,0.458839,17.638327,1,1
166043,J1202,4646Z,65,8524,7410,0.762553,0.877193,16.761134,1,1
166080,J1202,7731Z,1,177,7410,0.564972,0.013495,16.747638,1,1
166079,J1202,7729Z,53,10541,7410,0.502799,0.71525,16.032389,1,1
166170,J1202,8720B,3,709,7410,0.423131,0.040486,15.991903,1,1
166101,J1202,8610Z,453,131011,7410,0.345772,6.11336,9.878543,1,3
166129,J1202,2120Z,42,13627,7410,0.308212,0.566802,9.311741,1,1
166172,J1202,3523Z,1,373,7410,0.268097,0.013495,9.298246,1,1


Step 3


Unnamed: 0,rome,naf,embauches,embauches_total_n,embauches_total_r,ratio_naf,ratio_rome,cumsum,score_naf,score_rome
166157,J1202,4773Z,6066,18976,7410,31.966695,81.862348,18.137652,4,5
166174,J1202,9492Z,3,155,7410,1.935484,0.040486,18.097166,1,1
166132,J1202,4774Z,34,3894,7410,0.873138,0.458839,17.638327,1,1
166043,J1202,4646Z,65,8524,7410,0.762553,0.877193,16.761134,1,1
166080,J1202,7731Z,1,177,7410,0.564972,0.013495,16.747638,1,1
166079,J1202,7729Z,53,10541,7410,0.502799,0.71525,16.032389,1,1
166170,J1202,8720B,3,709,7410,0.423131,0.040486,15.991903,1,1
166101,J1202,8610Z,453,131011,7410,0.345772,6.11336,9.878543,1,3
166129,J1202,2120Z,42,13627,7410,0.308212,0.566802,9.311741,1,1
166172,J1202,3523Z,1,373,7410,0.268097,0.013495,9.298246,1,1


Unnamed: 0,rome,naf,label,score,score_naf,score_rome
0,J1202,4773Z,Commerce de détail de produits pharmaceutiques...,5,4,5
73,J1202,1071B,Cuisson de produits de boulangerie,1,1,1
93,J1202,8899B,Action sociale sans hébergement n.c.a.,1,1,1
94,J1202,7112B,"Ingénierie, études techniques",1,1,1
95,J1202,8531Z,Enseignement secondaire général,1,1,1
96,J1202,8412Z,"Administration publique (tutelle) de la santé,...",1,1,1
97,J1202,2222Z,Fabrication d'emballages en matières plastiques,1,1,1
98,J1202,9602A,Coiffure,1,1,1
99,J1202,7810Z,Activités des agences de placement de main-d'œ...,1,1,1
100,J1202,7111Z,Activités d'architecture,1,1,1
