In [4]:
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
%matplotlib inline

# Let pandas render up to 500 columns so wide frames are not truncated
# when displayed in the notebook.
pd.set_option('display.max_columns', 500)



from utils import normalize


In [5]:
# Read the raw CSV into a DataFrame.
path = 'base_etablissement_par_tranche_effectif.csv'
df = pd.read_csv(path)

# Pass every column label through the project's `normalize` helper
# (imported from `utils`; its exact transformation is not visible here).
df.columns = list(map(normalize, df.columns))

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,codgeo,libgeo,reg,dep,e14tst,e14ts0nd,e14ts1,e14ts6,e14ts10,e14ts20,e14ts50,e14ts100,e14ts200,e14ts500
0,1001,L'Abergement-Clémenciat,82,1,25,22,1,2,0,0,0,0,0,0
1,1002,L'Abergement-de-Varey,82,1,10,9,1,0,0,0,0,0,0,0
2,1004,Ambérieu-en-Bugey,82,1,996,577,272,63,46,24,9,3,2,0
3,1005,Ambérieux-en-Dombes,82,1,99,73,20,3,1,2,0,0,0,0
4,1006,Ambléon,82,1,4,4,0,0,0,0,0,0,0,0


## Exclusion de la Corse

In [6]:
# Keep only the rows whose commune code, read as text, contains neither
# 'A' nor 'B' (per the section heading, this is how Corsica is excluded).
df = df[~df['codgeo'].astype(str).str.contains('[AB]', regex=True)]

In [7]:
# Preview the first rows after the Corsica filter.
df.head()

Unnamed: 0,codgeo,libgeo,reg,dep,e14tst,e14ts0nd,e14ts1,e14ts6,e14ts10,e14ts20,e14ts50,e14ts100,e14ts200,e14ts500
0,1001,L'Abergement-Clémenciat,82,1,25,22,1,2,0,0,0,0,0,0
1,1002,L'Abergement-de-Varey,82,1,10,9,1,0,0,0,0,0,0,0
2,1004,Ambérieu-en-Bugey,82,1,996,577,272,63,46,24,9,3,2,0
3,1005,Ambérieux-en-Dombes,82,1,99,73,20,3,1,2,0,0,0,0
4,1006,Ambléon,82,1,4,4,0,0,0,0,0,0,0,0


In [8]:
# Cast the commune code to int so it can be compared numerically in the
# next step. `assign` returns a new frame (same columns, same order) instead
# of writing a column into the result of the earlier boolean-mask filter,
# which pandas can flag with a SettingWithCopyWarning.
df = df.assign(codgeo=df['codgeo'].astype(int))

## Exclusion des DOM-TOM

In [9]:
# Keep only commune codes strictly below 96000; per the section heading,
# this is meant to remove the overseas territories (DOM-TOM).
df = df.loc[df['codgeo'].lt(96000)]

In [10]:
# Preview the first rows after the overseas-territories filter.
df.head()

Unnamed: 0,codgeo,libgeo,reg,dep,e14tst,e14ts0nd,e14ts1,e14ts6,e14ts10,e14ts20,e14ts50,e14ts100,e14ts200,e14ts500
0,1001,L'Abergement-Clémenciat,82,1,25,22,1,2,0,0,0,0,0,0
1,1002,L'Abergement-de-Varey,82,1,10,9,1,0,0,0,0,0,0,0
2,1004,Ambérieu-en-Bugey,82,1,996,577,272,63,46,24,9,3,2,0
3,1005,Ambérieux-en-Dombes,82,1,99,73,20,3,1,2,0,0,0,0
4,1006,Ambléon,82,1,4,4,0,0,0,0,0,0,0,0


## Suppression des colonnes inutiles

In [11]:
# Remove the `reg` and `dep` columns, which the section heading marks as
# unnecessary. `errors='ignore'` keeps the cell tolerant of either column
# being absent already, as the original list-based filter was.
df = df.drop(columns=['reg', 'dep'], errors='ignore')
df.head()

Unnamed: 0,codgeo,libgeo,e14tst,e14ts0nd,e14ts1,e14ts6,e14ts10,e14ts20,e14ts50,e14ts100,e14ts200,e14ts500
0,1001,L'Abergement-Clémenciat,25,22,1,2,0,0,0,0,0,0
1,1002,L'Abergement-de-Varey,10,9,1,0,0,0,0,0,0,0
2,1004,Ambérieu-en-Bugey,996,577,272,63,46,24,9,3,2,0
3,1005,Ambérieux-en-Dombes,99,73,20,3,1,2,0,0,0,0
4,1006,Ambléon,4,4,0,0,0,0,0,0,0,0


## Renommage des colonnes

In [13]:
# Replace the raw `e14ts*` column codes with readable `nb_ent*` labels.
# NOTE(review): the numeric suffixes of the source codes (6, 10, 20, 50) do
# not line up with the lower bounds used in the labels (6-10, 11-20, 21-50,
# 51-99) -- confirm the bracket boundaries against the data dictionary.
df = df.rename(
    {
        'e14tst': 'nb_ent',
        'e14ts0nd': 'nb_ent_0',
        'e14ts1': 'nb_ent_1-5',
        'e14ts6': 'nb_ent_6-10',
        'e14ts10': 'nb_ent_11-20',
        'e14ts20': 'nb_ent_21-50',
        'e14ts50': 'nb_ent_51-99',
        'e14ts100': 'nb_ent_100-199',
        'e14ts200': 'nb_ent_200-499',
        'e14ts500': 'nb_ent_500+',
    },
    axis='columns',
)
df.head()

Unnamed: 0,codgeo,libgeo,nb_ent,nb_ent_0,nb_ent_1-5,nb_ent_6-10,nb_ent_11-20,nb_ent_21-50,nb_ent_51-99,nb_ent_100-199,nb_ent_200-499,nb_ent_500+
0,1001,L'Abergement-Clémenciat,25,22,1,2,0,0,0,0,0,0
1,1002,L'Abergement-de-Varey,10,9,1,0,0,0,0,0,0,0
2,1004,Ambérieu-en-Bugey,996,577,272,63,46,24,9,3,2,0
3,1005,Ambérieux-en-Dombes,99,73,20,3,1,2,0,0,0,0
4,1006,Ambléon,4,4,0,0,0,0,0,0,0,0


## Gestion des villes avec arrondissements

In [14]:
# Overwrite the commune code of the three cities named in this section
# (rows whose `libgeo` matches exactly) with a single fixed code per city.
for city_name, city_code in (('Paris', 75000), ('Marseille', 13000), ('Lyon', 69000)):
    df.loc[df['libgeo'] == city_name, 'codgeo'] = city_code


In [15]:
# Write the cleaned frame to disk without the index column.
# The output directory is created first: `to_csv` does not create missing
# parent directories, so a fresh checkout without `clean_data/` would fail.
output_path = Path('clean_data/entreprises_clean.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)