# Data Cleaning

In [3]:
import pandas as pd
import numpy as np
import os
import re

## Délinquance

In [46]:
# Load the délinquance CSV; the first column of the file is used as the row index.
df = pd.read_csv('../data/delinquance.csv', index_col=0)

In [47]:
# Preview the first rows to check the columns loaded as expected.
df.head()

Unnamed: 0,ville,lien,Violences aux personnes,Vols et dégradations,Délinquance économique et financière,Autres crimes et délits,Violences gratuites,Violences crapuleuses,Violences sexuelles,Menaces de violence,...,Différends familiaux,Proxénétisme,Ports ou détentions d'arme prohibée,Recels,Délits des courses et jeux d'argent,Délits liés aux débits de boisson et de tabac,Atteintes à l'environnement,Délits liés à la chasse et la pêche,Cruauté et délits envers les animaux,Atteintes aux intérêts fondamentaux de la Nation
0,Aast(64460),/management/ville/aast/ville-64001,343.0,1480.0,9.0,174.0,184.0,14.0,35.0,64.0,...,54.0,0.0,14.0,17.0,0.0,1.0,1.0,0.0,5.0,3.0
1,,,178.0,563.0,6.0,75.0,81.0,3.0,23.0,38.0,...,26.0,0.0,3.0,11.0,0.0,0.0,0.0,0.0,2.0,1.0
2,Arces(17120),/management/ville/arces/ville-17015,393.0,1839.0,25.0,226.0,201.0,22.0,48.0,79.0,...,88.0,0.0,14.0,25.0,0.0,1.0,4.0,1.0,9.0,0.0
3,Arue(40120),/management/ville/arue/ville-40014,220.0,1089.0,5.0,170.0,111.0,9.0,26.0,40.0,...,42.0,0.0,10.0,18.0,0.0,0.0,2.0,7.0,3.0,1.0
4,Aujols(46090),/management/ville/aujols/ville-46010,154.0,845.0,9.0,134.0,78.0,8.0,21.0,30.0,...,37.0,0.0,7.0,4.0,0.0,0.0,5.0,0.0,4.0,1.0


In [48]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

ville                                                object
lien                                                 object
Violences aux personnes                             float64
Vols et dégradations                                float64
Délinquance économique et financière                float64
Autres crimes et délits                             float64
Violences gratuites                                 float64
Violences crapuleuses                               float64
Violences sexuelles                                 float64
Menaces de violence                                 float64
Atteintes à la dignité                              float64
Cambriolages                                        float64
Vols à main armée (arme à feu)                      float64
Vols avec entrée par ruse                           float64
Vols liés à l'automobile                            float64
Vols de particuliers                                float64
Vols d'entreprises                      

In [52]:
# Blank out every cell whose text contains 'nc' (stored as NaN).
# The two identifier columns, 'ville' and 'lien', are left untouched.
for col in list(df.columns):
    if col in ('ville', 'lien'):
        continue
    df.loc[df[col].astype('str').str.contains('nc', na=False), col] = np.nan

In [None]:
# Remove whitespace inside numbers (e.g. '1 234' -> '1234') in every column
# except the identifier columns 'ville' and 'lien'.
# The pattern is a raw string and regex=True is passed explicitly: since
# pandas 2.0 Series.str.replace treats the pattern as a literal by default,
# so without it '\s' would match nothing and the spaces would remain.
for col in list(df.columns):
    if col in ('ville', 'lien'):
        continue
    df[col] = df[col].astype(str).str.replace(r'\s', '', regex=True)

In [53]:
# Cast every column to float, skipping the identifier columns
# 'ville' and 'lien'.
for col in list(df.columns):
    if col not in {'ville', 'lien'}:
        df[col] = df[col].astype('float')

In [None]:
# Write the cleaned frame back to the same path it was loaded from,
# overwriting the original file on disk.
df.to_csv('../data/delinquance.csv')

## Cleaning All

Load Data

In [2]:
# Folder that holds the CSV files, and the names of the entries found in it.
DATA_DIR = '../data/'
data_files = os.listdir(DATA_DIR)
# Separate copy of the listing, so entries can be dropped from it later
# without touching `data_files`.
dataframes = data_files.copy()

In [3]:
# Display the file names collected from the data folder.
dataframes

['csp.csv',
 '.DS_Store',
 'delinquance.csv',
 'sante_social.csv',
 'infos.csv',
 'immobilier.csv',
 'automobile.csv',
 'emploi.csv',
 'salaires.csv',
 '.ipynb_checkpoints',
 'liens_villes.csv',
 'demographie.csv',
 'entreprises.csv']

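Ignore entries that should not be cleaned: system/checkpoint entries (`.DS_Store`, `.ipynb_checkpoints`) and the `liens_villes.csv` and `infos.csv` files.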

In [78]:
# Drop the entries that must not go through the cleaning step.
# Filtering in place (instead of calling list.remove four times) keeps this
# cell re-runnable and does not raise ValueError when an entry such as
# '.DS_Store' is not present in the folder.
dataframes[:] = [
    name for name in dataframes
    if name not in ('.DS_Store', '.ipynb_checkpoints', 'liens_villes.csv', 'infos.csv')
]

In [1]:
def clean_df(file):
    """Clean one CSV located in ``../data/`` and overwrite it in place.

    Every column except ``ville`` and ``lien`` is processed as follows:
    cells whose text contains ``nc`` become NaN, all whitespace is removed
    from the values, and the column is cast to float when every value
    parses. A column that cannot be cast is kept as text with its
    whitespace removed. Columns whose name starts with ``Unnamed`` are
    dropped before the file is saved.

    Parameters
    ----------
    file : str
        Name of the CSV file inside ``../data/`` (file name, not a path).

    Returns
    -------
    None
        The cleaned frame is written back to ``../data/{file}``.
    """
    df = pd.read_csv(f'../data/{file}', index_col=0)

    for col in list(df.columns):
        # Identifier columns (and empty column names) are left untouched.
        if col in ('ville', 'lien') or not col:
            continue

        # Replace nc with NaN.
        # NOTE(review): this is a substring match, so any text value that
        # merely contains "nc" is blanked as well -- confirm this is intended.
        df.loc[df[col].astype('str').str.contains('nc', na=False), col] = np.nan

        # Remove whitespace inside numbers ('1 234' -> '1234'). The raw string
        # and explicit regex=True are required: pandas >= 2.0 treats the
        # pattern as a literal by default, which would leave spaces in place.
        df[col] = df[col].astype(str).str.replace(r'\s', '', regex=True)

        try:
            # Convert dtype to float
            df[col] = df[col].astype('float')
        except (ValueError, TypeError):
            # Best effort: a column holding non-numeric text stays as text.
            pass

    # Remove Unnamed cols
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    # Save cleaned data
    df.to_csv(f'../data/{file}')

In [6]:
def start_cleaning(files):
    """Clean every CSV named in ``files`` and overwrite each one in place.

    Entries named ``.DS_Store``, ``.ipynb_checkpoints``, ``liens_villes.csv``
    and ``infos.csv`` are skipped. For every other file, read from
    ``../data/``, the columns ``Ville``/``Lien`` are renamed to
    ``ville``/``lien``; then each remaining column has cells containing
    ``nc`` set to NaN, whitespace removed, and is cast to float when every
    value parses (otherwise it is kept as text with whitespace removed).
    Columns whose name starts with ``Unnamed`` are dropped before saving.

    Parameters
    ----------
    files : iterable of str
        File names (not paths) located in ``../data/``.

    Returns
    -------
    None
        Each cleaned frame is written back to ``../data/{file}``.
    """
    ignored = ('.DS_Store', '.ipynb_checkpoints', 'liens_villes.csv', 'infos.csv')

    for file in files:
        # Skip non-dataset entries instead of trying to parse them as CSV.
        if file in ignored:
            continue

        print(file)
        df = pd.read_csv(f'../data/{file}', index_col=0)
        df = df.rename(columns={'Ville': 'ville', 'Lien': 'lien'})

        for col in list(df.columns):
            # Identifier columns (and empty column names) are left untouched.
            if col in ('ville', 'lien') or not col:
                continue

            # Replace nc with NaN (substring match on the cell text).
            df.loc[df[col].astype('str').str.contains('nc', na=False), col] = np.nan

            # Remove whitespace inside numbers ('1 234' -> '1234'). The raw
            # string and explicit regex=True are required: pandas >= 2.0
            # treats the pattern as a literal by default.
            df[col] = df[col].astype(str).str.replace(r'\s', '', regex=True)

            try:
                # Convert dtype to float
                df[col] = df[col].astype('float')
            except (ValueError, TypeError):
                # Best effort: a column holding non-numeric text stays as text.
                pass

        # Remove Unnamed cols
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        # Save cleaned data
        df.to_csv(f'../data/{file}')

In [2]:
# Clean a single file. This needs the import cell at the top of the notebook
# to have been run in the current kernel session; otherwise the call fails
# with "NameError: name 'pd' is not defined".
clean_df('sante_social.csv')

NameError: name 'pd' is not defined