### Data Cleaning

In [8]:
#Libraries
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [9]:
#Read initial file
df = pd.read_csv("../data/data_tot.csv", encoding='latin1')

In [10]:
def proper_datetime():
    '''Get a functional datetime field, the one the DF comes with isn't ideal'''
    df['DATETIME'] = pd.to_datetime(df.DATUM+df.TIJD, format='%Y-%m-%d\'%H%M')
def fix_order(df):
    '''We want the datetime field up front'''
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df = df[cols]
    return df

In [11]:
def drop_bad_columns(df):
    '''Get rid of columns that contain bad or irrelevant data'''
    df.drop(columns=['Unnamed: 0',
                     'knmi_STN',
                    'DATUM', 
                    'TIJD', 
                    'DOM', 
                    'BEW', 
                    'SGK', 
                    'ORG', 
                    'IVS', 
                    'BTNOMS',
                    'BTXCOD',
                     'BTXOMS',
                     'GBDOMS',
                     'OGIOMS',
                     'ANIOMS',
                     'BHIOMS',
                     'BMIOMS',
                     'VATOMS',
                    'LOC:TYPE', 
                    'SYS', 
                    'SYSOMS', 
                    'TYP', 
                    'TYPOMS', 
                    'TYD:BEGINDAT',
                    'TYD:BEGINTYD',
                    'TYD:EINDDAT',
                    'TYD:EINDTYD',
                    'STA:BEGINDAT',
                    'STA:BEGINTYD',
                    'STA:EINDDAT',
                    'STA:EINDTYD',
                    'STA:RKSSTATUS', 
                    'EXTCODE', 
                    'BRON', 
                    'ORGOMS',
                    'IVSOMS',
                    'is_PAK', 'ID'], inplace=True)
    return df

In [12]:
def filter_messy_observations(df):
    '''Filter out rows with bad observations, and rewrite tsome of the columns'''
    df = df[df['KWC'].isin([0,6])]
    df.loc[:,'BTX'] = df.loc[:,'BTXOMS']
    df.loc[:,'GBD'] = df.loc[:,'GBDOMS']
    df.loc[:,'OGI'] = df.loc[:,'OGIOMS']
    df.loc[:,'ANI'] = df.loc[:,'ANIOMS']
    df.loc[:,'BHI'] = df.loc[:,'BHIOMS']
    df.loc[:,'BMI'] = df.loc[:,'BMIOMS']
    df.loc[:,'VAT'] = df.loc[:,'VATOMS']
    return df

In [13]:
def clean_my_df(df):
    """Combination of several steps to clean the DF"""
    proper_datetime()
    df = filter_messy_observations(df)
    df= drop_bad_columns(df)
    df = fix_order(df)
    df.drop(columns=df.columns[-40:], inplace=True)
    return df


In [14]:
#Perform all necessary cleaning steps. 
df = clean_my_df(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [15]:
#All done - write to file
df.to_csv("../data/data_clean.csv")