In [6]:
### Library Imports
import pandas as pd
### Script Imports
import utils


In [7]:
### Clean Accounts

def clean_account():
    """Read the accounts CSV and return it after date conversion.

    The file is semicolon-separated, and the strings 'None' and '?' are
    parsed as missing values.

    Returns:
        Whatever ``utils.date_conversion`` returns for the loaded frame and
        its 'date' column (presumably the frame with parsed dates -- the
        helper lives in the project's ``utils`` module; confirm there).
    """
    ### Markers that stand for a missing value in the raw file
    missing_markers = ['None', '?']

    ### Import the accounts file
    accounts_raw = pd.read_csv(
        '../../project/banking_data/account.csv',
        sep=';',
        na_values=missing_markers,
    )

    ### Date conversion
    return utils.date_conversion(accounts_raw, 'date')

In [8]:
def clean_card(test = False):
    """Read the cards CSV (train or test split) and return it after date conversion.

    Args:
        test: When truthy, 'card_test.csv' is read; otherwise 'card_train.csv'.

    Returns:
        Whatever ``utils.date_conversion`` returns for the loaded frame and
        its 'date' column (helper defined in the project's ``utils`` module).
    """
    ### Pick the split to import
    source_csv = (
        '../../project/banking_data/card_test.csv'
        if test
        else '../../project/banking_data/card_train.csv'
    )

    ### Import the cards file; 'None' and '?' are read as missing values
    cards_raw = pd.read_csv(source_csv, sep=';', na_values=['None', '?'])

    ### Date conversion
    return utils.date_conversion(cards_raw, 'date')

In [9]:
def clean_client():
    """Read the clients CSV, convert 'birth_number', and reorder the columns.

    The raw file is semicolon-separated with 'None' and '?' treated as
    missing. The frame is passed to ``utils.date_conversion_genders`` (per
    the notebook's own note this converts the date and creates a gender
    column -- the helper is defined in the project's ``utils`` module).

    Returns:
        A frame holding, in this order, the converted frame's columns at
        positions 0, 1, 3 and 2. Columns at position 4 or later are not
        included in the result.
    """
    ### Import the clients file
    clients_raw = pd.read_csv(
        '../../project/banking_data/client.csv',
        sep=';',
        na_values=['None', '?'],
    )

    ### Date conversion with creation of gender column
    clients = utils.date_conversion_genders(clients_raw, 'birth_number')

    ### Reordering Columns: swap the third and fourth columns
    names = clients.columns.tolist()
    reordered = [*names[:2], *names[3:4], *names[2:3]]
    return clients[reordered]

In [10]:
def clean_district():
    """Read the districts CSV, back-fill the '95 figures and add growth columns.

    Steps performed on a copy of the loaded frame:
      1. Missing '95 crime counts / unemployment rates take the '96 value.
      2. 'crime_growth' ('96 minus '95), 'total_crime' ('96 plus '95) and
         'unemploymant_growth' ('96 minus '95) are appended.
      3. The last column ('unemploymant_growth') is moved to position 13;
         the remaining columns keep their relative order.

    Returns:
        The reordered districts frame.
    """
    ### Column names exactly as they appear in the file (trailing space included)
    crimes_95 = "no. of commited crimes '95 "
    crimes_96 = "no. of commited crimes '96 "
    unemp_95 = "unemploymant rate '95 "
    unemp_96 = "unemploymant rate '96 "

    ### Import the districts file; 'None' and '?' are read as missing values
    raw = pd.read_csv(
        '../../project/banking_data/district.csv',
        sep=';',
        na_values=['None', '?'],
    )

    ### Replace null values in the '95 columns with the ones from the '96 columns
    districts = raw.copy()
    for col_95, col_96 in ((crimes_95, crimes_96), (unemp_95, unemp_96)):
        districts[col_95] = districts[col_95].combine_first(districts[col_96])

    ### Added columns for crime analysis
    districts['crime_growth'] = districts[crimes_96] - districts[crimes_95]
    districts['total_crime'] = districts[crimes_96] + districts[crimes_95]

    ### Added column for unemployment growth analysis
    districts['unemploymant_growth'] = districts[unemp_96] - districts[unemp_95]

    ### Reorder columns: first 13, then the last one, then everything in between
    names = districts.columns.tolist()
    ordered = [*names[:13], names[-1], *names[13:-1]]

    return districts[ordered]
    

In [11]:
def clean_loans(test=False):
    """Read the loans CSV (train or test split) and return it after date conversion.

    Args:
        test: When truthy, 'loan_test.csv' is read; otherwise 'loan_train.csv'.

    Returns:
        Whatever ``utils.date_conversion`` returns for the loaded frame and
        its 'date' column (helper defined in the project's ``utils`` module).
    """
    ### Pick the split to import
    source_csv = (
        '../../project/banking_data/loan_test.csv'
        if test
        else '../../project/banking_data/loan_train.csv'
    )

    ### Import the loans file; 'None' and '?' are read as missing values
    loans_raw = pd.read_csv(source_csv, sep=';', na_values=['None', '?'])

    ### Date conversion
    return utils.date_conversion(loans_raw, 'date')

In [13]:
def clean_trans(test = False):
    """Read the transactions CSV (train or test split) and clean it.

    Steps:
      1. Load 'trans_test.csv' (``test`` truthy) or 'trans_train.csv',
         semicolon-separated, reading 'None' and '?' as missing values.
      2. Pass the frame through ``utils.date_conversion`` on 'date'.
      3. Pass it through ``utils.drop_null_columns`` with 0.4 (presumably the
         maximum tolerated share of nulls per column -- confirm in ``utils``).
      4. Fill missing 'operation' values with the most frequent operation.

    Step 4 is skipped when there is nothing to impute from: the 'operation'
    column was dropped in step 3, or it holds no non-null value (for example
    an empty split). Previously these cases raised ``KeyError`` /
    ``ValueError``.

    Args:
        test: Selects the test split when truthy, the train split otherwise.

    Returns:
        The cleaned transactions frame.
    """
    ### Import the trans file
    na_values = ['None', '?']
    if test:
        df = pd.read_csv('../../project/banking_data/trans_test.csv', sep=';', na_values=na_values)
    else:
        df = pd.read_csv('../../project/banking_data/trans_train.csv', sep=';', na_values=na_values)

    ### Convert date format
    trans = utils.date_conversion(df, 'date')

    ### Drop columns with too many missing values
    trans = utils.drop_null_columns(trans, 0.4)

    ### Replace null values from the operation column for the most common occurrence
    if 'operation' in trans.columns:
        operation_counts = trans['operation'].value_counts()
        ### value_counts() ignores nulls, so it is empty when nothing can be imputed
        if not operation_counts.empty:
            trans['operation'] = trans['operation'].fillna(operation_counts.idxmax())

    return trans

In [None]:
### Merge Data


def get_final_df(test=False):
    """Merge the cleaned tables into one frame keyed on the loans.

    Loans are the base table and every join is a left join, so each loan is
    kept. The chain is:

      loans -> accounts (account_id)
            -> account owners from disp.csv (account_id, rows with type 'OWNER')
            -> clients (client_id; the clients' own district_id is dropped so
               the account's district_id stays the only one)
            -> districts (district_id, renamed from the file's 'code ' column)
            -> transactions (account_id)

    Because an account has many transactions, the result holds one row per
    (loan, transaction) pair, not one row per loan.

    Previously every merge step was commented out and the function ended
    in ``return df`` with ``df`` never assigned, which raised ``NameError``
    (or returned a stale global ``df`` left in the kernel).

    Args:
        test: Forwarded to ``clean_loans`` and ``clean_trans`` to select the
            test split instead of the train split.

    Returns:
        The merged frame.

    Raises:
        KeyError: If disp.csv has no 'type' column to select owners from.
    """
    accounts = clean_account()
    clients = clean_client()
    districts = clean_district()
    loans = clean_loans(test)
    trans = clean_trans(test)
    disp = pd.read_csv('../../project/banking_data/disp.csv', sep=';')

    ### Merge accounts; both tables carry a 'date', so name each one explicitly
    df = loans.merge(accounts, how='left', on='account_id')
    df = df.rename(columns={'date_x': 'loan_date', 'date_y': 'account_date'})

    ### Merge disp, keeping only the owner of each account.
    ### The column is looked up case-insensitively, like the SQL filter it replaces.
    type_column = next((col for col in disp.columns if str(col).lower() == 'type'), None)
    if type_column is None:
        raise KeyError("disp.csv has no 'type' column to select account owners")
    owners = disp[disp[type_column] == 'OWNER']
    df = df.merge(owners, how='left', on='account_id')

    ### Merge clients (without their district_id, to avoid clashing with the account's)
    clients_no_district = clients.drop(columns=['district_id'], errors='ignore')
    df = df.merge(clients_no_district, how='left', on='client_id')

    ### Merge districts
    districts_keyed = districts.rename(columns={'code ': 'district_id'})
    df = df.merge(districts_keyed, how='left', on='district_id')

    ### Merge transactions; rename the columns that would clash with disp/loans
    trans_renamed = trans.rename(columns={'type': 'trans_type', 'amount': 'trans_ammount'})
    df = df.merge(trans_renamed, how='left', on='account_id')

    ### TODO: merge the cards on disp_id once their 'type' column is renamed
    ### (e.g. to 'type_card') and the issue date column has a final name.

    return df