In [1]:
import sklearn.preprocessing
import pandas as pd
import numpy as np


In [2]:
# Source tables, listed in the order in which they are later joined.
files = [
    "data/01_personal_data.csv",
    "data/02_bank_comm_data.csv",
    "data/03_bank_fin_data.csv",
    "data/04_bank_nbo_model_data.csv",
    "data/05_bank_cltv_model_data.csv",
    "data/06_bank_external_data.csv",
    "data/07_bank_transaction_data.csv",
    "data/08_bank_behavior_data.csv",
    "data/09_cluster_data.csv",
]

# Load every table, using the first CSV column of each file as its row index.
frames = []
for csv_path in files:
    frames.append(pd.read_csv(csv_path, index_col=0))

In [3]:
# Correct the indices of files 07 .. 09 (list positions 6 .. 8) by shifting them by +1.
# Presumably these files are 0-based while the others are 1-based -- TODO confirm.
# NOTE: not idempotent -- re-running this cell shifts the indices once more.
for late_frame in frames[6:9]:
    late_frame.index = late_frame.index + 1

In [4]:
# Join all loaded tables side by side (axis=1) into one wide frame.
data_joined = pd.concat(frames, axis=1)
# Drop the list of per-file frames so the kernel does not keep them referenced.
# After this line `frames` no longer exists, so earlier cells that use it cannot be re-run.
del frames # explicitly free memory (relevant in a long-lived notebook kernel)

In [5]:
# Generate simulated Sentiment and Sentiment_days_passed columns
# (drawn independently of all other variables).
#
# np.random.randint replaces the deprecated np.random.random_integers.
# randint's upper bound is exclusive, so it is one above the inclusive maximum.
# NOTE: the global NumPy RNG is not seeded here, so the values differ between runs;
# call np.random.seed(...) beforehand if reproducible output is required.

# One integer sentiment score in [-5, 5] per row of data_joined.
sentiment = pd.Series(np.random.randint(-5, 6, size=len(data_joined)), name="Sentiment")
# Shift the default 0-based labels to 1-based (assumed to match data_joined's index).
sentiment.index = sentiment.index + 1

# One integer in [1, 365] per row.
days_passed = pd.Series(np.random.randint(1, 366, size=len(data_joined)), name="Sentiment_days_passed")
days_passed.index = days_passed.index + 1

In [6]:
# Append the simulated Sentiment and Sentiment_days_passed series as new columns.
# NOTE: re-running this cell appends the same columns again.
data_joined = pd.concat([data_joined, sentiment, days_passed], axis=1)

In [7]:
# Build the latent score that drives the simulated event.
# Each entry is (column name, weight applied to the standardized column).
cor_data = [
    ("is_maried", 1),
    ("age", 1),
    ("work_experience", 1),
    ("children_number", 1),
    ("has_mortgage", 1),
    ("loan_products_nii", 1),
    ("checking_account_balance", 1),
    ("deposit_balance", 1),
    ("money_market_balance", 1),
    ("investable_asset_indicator", 1),
    ("Sentiment", 1),
    ("Sentiment_days_passed", 1),
]

# Start from the standardized Sentiment column with weight 2.0.
# NOTE(review): "Sentiment" is also listed in cor_data with weight 1, so its
# effective weight in the final score is 3.0 -- confirm this is intended.
sentiment_scaled = sklearn.preprocessing.scale(data_joined["Sentiment"].astype(float))
newVar = 2.0 * sentiment_scaled

# Accumulate the weighted, standardized value of every listed column.
for column, weight in cor_data:
    column_scaled = sklearn.preprocessing.scale(data_joined[column].astype(float))
    newVar = newVar + weight * column_scaled

In [8]:
# Simulated binary Event: latent score plus Gaussian noise (standard deviation 2.0),
# thresholded at a fixed fraction of the noisy score's range.
newEvent = newVar + 2.0*np.random.randn(len(newVar))

# The border sits `partition` of the way from the minimum towards the maximum:
# border = min + partition * (max - min).
# np.max / np.min are used instead of the builtins, which would iterate over
# the array element by element in Python.
partition = 0.4
border = partition*np.max(newEvent) + (1.0-partition)*np.min(newEvent)

# Event is 1 where the noisy score exceeds the border, otherwise 0.
Events = pd.Series((newEvent > border).astype(int), name="Event")
# Shift the default 0-based labels to 1-based before joining.
Events.index = Events.index + 1

In [9]:
# Simulated Day_Observed: latent score plus independent Gaussian noise
# (standard deviation 2.0), mapped linearly onto whole days.
newDO = newVar + 2.0*np.random.randn(len(newVar))

# Vectorized extremes, computed once (the builtins iterate element by element).
newDO_min = np.min(newDO)
newDO_max = np.max(newDO)

# Min-max normalize to [0, 1]; the highest score maps to 1.
newDO_normalized = (newDO - newDO_min)/(newDO_max - newDO_min)

# Map to days: normalized 0 -> 365, normalized 1 -> 90; astype(int) truncates the fraction.
Day_observed = pd.Series((365 - newDO_normalized*275).astype(int), name="Day_Observed")
# Shift the default 0-based labels to 1-based before joining.
Day_observed.index = Day_observed.index + 1

In [10]:
# Append the simulated Event and Day_Observed series as new columns.
# NOTE: re-running this cell appends the same columns again.
data_joined = pd.concat([data_joined, Events, Day_observed], axis=1)

In [11]:
# Add Next Best Offer (NBO) simulated variables.
# Each tuple holds an existing column name followed by the weights with which the
# standardized column contributes to each of the five NBO scores, in this order:
# "Investment_Discount", "HI_Deposit", "HI_MM", "Premium_account", "Refuses_all"
Offers_cor_data = [
    ("is_maried", -1, 1, 1, 1, 0.5),
    ("age", -1, 1, 1, 1, 0.5),
    ("work_experience", -1, 1, 1, 1, 0.5),
    ("children_number", 1, -1, 1, 1, 1),
    ("has_mortgage", 1, -1, 1, 1, 1),
    ("loan_products_nii", 1, -1, 1, 1, 0.5),
    ("checking_account_balance", 1, 1, -1, 1, 1),
    ("deposit_balance", 1, 1, -1, 1, 1),
    ("money_market_balance", 1, 1, -1, 1, 1),
    ("investable_asset_indicator", 1, 1, 1, -1, 1),
    ("Sentiment", 1, 1, 1, -1, 1),
    ("Sentiment_days_passed", 1, 1, 1, -1, 1),
]

# One row per NBO score, one column per row of data_joined.
NBO_Vars = np.zeros((5, len(data_joined)))

# Accumulate the weighted, standardized columns into every score row.
for offer_row in Offers_cor_data:
    column, weights = offer_row[0], offer_row[1:]
    # Standardize each column once and reuse it for all five scores.
    column_scaled = sklearn.preprocessing.scale(data_joined[column].astype(float))
    for k, weight in enumerate(weights):
        NBO_Vars[k] = NBO_Vars[k] + weight * column_scaled

# Add unit-variance Gaussian noise to every score, one draw per row, in row order.
for k in range(len(NBO_Vars)):
    NBO_Vars[k] = NBO_Vars[k] + 1*np.random.randn(NBO_Vars.shape[1])

In [12]:
# Keep only the winning offer per row instead of all five raw scores.
# The stored value is the row position of the largest score in NBO_Vars:
# 0 = "Investment_Discount", 1 = "HI_Deposit", 2 = "HI_MM",
# 3 = "Premium_account", 4 = "Refuses_all".
NBO_df = pd.DataFrame({"NBO_choosen": NBO_Vars.argmax(axis=0)})
# Shift the default 0-based labels to 1-based before joining.
NBO_df.index = NBO_df.index + 1
# NOTE: re-running this cell appends the same column again.
data_joined = pd.concat([data_joined, NBO_df], axis=1)

In [13]:
# Write the merged data set, including the simulated columns, to disk.
# The row index is written as the first CSV column (to_csv default).
data_joined.to_csv("data/merged_dataset_python.csv") # optionally pass na_rep="NA" to write missing values as "NA"

In [14]:
# Sanity check: share (in percent) of rows assigned to each NBO class.
# `val` holds the class indices that actually occur, `count` their frequencies.
val, count = np.unique(np.argmax(NBO_Vars, axis=0), return_counts=True)
# Convert absolute counts to percentages of all rows.
count = 100.0*count/len(NBO_Vars[0])
# Formatted through "%s %s" so the text is the same as the Python 2 statement
# `print val, count`, while also being valid syntax on Python 3.
print("%s %s" % (val, count))

[0 1 2 3 4] [ 29.8048  20.4158  26.266   19.2113   4.3021]
