In [1]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
import random
from operator import itemgetter
from sqlalchemy import create_engine

In [2]:
#%matplotlib inline

In [3]:
# Load the customer records (semicolon-delimited CSV) into `personal_data`.
# pd.read_csv is used because DataFrame.from_csv was deprecated in pandas 0.21
# and removed in 1.0; index_col=False keeps the default 0..n-1 row index.
filename = '../../data_marts/customer_data/uci_dataset/bank-full.csv'
personal_data = pd.read_csv(filename, sep=";", index_col=False)

In [4]:
# Load the per-state statistics (semicolon-delimited CSV) into `states_data`.
# pd.read_csv is used because DataFrame.from_csv was deprecated in pandas 0.21
# and removed in 1.0; index_col=False keeps the default 0..n-1 row index.
filename = '../../data_marts/customer_data/us_states_stats.csv'
states_data = pd.read_csv(filename, sep=";", index_col=False)

In [5]:
# Work on a copy of the first eight columns so the raw frame stays untouched.
# .iloc selects by position explicitly; plain [] with integers only worked
# through a positional fallback that newer pandas no longer provides.
personal_data2 = personal_data.iloc[:, :8].copy()

# Placeholder columns that the following cells fill in.
# `int` replaces the np.int alias, which was removed in NumPy 1.24.
personal_data2['income'] = np.zeros(len(personal_data2), dtype=int)
personal_data2['state'] = ''
personal_data2['gender'] = ''

# Show the number of customers as the cell output.
personal_data2.shape[0]

45211

In [6]:
# generate relevant state data
#
# Every customer is assigned the state with the highest score, where
#   score = uniform random draw
#           + age_weight    * person_age_coef    * state_age_coef
#           + income_weight * person_income_coef * state_income_coef
# so customers are nudged towards states whose age / income profile has the
# same sign as their own, while the random draw keeps the assignment noisy.
min_st_age = min(states_data['median_age'])
max_st_age = max(states_data['median_age'])
min_st_inc = min(states_data['per_capita_income'])
max_st_inc = max(states_data['per_capita_income'])
min_balance = -500
max_balance = 2000
min_person_age = 25
max_person_age = 55
age_weight = 0.3
income_weight = 0.3

random.seed(413) #412

# Rescale each state's median age and per-capita income to [-1; +1],
# centred on the midpoint between the smallest and largest value.
state_coef = []
for cur_state in states_data.to_dict(orient="records"):
    state_age_coef = 2 * (cur_state['median_age']-(min_st_age+max_st_age)/2.0) / (max_st_age - min_st_age)
    state_income_coef = 2 * (cur_state['per_capita_income']-(min_st_inc+max_st_inc)/2.0) / (max_st_inc - min_st_inc)
    state_coef.append({"name": cur_state['State'], "age": state_age_coef, "income": state_income_coef})

# Iterate over the two needed columns directly instead of building a one-row
# frame per customer, and collect the results so the column is written once
# (DataFrame.set_value was removed in pandas 1.0). The order of the
# random.random() calls is unchanged, so a given seed still yields the same
# assignment.
assigned_states = []
for age, balance in zip(personal_data2['age'], personal_data2['balance']):
    # trim extremal values
    balance = min(max(balance, min_balance), max_balance)
    age = min(max(age, min_person_age), max_person_age)

    # could be in range [-1; +1]
    person_age_coef = 2 * (age-(min_person_age + max_person_age)/2.0) / (max_person_age - min_person_age)
    person_income_coef = 2 * (balance-(min_balance + max_balance)/2.0) / (max_balance - min_balance)

    state = []
    for coef in state_coef:
        bonus_by_age = age_weight * person_age_coef * coef['age']
        bonus_by_income = income_weight * person_income_coef * coef['income']
        state.append( {"name": coef['name'], "value": random.random()+bonus_by_age+bonus_by_income } )
    # max() keeps the first state on ties
    assigned_states.append(max(state, key=itemgetter('value'))['name'])

# Positional assignment; personal_data2 carries the default 0..n-1 index,
# so this matches writing row label i for the i-th customer.
personal_data2['state'] = assigned_states

In [7]:
# generate gender data
#
# A customer is labelled 'F' when (uniform draw - income bonus) exceeds
# base_male_percentage, otherwise 'M'. The bonus grows with the (trimmed)
# balance, so higher balances shift the outcome towards 'M'.
min_balance = -500
max_balance = 2000
income_weight = 0.5
base_male_percentage = 0.55

random.seed(414)

# Iterate over the balance column directly and write the result once:
# DataFrame.set_value was removed in pandas 1.0, and slicing a one-row frame
# per customer is needlessly slow. One random.random() call per customer, in
# row order, exactly as before.
genders = []
for balance in personal_data2['balance']:
    # trim extremal values
    balance = min(max(balance, min_balance), max_balance)

    # could be in range [-1; +1]
    person_income_coef = 2 * (balance-(min_balance + max_balance)/2.0) / (max_balance - min_balance)

    bonus_by_income = income_weight * person_income_coef
    genders.append('F' if (random.random() - bonus_by_income) > base_male_percentage else 'M')

# Positional assignment; personal_data2 carries the default 0..n-1 index.
personal_data2['gender'] = genders


In [8]:
# generate income data
#
# income = state per-capita income
#          + person_income_coef * random_spread * uniform draw
# where person_income_coef is the trimmed balance rescaled to [-1; +1].
min_balance = -500
max_balance = 2000
random_spread = 20000

# State name -> per-capita income. setdefault keeps the first row for a
# repeated name, matching a top-to-bottom scan that stops at the first hit.
state_avg_income = {}
for state_name, avg_income in zip(states_data['State'], states_data['per_capita_income']):
    state_avg_income.setdefault(state_name, avg_income)

random.seed(416)
incomes = []
for balance, state_name in zip(personal_data2['balance'], personal_data2['state']):
    #find state average
    if state_name not in state_avg_income:
        # Fail loudly instead of silently reusing the previous customer's
        # state average (run the state-assignment cell first).
        raise KeyError("no per_capita_income for state %r" % (state_name,))
    st_avg_income = state_avg_income[state_name]

    # trim extremal values
    balance = min(max(balance, min_balance), max_balance)

    # could be in range [-1; +1]
    person_income_coef = 2 * (balance-(min_balance + max_balance)/2.0) / (max_balance - min_balance)

    income = person_income_coef * random_spread * random.random() + st_avg_income

    # 'income' is created as an integer column earlier in the notebook;
    # truncate explicitly so the column keeps that dtype (this is what storing
    # a float into an integer array did implicitly).
    incomes.append(int(income))

# Written once, positionally: DataFrame.set_value was removed in pandas 1.0.
# personal_data2 carries the default 0..n-1 index.
personal_data2['income'] = incomes

In [9]:
# Alternative (disabled): treat a person who has a loan as a credit_card buyer
#personal_data2.rename(columns = {'loan':'credit_card'}, inplace = True)

In [15]:
# generate credit_card buyer
#
# A customer buys a credit card ('yes') when a uniform draw plus the bonuses
# below exceeds card_treshold; otherwise 'no'.
gender_bonus = 0.1
balance_bonus = 0.2
state_bonus = 0.2
card_treshold = 0.7

random.seed(417)

# Iterate over the needed columns directly and write the new column once:
# DataFrame.set_value was removed in pandas 1.0, and slicing a one-row frame
# per customer is needlessly slow. One random.random() call per customer, in
# row order, exactly as before.
credit_cards = []
for gender, balance, state in zip(personal_data2['gender'],
                                  personal_data2['balance'],
                                  personal_data2['state']):
    bonus = 0.0
    if gender == 'F': bonus += gender_bonus
    if balance > 750: bonus += balance_bonus
    if 'M' < state < 'O': bonus += state_bonus # bonus to M* and N* states

    credit_cards.append('yes' if (random.random() + bonus) > card_treshold else 'no')

# Positional assignment; personal_data2 carries the default 0..n-1 index.
personal_data2['credit_card'] = credit_cards

In [11]:
# A person who does NOT have a housing loan is treated as a debt_funds buyer:
# invert the yes/no flag in 'housing', then rename the column to 'debt_funds'.
if 'housing' in personal_data2.columns:
    # Compute both masks before writing, so the swap needs no temporary
    # sentinel value and neither assignment can see the other's result.
    has_housing_loan = personal_data2['housing'] == "yes"
    no_housing_loan = personal_data2['housing'] == "no"
    personal_data2.loc[has_housing_loan, 'housing'] = "no"
    personal_data2.loc[no_housing_loan, 'housing'] = "yes"
    personal_data2.rename(columns = {'housing':'debt_funds'}, inplace = True)
elif 'debt_funds' not in personal_data2.columns:
    # Neither the source nor the target column exists: nothing sensible to do.
    raise KeyError("personal_data2 has neither a 'housing' nor a 'debt_funds' column")
# else: the column was already converted, so re-running this cell is a no-op
# instead of failing on the missing 'housing' column.

In [16]:
# Export the generated customers to the `pci_customers` table of the `pci`
# PostgreSQL database, replacing any previous version of the table.
engine = create_engine('postgresql+psycopg2:///pci?user=postgres')

# if_exists='replace' makes to_sql drop an existing table before writing, so
# no separate "DROP TABLE IF EXISTS" statement (and no Engine.execute call,
# which SQLAlchemy 2.0 removed) is needed. The frame's index is written too
# (to_sql default index=True).
personal_data2.to_sql('pci_customers', engine, if_exists='replace')

In [13]:
def _sql_literal(value):
    """Render a Python value as literal text for a SQL statement.

    Strings are wrapped in single quotes with embedded single quotes doubled,
    None becomes NULL, and anything else is rendered with str().
    """
    if value is None:
        return 'NULL'
    if isinstance(value, str):
        return "'" + value.replace("'", "''") + "'"
    return str(value)


def insert_dict(d, table_name, engine):
    """Insert one row into `table_name`.

    Args:
        d: mapping of column name -> value for the new row.
        table_name: name of the target table.
        engine: object exposing `execute(sql_string)`.

    Returns:
        Whatever `engine.execute` returns.

    Note:
        `table_name` and the column names are interpolated verbatim, so they
        must come from trusted code, never from user input.
    """
    # Built with join() rather than `d.keys() + values`, which raises
    # TypeError on Python 3 (dict views cannot be concatenated to a list).
    columns = ','.join(str(column) for column in d.keys())
    values = ','.join(_sql_literal(value) for value in d.values())
    command = "insert into " + table_name + " (" + columns + ") values (" + values + ");"
    return engine.execute(command)


def update_dict(d, table_name, engine):
    """Update the row of `table_name` whose `index` column equals d['index'].

    Every key of `d` (including 'index' itself) is written in the SET clause.

    Args:
        d: mapping of column name -> new value; must contain the key 'index'.
        table_name: name of the target table.
        engine: object exposing `execute(sql_string)`.

    Returns:
        Whatever `engine.execute` returns.

    Raises:
        KeyError: if `d` has no 'index' key.

    Note:
        `table_name` and the column names are interpolated verbatim, so they
        must come from trusted code, never from user input.
    """
    assignments = ','.join('%s=%s' % (column, _sql_literal(value)) for column, value in d.items())
    command = "update " + table_name + " set " + assignments + " where index=" + str(d['index'])
    return engine.execute(command)

In [14]:
#customer_data = [{
#        'index':2,
#        'name':'George Martin',
#        'user_screen_name':'gmartin',
#        'klout_score':51,
#        'followers_count':786,
#        'got_reply':1,
#        'segment':3
#    },{
#        'index':3,
#        'name':'Katie Hopkins',
#        'user_screen_name':'khopkins',
#        'klout_score':31,
#        'followers_count':315,
#        'got_reply':1,
#        'segment':4
#    }]
#
#for c in customer_data:
#    update_dict(c, 'customers', engine)