In [279]:
import pandas as pd
import pickle as pkl
import numpy as np

from collections import defaultdict

from sklearn.model_selection import train_test_split

In [2]:
from fastai.basic_train import load_learner

In [3]:
# pull dataframe for categorical variables, convert to entity embedding using model weights. e.g. SKU1_embed
# combine grouped columns. i.e. SKU1_embed, SKU2_embed, etc. into SKU_embed
# drop original categorical variables
# concat numerical and embeded columns

# to convert categorical values to their embedded vectors
# convert categorical value -> index using cat_dict, then check the LUT (embed_map) to find the embedded vector

In [4]:
# for the numerical variables, we want to take the values from inner_train_df, because in the process of 
# creating the dataset for entity embedding training, fastai normalized the numerical values
# making them easier to works with durint neural network training

In [5]:
# pull dataframe for categorical variables
order_category_df = pd.read_csv('ivr_backup_data/order_category_df.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [70]:
order_category_df.shape

(244134, 51)

In [7]:
columns = order_category_df.columns.tolist()    # only thing we need order_category_df for are the column names

In [8]:
columns

['MODEL1',
 'MAKE2',
 'BILLING_STATE',
 'TYPE3',
 'TYPE2',
 'BLACKLIST_EMAIL',
 'YELLOW_SKU',
 'MODEL4',
 'TPF_HIGH_YELLOW',
 'ACCOUNT_NUMBER',
 'EXTERNAL_CREDIT_CHECK_DONE',
 'IS_EXISTING_CUSTOMER',
 'FRAUDNET_RESULT',
 'MAKE4',
 'LINE_LIMIT_REACHED',
 'MAKE1',
 'IDA_RESULT',
 'EXPERIAN_RESPONSE_1',
 'EXPERIAN_RESPONSE_2',
 'MODEL2',
 'TYPE1',
 'SHIPPING_STATE',
 'SALES_CHANNEL',
 'TYPE4',
 'MAKE5',
 'MODEL5',
 'DEVICE_AT_HOME',
 'SSN_BLACKLISTED',
 'MAKE3',
 'TYPE5',
 'DENIAL_OF_SERVICES',
 'FPF_HIGH_YELLOW',
 'SHIPPINGMETHOD',
 'INTERNAL_CREDIT_CHECK_DONE',
 'ORDER_EXCHANGE',
 'MODEL3',
 'SKU1',
 'SKU2',
 'SKU3',
 'SKU4',
 'SKU5',
 'BYOD1',
 'BYOD2',
 'BYOD3',
 'BYOD4',
 'BYOD5',
 'CARRIER1',
 'CARRIER2',
 'CARRIER3',
 'CARRIER4',
 'CARRIER5']

In [9]:
# order_category_df.head()

In [10]:
# pull out the entity embedding map
path = '/mnt/azmnt/code/Users/bho829/IVR/order'
learn = load_learner(path)

with open('ivr_backup_data/cat_dict.pkl', 'rb') as cat_dict_file:
    cat_dict = pkl.load(cat_dict_file)

inner_train_df = pd.read_csv('ivr_backup_data/inner_train_df.csv')

In [11]:
inner_train_df.head()
# embeddings

Unnamed: 0,IDA_RESULT,MAKE1,EXPERIAN_RESPONSE_1,SHIPPINGMETHOD,MAKE4,MAKE5,MODEL5,EXPERIAN_RESPONSE_2,ACCOUNT_NUMBER,FRAUDNET_RESULT,...,PREORDER_LINES,ALLOWED_LINES,APPROVED_LINES,EXTERNAL_APPROVED_LINES,FIRST_PARTY_ID_SCORE,THIRD_PARTY_ID_SCORE,PRICE1,PRICE2,ORDER_CREATED_DATE,ACTIVE_WATCH_RESULT
0,GREEN,Apple,,2Days,,,,,2608741079,,...,-0.193577,-0.128534,-0.751217,1.7287,0.25469,0.699868,-0.180481,-0.36777,0.60058,1
1,GREEN,Apple,,2Days,,,,,2541993372,,...,-0.193577,0.70618,-0.751217,-0.885189,0.465524,0.708396,1.040141,-0.36777,0.318066,1
2,GREEN,Apple,,2Days,Apple,,,,2676329899,,...,-0.193577,0.70618,1.383215,-0.885189,-1.056148,-0.630464,-0.180481,1.138291,0.547005,3
3,GREEN,Apple,,2Days,,,,,1442915580,,...,-0.193577,0.70618,1.383215,-0.885189,0.034689,0.009119,-1.672325,-0.36777,0.663532,1
4,GREEN,Apple,,2Days,,,,,1368037653,,...,-0.193577,-1.797961,-0.751217,0.683144,-0.588646,-0.852186,-0.45173,-0.36777,0.031195,1


In [12]:
inner_train_df.shape

(239276, 82)

In [13]:
# user inner_train_df indices to select the rows from order_category_df that were used in the train dataset
# don't use values from inner_train_df directly, because they may have been normalized
train_category_df = order_category_df.loc[inner_train_df.index]

In [14]:
train_category_df.shape

(239276, 51)

In [15]:
cat_key_map = defaultdict(dict)
for cat_var in columns:
    var_categories = train_category_df[cat_var].astype('category').cat.as_ordered()
    cat_keys = np.concatenate([['#na#'],var_categories.cat.categories.values])
    key_index = 0
    for key_value in cat_keys:
        cat_key_map[cat_var].update({key_value: key_index})
        key_index += 1


In [16]:
# cat_key_map['BILLING_STATE']

In [17]:
cat_var = columns[0]
cat_var

'MODEL1'

In [18]:
embed_map = defaultdict(dict)
for cat_var in columns:
    embed_index = cat_dict[cat_var]
    cat_weights = learn.model.embeds[embed_index].weight.data.cpu().numpy()
    for index in range(cat_weights.shape[0]):
        embed_map[cat_var].update({index:cat_weights[index]})

In [19]:
embed_map['MODEL1']

{0: array([ 4.18e-43,  3.03e-43, -3.62e-43,  2.96e-43, -3.77e-43,  3.76e-43,  2.84e-43,  3.83e-43,  3.01e-43,  3.28e-43,
         4.12e-43, -3.12e-43, -3.29e-43,  3.83e-43,  3.70e-43, -4.37e-43,  3.70e-43], dtype=float32),
 1: array([ 6.667283e-05, -9.487361e-02, -1.147027e-01,  3.099865e-02,  8.956190e-02, -1.579541e-01,  5.117259e-02,
         1.191606e-01,  6.121391e-02,  2.169255e-01, -1.810426e-01,  1.205197e-02, -8.267474e-02,  4.906839e-02,
        -5.681212e-02, -1.687244e-01,  2.807344e-01], dtype=float32),
 2: array([-0.101972, -0.020438, -0.184858, -0.432412,  0.24198 , -0.086668, -0.183148,  0.213148, -0.066549,  0.002132,
        -0.13719 , -0.349538, -0.026312,  0.027046, -0.055758, -0.232032, -0.047163], dtype=float32),
 3: array([-0.160554,  0.018824, -0.197617, -0.161394,  0.202792, -0.12929 ,  0.001676,  0.143228, -0.068347, -0.018393,
        -0.14964 , -0.160988, -0.077367, -0.009688,  0.011377, -0.167885,  0.071836], dtype=float32),
 4: array([-8.399764e-02,  1.555

In [20]:
# train_category_df[cat_var] = train_category_df[cat_var].astype('category').cat.as_ordered()
# cat_keys = np.concatenate([['#na#'],train_category_df[cat_var].cat.categories.values])


In [21]:
def convert_to_embed(cat_var, cat_value):
    if cat_value =='nan' :
        cat_index = 0
    else:
        try:
            cat_index = cat_key_map[cat_var][cat_value]
        except:
            cat_index = 0

    embedded = embed_map[cat_var][cat_index]
    
    return embedded

In [22]:
category_embed_df = train_category_df.copy()

In [23]:
# these are the grouped fields. They start with 
grouped = ['SKU','BYOD','CARRIER','MODEL','TYPE','MAKE']

In [24]:
def in_group(cat_var):
    for grouped_prefix in grouped:
        if cat_var.startswith(grouped_prefix):
            return True
    return False

In [25]:
single_cats = [cat_var for cat_var in columns if not in_group(cat_var)]

In [26]:
single_cats

['BILLING_STATE',
 'BLACKLIST_EMAIL',
 'YELLOW_SKU',
 'TPF_HIGH_YELLOW',
 'ACCOUNT_NUMBER',
 'EXTERNAL_CREDIT_CHECK_DONE',
 'IS_EXISTING_CUSTOMER',
 'FRAUDNET_RESULT',
 'LINE_LIMIT_REACHED',
 'IDA_RESULT',
 'EXPERIAN_RESPONSE_1',
 'EXPERIAN_RESPONSE_2',
 'SHIPPING_STATE',
 'SALES_CHANNEL',
 'DEVICE_AT_HOME',
 'SSN_BLACKLISTED',
 'DENIAL_OF_SERVICES',
 'FPF_HIGH_YELLOW',
 'SHIPPINGMETHOD',
 'INTERNAL_CREDIT_CHECK_DONE',
 'ORDER_EXCHANGE']

In [36]:
single_cats.remove('ACCOUNT_NUMBER')   # don't carry ACCOUNT_NUMBER into the input data set

In [27]:
embed_var = 'MODEL1'

In [28]:
categorical_embed_df = pd.DataFrame()

In [29]:
for single_var in single_cats:
    print('single_var={}'.format(single_var))
    embed_var = single_var + '_embed'
    categorical_embed_df[embed_var] = train_category_df[single_var].apply(lambda x: convert_to_embed(single_var, str(x)))

single_var=BILLING_STATE
single_var=BLACKLIST_EMAIL
single_var=YELLOW_SKU
single_var=TPF_HIGH_YELLOW
single_var=ACCOUNT_NUMBER
single_var=EXTERNAL_CREDIT_CHECK_DONE
single_var=IS_EXISTING_CUSTOMER
single_var=FRAUDNET_RESULT
single_var=LINE_LIMIT_REACHED
single_var=IDA_RESULT
single_var=EXPERIAN_RESPONSE_1
single_var=EXPERIAN_RESPONSE_2
single_var=SHIPPING_STATE
single_var=SALES_CHANNEL
single_var=DEVICE_AT_HOME
single_var=SSN_BLACKLISTED
single_var=DENIAL_OF_SERVICES
single_var=FPF_HIGH_YELLOW
single_var=SHIPPINGMETHOD
single_var=INTERNAL_CREDIT_CHECK_DONE
single_var=ORDER_EXCHANGE


In [30]:
# combine group variables by simple addition
# always use xxx1 as the weight map. because it is the most complete
# if xxxn has values not in xxx1, use index=0
for group_var in grouped:
    base_var = group_var + str(1)
    for group_index in range(5):
        var = group_var + str(group_index + 1)
        categorical_embed_df[var] = train_category_df[var].apply(lambda x: convert_to_embed(base_var, str(x)))

In [31]:
categorical_embed_df['SKU2'].head()

0    [-0.016524423, -0.02465348, 0.0017778829, -0.0...
1    [3.22e-43, 4.25e-43, 3.74e-43, -3.53e-43, 3.57...
2    [3.22e-43, 4.25e-43, 3.74e-43, -3.53e-43, 3.57...
3    [3.22e-43, 4.25e-43, 3.74e-43, -3.53e-43, 3.57...
4    [3.22e-43, 4.25e-43, 3.74e-43, -3.53e-43, 3.57...
Name: SKU2, dtype: object

In [32]:
def sum_embedded(group_list):
    group_sum = np.sum(group_list, axis=0)
    return group_sum

In [33]:
for group_var in grouped:
    group_var_embed = group_var + '_embed'
    categorical_embed_df[group_var_embed] = categorical_embed_df.apply(lambda x: sum_embedded([x[group_var + str(1)],
                                                                x[group_var + str(2)],
                                                                x[group_var + str(3)],
                                                                x[group_var + str(4)],
                                                                x[group_var + str(5)] ]), axis=1)


In [39]:
# categorical_embed_df.columns

In [40]:
# categorical_embed_df['ACCOUNT_NUMBER_embed'].head()

In [41]:
categorical_embed_df.drop(columns=['ACCOUNT_NUMBER_embed'], inplace=True)

In [180]:
categorical_embed_df.iloc[0]['SKU_embed'].shape

(54,)

In [42]:
categorical_embed_df.columns

Index(['BILLING_STATE_embed', 'BLACKLIST_EMAIL_embed', 'YELLOW_SKU_embed',
       'TPF_HIGH_YELLOW_embed', 'EXTERNAL_CREDIT_CHECK_DONE_embed',
       'IS_EXISTING_CUSTOMER_embed', 'FRAUDNET_RESULT_embed',
       'LINE_LIMIT_REACHED_embed', 'IDA_RESULT_embed',
       'EXPERIAN_RESPONSE_1_embed', 'EXPERIAN_RESPONSE_2_embed',
       'SHIPPING_STATE_embed', 'SALES_CHANNEL_embed', 'DEVICE_AT_HOME_embed',
       'SSN_BLACKLISTED_embed', 'DENIAL_OF_SERVICES_embed',
       'FPF_HIGH_YELLOW_embed', 'SHIPPINGMETHOD_embed',
       'INTERNAL_CREDIT_CHECK_DONE_embed', 'ORDER_EXCHANGE_embed', 'SKU1',
       'SKU2', 'SKU3', 'SKU4', 'SKU5', 'BYOD1', 'BYOD2', 'BYOD3', 'BYOD4',
       'BYOD5', 'CARRIER1', 'CARRIER2', 'CARRIER3', 'CARRIER4', 'CARRIER5',
       'MODEL1', 'MODEL2', 'MODEL3', 'MODEL4', 'MODEL5', 'TYPE1', 'TYPE2',
       'TYPE3', 'TYPE4', 'TYPE5', 'MAKE1', 'MAKE2', 'MAKE3', 'MAKE4', 'MAKE5',
       'SKU_embed', 'BYOD_embed', 'CARRIER_embed', 'MODEL_embed', 'TYPE_embed',
       'MAKE_embed'],

In [43]:
for group_var in grouped:
#     print(group_var)
    categorical_embed_df.drop(columns=[group_var + str(1),group_var + str(2),group_var + str(3),group_var + str(4),group_var + str(5)], inplace=True)

In [44]:
categorical_embed_df.columns

Index(['BILLING_STATE_embed', 'BLACKLIST_EMAIL_embed', 'YELLOW_SKU_embed',
       'TPF_HIGH_YELLOW_embed', 'EXTERNAL_CREDIT_CHECK_DONE_embed',
       'IS_EXISTING_CUSTOMER_embed', 'FRAUDNET_RESULT_embed',
       'LINE_LIMIT_REACHED_embed', 'IDA_RESULT_embed',
       'EXPERIAN_RESPONSE_1_embed', 'EXPERIAN_RESPONSE_2_embed',
       'SHIPPING_STATE_embed', 'SALES_CHANNEL_embed', 'DEVICE_AT_HOME_embed',
       'SSN_BLACKLISTED_embed', 'DENIAL_OF_SERVICES_embed',
       'FPF_HIGH_YELLOW_embed', 'SHIPPINGMETHOD_embed',
       'INTERNAL_CREDIT_CHECK_DONE_embed', 'ORDER_EXCHANGE_embed', 'SKU_embed',
       'BYOD_embed', 'CARRIER_embed', 'MODEL_embed', 'TYPE_embed',
       'MAKE_embed'],
      dtype='object')

In [45]:
categorical_embed_df.shape

(239276, 26)

In [46]:
order_num_df = pd.read_csv('ivr_backup_data/order_num_df.csv')

In [47]:
numerical_features = order_num_df.columns.tolist()

In [48]:
train_num_df = inner_train_df[numerical_features]

In [49]:
train_num_df.shape

(239276, 30)

In [50]:
train_num_df.columns

Index(['FRAUDNET_SCORE', 'TOTAL_PRICE', 'PRICE3', 'PRICE4', 'PRICE5',
       'NUM_PORTIN', 'ACNT_BILL_FNAME_MATCHES', 'ACNT_BILL_LNAME_MATCHES',
       'ACNT_SHIP_FNAME_MATCHES', 'ACNT_SHIP_LNAME_MATCHES',
       'SHIP_BILL_FNAME_MATCHES', 'SHIP_BILL_LNAME_MATCHES',
       'BILL_SHIP_ADDR_MATCHES', 'EMAIL', 'IP_ADDRESS', 'PHONE_MATCHES',
       'NUM_BYOD', 'INSTALLMENT_AMOUNT', 'ONETIMECHARGE',
       'MONTHLYRECURRINGCHARGE', 'BYOD_RETRY_COUNT', 'PREORDER_LINES',
       'ALLOWED_LINES', 'APPROVED_LINES', 'EXTERNAL_APPROVED_LINES',
       'FIRST_PARTY_ID_SCORE', 'THIRD_PARTY_ID_SCORE', 'PRICE1', 'PRICE2',
       'ORDER_CREATED_DATE'],
      dtype='object')

In [92]:
# train_num_df['ORDER_CREATED_DATE'].head()

In [51]:
train_num_df.iloc[0]['TOTAL_PRICE'],train_num_df.iloc[0]['PRICE1'],train_num_df.iloc[0]['PRICE2'],train_num_df.iloc[0]['PRICE3'],train_num_df.iloc[0]['PRICE4'],train_num_df.iloc[0]['PRICE5']

(-0.12088332448448218,
 -0.18048066866541132,
 -0.36777034283107823,
 -0.15598703772019107,
 -0.08757228165958053,
 -0.04074294517591658)

In [52]:
order_num_df.iloc[0]['TOTAL_PRICE'],order_num_df.iloc[0]['PRICE1'],order_num_df.iloc[0]['PRICE2'],order_num_df.iloc[0]['PRICE3'],order_num_df.iloc[0]['PRICE4'],order_num_df.iloc[0]['PRICE5']

(1554.6, 599.99, 599.99, 119.99, 119.99, 0.0)

In [53]:
train_num_df.head()

Unnamed: 0,FRAUDNET_SCORE,TOTAL_PRICE,PRICE3,PRICE4,PRICE5,NUM_PORTIN,ACNT_BILL_FNAME_MATCHES,ACNT_BILL_LNAME_MATCHES,ACNT_SHIP_FNAME_MATCHES,ACNT_SHIP_LNAME_MATCHES,...,BYOD_RETRY_COUNT,PREORDER_LINES,ALLOWED_LINES,APPROVED_LINES,EXTERNAL_APPROVED_LINES,FIRST_PARTY_ID_SCORE,THIRD_PARTY_ID_SCORE,PRICE1,PRICE2,ORDER_CREATED_DATE
0,-0.325847,-0.120883,-0.155987,-0.087572,-0.040743,-1.061545,0.416314,0.365419,0.348549,0.322801,...,-0.428399,-0.193577,-0.128534,-0.751217,1.7287,0.25469,0.699868,-0.180481,-0.36777,0.60058
1,-0.325847,-1.040878,-0.155987,-0.087572,-0.040743,0.240897,0.416314,0.365419,0.348549,0.322801,...,2.33426,-0.193577,0.70618,-0.751217,-0.885189,0.465524,0.708396,1.040141,-0.36777,0.318066
2,-0.325847,-0.362564,2.590948,-0.087572,-0.040743,4.148221,0.416314,0.365419,0.348549,0.322801,...,2.33426,-0.193577,0.70618,1.383215,-0.885189,-1.056148,-0.630464,-0.180481,1.138291,0.547005
3,-0.325847,-1.040878,-0.155987,-0.087572,-0.040743,0.240897,0.416314,0.365419,0.348549,0.322801,...,2.33426,-0.193577,0.70618,1.383215,-0.885189,0.034689,0.009119,-1.672325,-0.36777,0.663532
4,-0.325847,-0.037609,-0.155987,-0.087572,-0.040743,0.240897,-2.40202,0.365419,0.348549,0.322801,...,-0.428399,-0.193577,-1.797961,-0.751217,0.683144,-0.588646,-0.852186,-0.45173,-0.36777,0.031195


In [71]:
# lets drop PRICEn and keep only TOATL_PRICE
train_num_df.drop(columns=['PRICE1','PRICE2','PRICE3','PRICE4','PRICE5'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [72]:
train_num_df.columns

Index(['FRAUDNET_SCORE', 'TOTAL_PRICE', 'NUM_PORTIN',
       'ACNT_BILL_FNAME_MATCHES', 'ACNT_BILL_LNAME_MATCHES',
       'ACNT_SHIP_FNAME_MATCHES', 'ACNT_SHIP_LNAME_MATCHES',
       'SHIP_BILL_FNAME_MATCHES', 'SHIP_BILL_LNAME_MATCHES',
       'BILL_SHIP_ADDR_MATCHES', 'EMAIL', 'IP_ADDRESS', 'PHONE_MATCHES',
       'NUM_BYOD', 'INSTALLMENT_AMOUNT', 'ONETIMECHARGE',
       'MONTHLYRECURRINGCHARGE', 'BYOD_RETRY_COUNT', 'PREORDER_LINES',
       'ALLOWED_LINES', 'APPROVED_LINES', 'EXTERNAL_APPROVED_LINES',
       'FIRST_PARTY_ID_SCORE', 'THIRD_PARTY_ID_SCORE', 'ORDER_CREATED_DATE'],
      dtype='object')

In [73]:
embed_df = pd.concat([categorical_embed_df,train_num_df], axis=1)

In [74]:
embed_df.shape

(239276, 51)

In [75]:
embed_columns = embed_df.columns
embed_columns

Index(['BILLING_STATE_embed', 'BLACKLIST_EMAIL_embed', 'YELLOW_SKU_embed',
       'TPF_HIGH_YELLOW_embed', 'EXTERNAL_CREDIT_CHECK_DONE_embed',
       'IS_EXISTING_CUSTOMER_embed', 'FRAUDNET_RESULT_embed',
       'LINE_LIMIT_REACHED_embed', 'IDA_RESULT_embed',
       'EXPERIAN_RESPONSE_1_embed', 'EXPERIAN_RESPONSE_2_embed',
       'SHIPPING_STATE_embed', 'SALES_CHANNEL_embed', 'DEVICE_AT_HOME_embed',
       'SSN_BLACKLISTED_embed', 'DENIAL_OF_SERVICES_embed',
       'FPF_HIGH_YELLOW_embed', 'SHIPPINGMETHOD_embed',
       'INTERNAL_CREDIT_CHECK_DONE_embed', 'ORDER_EXCHANGE_embed', 'SKU_embed',
       'BYOD_embed', 'CARRIER_embed', 'MODEL_embed', 'TYPE_embed',
       'MAKE_embed', 'FRAUDNET_SCORE', 'TOTAL_PRICE', 'NUM_PORTIN',
       'ACNT_BILL_FNAME_MATCHES', 'ACNT_BILL_LNAME_MATCHES',
       'ACNT_SHIP_FNAME_MATCHES', 'ACNT_SHIP_LNAME_MATCHES',
       'SHIP_BILL_FNAME_MATCHES', 'SHIP_BILL_LNAME_MATCHES',
       'BILL_SHIP_ADDR_MATCHES', 'EMAIL', 'IP_ADDRESS', 'PHONE_MATCHES',
       'NU

In [79]:
# combine all features into one large vector (51 individual variables)
# this is equivalent to a single word in an NLP application
def concat_all_fields(row):
#     one_vector = np.array([])
    all_column_array = []
    for column in embed_columns:
        all_column_array.append(row[column])
        
    one_vector = np.hstack(all_column_array)
        
#     print('one_vector={}'.format(one_vector))
    return one_vector

In [225]:
# train_category_df['ACCOUNT_NUMBER'].head()

In [76]:
embed_one_vector['ACCOUNT_NUMBER'] = train_category_df.loc[:, ['ACCOUNT_NUMBER']]
# embed_one_vector = pd.DataFrame(columns=['ACCOUNT_NUMBER'])
# embed_one_vector['ACCOUNT_NUMBER'] = train_category_df['ACCOUNT_NUMBER']

In [77]:
embed_one_vector.head()

Unnamed: 0,ACCOUNT_NUMBER,embedded
0,716308354,"[-0.07455428689718246, 0.01656944677233696, -0..."
1,2387138088,"[-0.07455428689718246, 0.01656944677233696, -0..."
2,2297289602,"[-0.07455428689718246, 0.01656944677233696, -0..."
3,2747556495,"[-0.07455428689718246, 0.01656944677233696, -0..."
4,3159523017,"[-0.07455428689718246, 0.01656944677233696, -0..."


In [78]:
embed_one_vector.shape

(239276, 2)

In [256]:
concat_value = concat_all_fields(embed_df.iloc[0])

concat_value

array([-0.074554,  0.016569, -0.032478, -0.034625, ...,  1.7287  ,  0.25469 ,  0.699868,  0.60058 ])

In [80]:
# one embedded is equuvalent to a word
embed_one_vector['embedded'] = embed_df.apply(lambda x: concat_all_fields(x), axis=1)

In [82]:
embed_one_vector.iloc[0]['embedded']

array([-0.074554,  0.016569, -0.032478, -0.034625, ...,  1.7287  ,  0.25469 ,  0.699868,  0.60058 ])

In [110]:
word_dim = len(embed_one_vector.iloc[0]['embedded'])
word_dim

263

In [84]:
embed_one_vector.shape

(239276, 2)

In [85]:
embed_one_vector.head()

Unnamed: 0,ACCOUNT_NUMBER,embedded
0,716308354,"[-0.07455428689718246, 0.01656944677233696, -0..."
1,2387138088,"[-0.07455428689718246, 0.01656944677233696, -0..."
2,2297289602,"[-0.07455428689718246, 0.01656944677233696, -0..."
3,2747556495,"[-0.07455428689718246, 0.01656944677233696, -0..."
4,3159523017,"[-0.07455428689718246, 0.01656944677233696, -0..."


In [86]:
unique_accounts = embed_one_vector['ACCOUNT_NUMBER'].unique()

In [87]:
unique_accounts

array([ 716308354, 2387138088, 2297289602, 2747556495, ..., 1170435558, 1612196229, 1450419129, 1699168859])

In [88]:
len(unique_accounts)

191927

In [94]:
# add order date into embed_one_vector
embed_one_vector['ORDER_CREATED_DATE'] = embed_df['ORDER_CREATED_DATE']
embed_one_vector.head()

Unnamed: 0,ACCOUNT_NUMBER,embedded,ORDER_CREATED_DATE
0,716308354,"[-0.07455428689718246, 0.01656944677233696, -0...",0.60058
1,2387138088,"[-0.07455428689718246, 0.01656944677233696, -0...",0.318066
2,2297289602,"[-0.07455428689718246, 0.01656944677233696, -0...",0.547005
3,2747556495,"[-0.07455428689718246, 0.01656944677233696, -0...",0.663532
4,3159523017,"[-0.07455428689718246, 0.01656944677233696, -0...",0.031195


In [111]:
zero_vector = [0] * word_dim

In [112]:
sentence_dim = 50

In [101]:
# need to combine all words of an account into a sentence in chronological order
# let's sort by date before doing groupby
embed_one_vector.sort_values(['ACCOUNT_NUMBER','ORDER_CREATED_DATE'], inplace=True)


In [199]:
def np_concat_words(x):
    xvalues = x.values
    
    input_x = np.empty((0, word_dim), float)
    for xvalue in xvalues:
#         print('xvalue={}'.format(xvalue))
        input_x = np.vstack((input_x, xvalue))
        
#     print('input_x shape={}'.format(input_x.shape))

    fill_array = np.zeros(((sentence_dim - len(xvalues)), word_dim))
#     print('fill_array={}'.format(fill_array))
    total_array = np.vstack((input_x, fill_array))
#     print('len total_array={}'.format(len(total_array)))
    return total_array

In [200]:
# embed_one_vector.groupby('ACCOUNT_NUMBER').embedded.head()

In [202]:
embed_sequence = embed_one_vector.groupby('ACCOUNT_NUMBER').embedded.apply(np_concat_words)

In [207]:
embed_sequence_df = embed_sequence.reset_index()

In [208]:
embed_sequence_df.head()

Unnamed: 0,ACCOUNT_NUMBER,embedded
0,571,"[[-0.004236409906297922, 0.008305749855935574,..."
1,9660,"[[-0.04253508150577545, 0.01460204366594553, 0..."
2,49364,"[[-0.03787398710846901, -0.00512707931920886, ..."
3,73801,"[[-0.04452809691429138, 0.10258151590824127, -..."
4,141229,"[[-0.03787398710846901, -0.00512707931920886, ..."


In [259]:
first_embed = embed_sequence_df.iloc[0]['embedded']

first_embed.shape

(50, 263)

In [231]:
probability = 0.2
def random_label(x):
    random_number = random.random()
#     print('random_number={}'.format(random_number))
    return random_number < probability

In [232]:
# use random label for now, because still don't have good ivr data
embed_sequence_df['label'] = embed_sequence_df.apply(lambda x: random_label(x), axis=1)

In [240]:
# label means called (True) or not called (False) by the account user
embed_sequence_df.head()

Unnamed: 0,ACCOUNT_NUMBER,embedded,label
0,571,"[[-0.004236409906297922, 0.008305749855935574,...",False
1,9660,"[[-0.04253508150577545, 0.01460204366594553, 0...",False
2,49364,"[[-0.03787398710846901, -0.00512707931920886, ...",False
3,73801,"[[-0.04452809691429138, 0.10258151590824127, -...",False
4,141229,"[[-0.03787398710846901, -0.00512707931920886, ...",False


In [234]:
embed_sequence_df.shape

(191927, 3)

In [236]:
# embed_sequence_df.to_csv('ivr_backup_data/embed_sequence_df.csv')

In [237]:
with open('ivr_backup_data/embed_sequence_df.pkl','wb') as embed_sequence_pkl:
    pkl.dump(embed_sequence_df, embed_sequence_pkl)

In [238]:
# get a reduced data set for testing
reduced_embed_sequence = embed_sequence_df.sample(n=2000)

In [275]:
with open('ivr_backup_data/reduced_embed_sequence_df.pkl','wb') as reduced_embed_sequence_pkl:
    pkl.dump(reduced_embed_sequence, reduced_embed_sequence_pkl)

In [243]:
reduced_embed_sequence.head()

Unnamed: 0,ACCOUNT_NUMBER,embedded,label
39170,876503235,"[[-0.004236409906297922, 0.008305749855935574,...",False
149770,3347797337,"[[0.06941620260477066, 0.0075980802066624165, ...",True
144452,3229798785,"[[0.040991246700286865, -0.033048197627067566,...",False
145542,3253926381,"[[-0.04363565146923065, 0.04697321727871895, -...",False
82454,1840617868,"[[-0.06808032840490341, -0.07009082287549973, ...",False


In [280]:
reduced_train_set, reduced_test_df = train_test_split(reduced_embed_sequence, test_size=0.2)


In [281]:
with open('ivr_backup_data/reduced_train_sequence_df.pkl','wb') as reduced_train_sequence_pkl:
    pkl.dump(reduced_train_set, reduced_train_sequence_pkl)
with open('ivr_backup_data/reduced_test_sequence_df.pkl','wb') as reduced_test_sequence_pkl:
    pkl.dump(reduced_test_df, reduced_test_sequence_pkl)

In [245]:
import torch
from torch.utils import data

embedded_tensors = reduced_embed_sequence['embedded'].values

# tensorData = torch.as_tensor(reduced_embed_sequence.values, dtype=torch.float32)

In [274]:
batched_tensor = np.empty((0, sentence_dim, word_dim), float)
# print('input_x shape={}'.format(input_x.shape))
for embedded_tensor in embedded_tensors:
    expanded_tensor = np.expand_dims(embedded_tensor, axis=0)
#     print('expanded_tensor shape={}'.format(expanded_tensor.shape))
    batched_tensor = np.vstack((batched_tensor, expanded_tensor))

batched_tensor.shape

(2000, 50, 263)

In [273]:
label_tensors = reduced_embed_sequence['label'].values
tensor_label = torch.as_tensor(label_tensors, dtype=torch.float32)

tensor_label.shape

torch.Size([2000])

In [261]:
# reduced_embed_sequence['embedded']

In [246]:
type(embedded_tensors)

numpy.ndarray

In [247]:
embedded_tensors.shape

(2000,)

In [255]:
embedded_tensors

array([array([[-0.004236,  0.008306, -0.070034, -0.105562, ...,  1.205922,  1.721361,  2.823282, -1.860449],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       ...,
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ]]),
       array([[ 0.069416,  0.007598, -0.093778, -0.014586, ...,  0.683144, -0.579479, -0.442853, -1.616451],
       [ 0.      ,  0.      ,  0.      ,  0.      , ..

In [248]:
embedded_tensors[0]

array([[-0.004236,  0.008306, -0.070034, -0.105562, ...,  1.205922,  1.721361,  2.823282, -1.860449],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       ...,
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ,  0.      ]])

In [252]:
type(embedded_tensors[0])

numpy.ndarray

In [253]:
embedded_tensors[0].shape

(50, 263)