In [None]:
#Only one gpu visible
import os
%env CUDA_VISIBLE_DEVICES=3
import utils
from utils import *

In [None]:
%matplotlib inline

In [None]:
# Notebook-wide parameters
current_dir = os.getcwd()
custom_bucket_boundary = [0, 1, 1000, 250000]
vocab_size = 10000
max_len = 200
max_words = 10000
embed_size = 300
max_features = 10000
sample = 150  # sample records
train_size = 1000  # sample train
dropout_rate = 0.2
size_embedding = 10000
epochs = 50
num_gpu = 3
#tensorboard_dir="./Payout" #tensor board sub directory
# Shared label encoder, fitted later on the payout buckets.
lbl = LabelEncoder()

In [None]:
# Model directory. The previous `%mkdir model#model directory` had no space
# before '#', so the shell did not treat it as a comment and never created
# `model`. makedirs with exist_ok also makes the cell safe to re-run.
os.makedirs('model', exist_ok=True)

In [None]:
# Path of the checkpoint file used to save the best model.
model_save_dir = "".join((os.getcwd(), '/model/binary_model_classifier.hdf5'))

In [None]:
# TensorBoard directory; exist_ok keeps the cell idempotent on re-run.
os.makedirs('tensorB', exist_ok=True)

In [None]:
# Directory (with trailing slash) where TensorBoard graphs are written.
tensorboard_dir = "".join((os.getcwd(), "/tensorB/"))

## Available devices

In [None]:
# Print every device TensorFlow reports for this process.
# NOTE(review): CUDA_VISIBLE_DEVICES is set in the first cell, so presumably
# only one GPU is listed here - confirm against the output.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
#Tensorflow check
# Build a small matmul graph with the constants pinned to the first visible GPU.
with tf.device('/gpu:0'):
    a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
    b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2])
c = tf.matmul(a, b)

# Creates a session with log_device_placement set to True.
# The context manager closes the session even if a run fails; the stray,
# never-closed `tf.Session()` that used to precede this block is removed.
with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    print(sess.run(c))

    # Runs the op again, this time collecting the partition graphs.
    options = tf.RunOptions(output_partition_graphs=True)
    metadata = tf.RunMetadata()
    c_val = sess.run(c, options=options, run_metadata=metadata)

    print(metadata.partition_graphs)

## GPU Memory Management

In [None]:
def limit_mem():
    """Swap the current Keras session for one that grows GPU memory on demand.

    Closes the session returned by ``K.get_session()`` and installs a new one
    whose config enables soft placement, device-placement logging and
    ``gpu_options.allow_growth``.
    """
    K.get_session().close()
    session_config = K.tf.ConfigProto(
        log_device_placement=True,
        allow_soft_placement=True,
    )
    session_config.gpu_options.allow_growth = True
    replacement_session = K.tf.Session(config=session_config)
    K.set_session(replacement_session)
limit_mem()

## Import Data

In [None]:
# Load the raw data file from the working directory.
path = os.getcwd()
df = pd.read_csv(
    'paid.csv',
    low_memory=False,
)
print(str.format("Shape of dataset {}", df.shape))

In [None]:
# Discard every column whose name matches the regex '^Unnamed'.
df = df.drop(df.columns[df.columns.str.contains('^Unnamed')], axis=1)
print(str.format("shape of dataframe after removing columns {}", df.shape))

In [None]:
# ID column: "<claimnumber>,<exp_claimorder>" built from the two key columns.
ID = df['claimnumber'].astype(str) + ',' + df['exp_claimorder'].astype(str)
# Replace blanks with 0 in the target variable (assignment instead of a
# chained in-place fillna on a column selection).
df['total_paid'] = df['total_paid'].fillna(0)
# Keep only columns with at least 60% non-null values, i.e. drop columns that
# are more than 40% blank.
# NOTE(review): the earlier comment said 20%; the 0.6 threshold is kept as the
# behavior of record - confirm which figure was intended.
df = df.dropna(thresh=0.6*len(df), axis=1)
# The key columns are not used as features once ID has been built.
cols_to_drop = ['claimnumber', 'exp_claimorder']
df = df.drop(cols_to_drop, axis=1)
print('shape of dataframe {}'.format(df.shape))

# Preprocessing

## Target variable binning

In [None]:
# Make sure the target has no blanks before binning.
df['total_paid'] = df['total_paid'].fillna(0)
target = df['total_paid']
#TARGET BINNING
# Intervals are (0, 1] -> 'No_pay' and (1, inf] -> 'Yes_pay'.
# The upper edge is unbounded: with the previous edge of 200000 any larger
# payout fell outside every bin, became NaN and was then filled as 'No_pay'.
bins = [0, 1, float('inf')]
#labels = ['No_pay','Below_1000','More_than_1K'] #labels
labels = ['No_pay', 'Yes_pay']
#CREATE BUCKETS
df['payout_bucket'] = pd.cut(df['total_paid'], bins=bins, labels=labels)
# Values outside the bins (0 and below) come back as NaN; they count as 'No_pay'.
df['payout_bucket'] = df['payout_bucket'].fillna('No_pay')
print("Payout Bucket Distribution \n{}".format(df.payout_bucket.value_counts()))

print("Label encoding buckets")
lbl.fit(list(df['payout_bucket'].values))
df['payout_bucket_trans'] = lbl.transform(list(df['payout_bucket'].values))
payout = df['payout_bucket_trans'].values
print("Buckets after label encoding \n{}".format(df.payout_bucket_trans.value_counts()))

# Drop the target and its derived columns from the training frame.
df = df.drop(['total_paid', 'payout_bucket', 'payout_bucket_trans'], axis=1)
print("target variable drop from training data, shape of training data {}".format(df.shape))

# Handling Categorical variables

## Identify text & categorical features

In [None]:
# Name of the single free-text column.
text_features = 'claim_description'
# Every other column is treated as categorical. Compare by equality:
# `col not in text_features` was a substring test against the string, so a
# column named e.g. 'claim' or 'description' would have been silently excluded.
cat_features = [col for col in df.columns if col != text_features]
#cat_features_hash = [col+"_hash" for col in cat_features]
#cat_features_hash = [col+"_hash" for col in cat_features]

## Clean  categorical features

In [None]:
#clean categorical data
for col in cat_features:
    if df[col].dtypes == 'O':
        # Blank categories become "unk", then each value goes through
        # clean_categorical_data. The result is assigned back rather than using
        # a chained in-place fillna on a column selection.
        df[col] = df[col].fillna("unk").apply(clean_categorical_data)
print("Categorical columns cleaning done")

## Encode categorical features

### Method 1 - Factorize 

In [None]:
# Replace every categorical column by its integer factor codes.
df = df.assign(**{name: pd.factorize(df[name])[0] for name in cat_features})
trn_cat = df[cat_features] #categorical features
print("Categroical features are encoded")

### Method 2 - Dummy Variables

In [None]:
#dummy variables for categorical features
#df_cat = pd.get_dummies(df, columns=cat_features,
 #   sparse=True)
#df_cat.drop(['claim_description'],axis=1,inplace=True)
#trn_cat = df_cat.values #categorical features
#print("Shape of categorical varaibles {}".format(df_cat.shape))

## Claim Description clean

In [None]:
import time
# Time the text-cleaning pass over the claim descriptions.
start = time.time()
df['claim_description'] = df['claim_description'].apply(clean)
elapsed_minutes = (time.time() - start) / 60
print("Time taken to clean {} mins".format(elapsed_minutes))

## Tokenization

In [None]:
tknizer = Tokenizer(num_words=max_words)
def tokenize(text):
    """Fit the shared tokenizer on `text` and return padded integer sequences.

    Args:
        text: iterable of strings; all of it is used to fit `tknizer`.

    Returns:
        The result of `pad_sequences` on the tokenized texts, padded or
        truncated to `max_len` tokens per row.
    """
    tknizer.fit_on_texts(text)
    sequences = tknizer.texts_to_sequences(text)
    # Use the notebook-level max_len instead of a second hard-coded 200.
    return pad_sequences(sequences, maxlen=max_len)
train_text = tokenize(df['claim_description']) #text feature
print("Text data shape {}".format(train_text.shape))

In [None]:
# Path of the pre-trained embedding file, relative to the working directory.
EMBEDDING_FILE= "glove.840B.300d.txt"

In [None]:
#Read embed file
def get_coeff(word, *arr):
    """Return `(word, vector)` where vector is `arr` as a float32 numpy array."""
    return word, np.asarray(arr, dtype='float32')

# Split from the right at most `embed_size` times so that a token which itself
# contains spaces stays in one piece; an unbounded split would push part of the
# token into the vector and fail the float conversion.
# The `with` block closes the file handle, which was previously left open.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embedding_index = dict(
        get_coeff(*line.rstrip().rsplit(' ', embed_size))
        for line in embedding_file
    )

In [None]:
#Create Embedding Matrix
word_index = tknizer.word_index
# Tokenizer indices start at 1 (row 0 is left as the all-zero padding row), so
# a vocabulary of V words needs V + 1 rows. Without the +1 the most frequent
# index of a small vocabulary would be written past the end of the matrix.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size)) #np.random.normal(emb_mean,emb_std,(nb_words,embed_size))
for word, i in word_index.items():
    # Guard against the actual number of rows, not the configured maximum.
    if i >= nb_words:
        continue
    embedding_vector = embedding_index.get(word)
    # Words missing from the pre-trained file keep their zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print("Shape of EMbedding matrix {}".format(embedding_matrix.shape))

In [None]:
##from sklearn.utils import class_weight
#weights = class_weight.compute_class_weight('balanced',np.unique(payout),payout)

## Target Variable Processing

In [None]:
# One-hot encode the label-encoded payout buckets (class count is inferred).
target = tf.keras.utils.to_categorical(payout)
print(str.format("Output shape {}", target.shape))

# Modelling

## Model Architecture

In [None]:
from keras.optimizers import SGD,Adamax,Nadam

In [None]:
#new Model Arch
def get_model():
    """Build the two-input (categorical + text) CNN classifier.

    Reads the notebook-level `trn_cat`, `train_text` and `embedding_matrix`
    to size the two inputs and the frozen embedding layer.

    Returns:
        An uncompiled Keras `Model` that takes `[categorical, text]` inputs
        and ends in a 2-unit softmax layer.
    """
    # categorical channel
    with tf.name_scope("Input_Category"):
        inputs1 = Input(shape=(trn_cat.shape[1],))
    with tf.name_scope("Dense_cat"):
        dense_cat_1 = Dense(256, activation='relu')(inputs1)
        dense_cat_2 = Dense(128, activation='relu')(dense_cat_1)
    with tf.name_scope('Flat_1'):
        flat1 = Dense(32, activation='relu')(dense_cat_2)

    # text channel
    with tf.name_scope("Input_Text"):
        inputs3 = Input(shape=(train_text.shape[1],))
        # Size the layer from the weight matrix itself so the two cannot
        # disagree (previously hard-coded as size_embedding x 300).
        embedding3 = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                               weights=[embedding_matrix], trainable=False)(inputs3)
    with tf.name_scope("Convolution"):
        conv3 = Conv1D(filters=128, kernel_size=8, padding='valid',
                       kernel_initializer='glorot_uniform')(embedding3)
    with tf.name_scope("Dropout"):
        drop3 = Dropout(0.3)(conv3)
    # Pool the dropout output; pooling conv3 directly left the Dropout layer
    # disconnected from the model.
    with tf.name_scope('Average_pool'):
        avg_pool = GlobalAveragePooling1D()(drop3)
    with tf.name_scope('MaxPool'):
        max_pool = GlobalMaxPooling1D()(drop3)
    with tf.name_scope("Concat"):
        x = concatenate([avg_pool, max_pool])

    # merge
    with tf.name_scope('Merge_Channels'):
        merged = concatenate([flat1, x])
    with tf.name_scope("Dense"):
        dense1 = Dense(200, activation='relu')(merged)
        dense2 = Dense(100, activation='relu')(dense1)
    with tf.name_scope("Output"):
        # Feed the head from dense2 (the dense stack used to be bypassed) and
        # use softmax, which is what categorical_crossentropy on one-hot
        # targets expects.
        outputs = Dense(2, activation='softmax')(dense2)
    model1 = Model(inputs=[inputs1, inputs3], outputs=outputs)
    return model1

In [None]:
from keras.layers import GlobalAveragePooling1D,GlobalMaxPooling1D

In [None]:
# Build the network defined by get_model() and print its layer summary.
model = get_model()
model.summary()

## Training

### Single CPU/GPU 

In [None]:
# Compile under a CPU device scope with a very small Adam learning rate.
with tf.device('/device:cpu:0'):
    model.compile(
        optimizer=Adam(lr=1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

In [None]:
# Short training run; 10% of the rows are held out for validation.
model.fit(
    x=[trn_cat, train_text],
    y=target,
    batch_size=128,
    epochs=3,
    validation_split=0.1,
)

### Multi GPU Process

In [None]:
# Replicate the model across `num_gpu` GPUs (the notebook parameter, instead
# of a second hard-coded 3).
# NOTE(review): the first cell sets CUDA_VISIBLE_DEVICES=3, which presumably
# exposes a single GPU to this process; expose more devices before running
# this section - TODO confirm.
parallel_model = multi_gpu_model(model, gpus=num_gpu)

In [None]:
# Compile the multi-GPU wrapper with the same settings as the base model.
with tf.device('/device:cpu:0'):
    parallel_model.compile(
        optimizer=Adam(lr=1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

In [None]:
# Train the multi-GPU wrapper; 10% of the rows are held out for validation.
parallel_model.fit(
    x=[trn_cat, train_text],
    y=target,
    batch_size=128,
    epochs=3,
    validation_split=0.1,
)

In [None]:
# Release the models and the raw frame, then force a collection pass.
# gc is imported here because its only other visible import is in the last
# cell, after this use.
import gc
del model, parallel_model, df
gc.collect()

In [None]:
# Clear the Keras backend session state before the next model is built.
K.clear_session()

In [None]:
# Drop the model references if they still exist. A plain `del` raised
# NameError here on a top-to-bottom run, because an earlier cell has already
# deleted both names.
globals().pop('model', None)
globals().pop('parallel_model', None)
K.clear_session()

In [None]:
# Force a garbage-collection pass after the models were released.
gc.collect()

## New Model Architecture - RNN & CNN combination

In [None]:
def get_model():
    """Build the two-input classifier whose text channel stacks a BiGRU and a CNN.

    Text path: frozen embedding -> spatial dropout -> bidirectional GRU ->
    Conv1D -> dropout -> global average/max pooling. The pooled text features
    are concatenated with the dense categorical channel and passed through
    two dense layers.

    Reads the notebook-level `trn_cat`, `train_text` and `embedding_matrix`.

    Returns:
        An uncompiled Keras `Model` that takes `[categorical, text]` inputs
        and ends in a 2-unit softmax layer.
    """
    # categorical channel
    with tf.name_scope("Input_Category"):
        inputs1 = Input(shape=(trn_cat.shape[1],))
    with tf.name_scope("Dense_cat"):
        dense_cat_1 = Dense(256, activation='relu')(inputs1)
        dense_cat_2 = Dense(128, activation='relu')(dense_cat_1)
    with tf.name_scope('Flat_1'):
        flat1 = Dense(32, activation='relu')(dense_cat_2)

    # text channel
    with tf.name_scope("Input_Text"):
        inputs3 = Input(shape=(train_text.shape[1],))
        # Size the layer from the weight matrix itself so the two cannot disagree.
        embedding3 = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                               weights=[embedding_matrix], trainable=False)(inputs3)
        spatial_drop = SpatialDropout1D(0.2)(embedding3)
    # The scope is named "LSTM" but the recurrent cell is a GRU.
    with tf.name_scope("LSTM"):
        # Previously the spatial-dropout output was overwritten unused and the
        # GRU output was never consumed, so the model was CNN-only.
        # NOTE(review): the chaining below follows the "RNN & CNN combination"
        # heading - confirm the intended wiring.
        x1 = Bidirectional(GRU(128, return_sequences=True, dropout=0.2,
                               recurrent_dropout=0.25))(spatial_drop)
    with tf.name_scope("Convolution"):
        conv3 = Conv1D(filters=128, kernel_size=8, padding='valid',
                       kernel_initializer='glorot_uniform')(x1)
    with tf.name_scope("Dropout"):
        drop3 = Dropout(0.3)(conv3)
    # Pool the dropout output so the Dropout layer is part of the model.
    with tf.name_scope('Average_pool'):
        avg_pool = GlobalAveragePooling1D()(drop3)
    with tf.name_scope('MaxPool'):
        max_pool = GlobalMaxPooling1D()(drop3)
    with tf.name_scope("Concat"):
        x = concatenate([avg_pool, max_pool])

    # merge
    with tf.name_scope('Merge_Channels'):
        merged = concatenate([flat1, x])
    with tf.name_scope("Dense"):
        dense1 = Dense(200, activation='relu')(merged)
        dense2 = Dense(100, activation='relu')(dense1)
    with tf.name_scope("Output"):
        # Feed the head from dense2 (the dense stack used to be bypassed) and
        # use softmax to match categorical_crossentropy on one-hot targets.
        outputs = Dense(2, activation='softmax')(dense2)
    model1 = Model(inputs=[inputs1, inputs3], outputs=outputs)
    return model1

In [None]:
from keras.layers import GlobalAveragePooling1D,GlobalMaxPooling1D

In [None]:
# Build the network from the get_model() defined just above (it shadows the
# earlier definition of the same name) and print its layer summary.
model = get_model()
model.summary()

## Training

### Single CPU/GPU

In [None]:
# Compile under a CPU device scope with a very small Adam learning rate.
with tf.device('/device:cpu:0'):
    model.compile(
        optimizer=Adam(lr=1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

In [None]:
#run
# CUDA_VISIBLE_DEVICES=3 exposes a single physical GPU, which TensorFlow
# numbers as gpu:0 inside this process (the TensorFlow check cell also uses
# '/gpu:0'); '/device:gpu:3' does not name an existing device here.
with tf.device('/device:gpu:0'):
    model.fit([trn_cat, train_text], target, batch_size=128, epochs=3, validation_split=0.2)

In [None]:
# Recompile with a larger learning rate for the next training run.
with tf.device('/device:cpu:0'):
    model.compile(
        optimizer=Adam(lr=1e-3),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

In [None]:
#running on gpu
# CUDA_VISIBLE_DEVICES=3 exposes a single physical GPU, which TensorFlow
# numbers as gpu:0 inside this process; '/device:gpu:3' does not name an
# existing device here.
with tf.device('/device:gpu:0'):
    model.fit([trn_cat, train_text], target, batch_size=128, epochs=10, validation_split=0.2)

In [None]:
# Drop the model references if they exist. `parallel_model` has already been
# deleted by an earlier cell, so a plain `del` raised NameError here on a
# top-to-bottom run.
globals().pop('model', None)
globals().pop('parallel_model', None)
K.clear_session()

In [None]:
# Force a final garbage-collection pass.
# NOTE(review): gc is also used in earlier cells; this import would be better
# placed in the first cell with the other imports.
import gc
gc.collect()