In [None]:
import utils
from utils import *

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# --- Parameters ---------------------------------------------------------------
current_dir = os.getcwd()   # working directory; used to build the model save path

# text / vocabulary settings
vocab_size = 10000
max_len = 200               # length every tokenised description is padded to
max_words = 10000           # vocabulary cap handed to the Tokenizer
size_embedding = 10000      # first argument of the Embedding layer

# data sizes
sample_size = 500           # sample records per class
train_size = 6000           # number of samples to use to train the model

# training settings
dropout_rate = 0.2
batch_size = 128
epochs = 30
num_gpu = 2

# label encoder for the target variable
lbl = LabelEncoder()

In [None]:
# Create the model directory if it is not already present.
# The previous `%mkdir model#model directory` had no space before the `#`, so the
# text after it was not a comment and `./model` itself was never created.
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(os.path.join(current_dir, 'model'), exist_ok=True)

In [None]:
# Full path of the checkpoint file where the best model is saved.
model_save_dir = '{}{}'.format(current_dir, '/model/grp_type_bst_model.hdf5')

In [None]:
# Create the tensorboard directory if it is not already present.
# os.makedirs(..., exist_ok=True) does not fail when the directory already exists,
# so the cell can be re-run (plain `mkdir` reports an error in that case).
os.makedirs('tensorB', exist_ok=True)

In [None]:
# Log directory handed to the TensorBoard callback.
# NOTE(review): this path is not inside the `tensorB` directory created in the
# previous cell - confirm whether "./tensorB/group_type" was intended.
tensorboard_dir = "./group_type"

## Available Devices

In [None]:
# List the devices (CPU / GPU) that TensorFlow can see on this machine.
from tensorflow.python.client import device_lib

local_devices = device_lib.list_local_devices()
print(local_devices)

## GPU Memory Management

In [None]:
def limit_mem():
    """Swap the Keras backend session for one with ``allow_growth`` enabled.

    Closes the session returned by ``K.get_session()``, then installs a new
    session built from a config with soft device placement, device-placement
    logging and ``gpu_options.allow_growth`` all switched on.
    """
    K.get_session().close()
    session_config = K.tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=True,
    )
    session_config.gpu_options.allow_growth = True
    new_session = K.tf.Session(config=session_config)
    K.set_session(new_session)


limit_mem()

## Import Data

In [None]:
# Read the raw claims data; the file name is relative to the working directory.
path = os.getcwd()
df = pd.read_csv(
    'group_type_data_2017_10_types.csv',
    low_memory=False,
)
shape_message = "Shape of dataset {}".format(df.shape)
print(shape_message)

In [None]:
# Remove unnamed columns (names starting with "Unnamed") if present
is_unnamed = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~is_unnamed]
df.shape

# Preprocessing

## Sampling

In [None]:
#No Sampling for now
#blank group types into uncategorised
#df.group_type.fillna("Uncategorised", inplace=True)
#sampling equal records from each class for sample dataset
#df=df.groupby('group_type').apply(lambda x: x.sample(sample_size)).reset_index(drop=True) # to create sample of equal class size
#print("Shape of sample data {}".format(df.shape))

In [None]:
# number of rows in the dataframe
size = df.shape[0]

In [None]:
# ID column: claim number and claim order joined with a comma
ID = df['claim_claimnumber'].astype(str) + ',' + df['exp_claimorder'].astype(str)
# Replace blanks in the target variable with the label "Uncategorised".
# Assigned back to the column instead of an in-place fill on `df.group_type`,
# which operates on a selection and is not guaranteed to update `df`.
df['group_type'] = df['group_type'].fillna("Uncategorised")
# drop columns that have more than 20% blanks
df = df.dropna(thresh=0.8*len(df), axis=1)
# more columns to drop
cols_to_drop = ['claim_claimnumber', 'exp_claimorder', 'inc_veh_model', 'inc_veh_vin',
                'claimant_addr_postalcodedenorm', 'claimant_cont_licensenumber']
# errors='ignore': any of these may already have been removed by the dropna above,
# in which case a plain drop would raise KeyError
df = df.drop(cols_to_drop, axis=1, errors='ignore')
print('shape of dataframe {}'.format(df.shape))

In [None]:
# Report how many distinct (non-null) values each column holds
for column_name in df.columns:
    distinct_count = df[column_name].value_counts().shape[0]
    print("{} has {} unique values".format(column_name, distinct_count))

## Target variable

In [None]:
# Keep the label column aside, then remove it from the feature frame in place
target = df['group_type']
del df['group_type']
shape_after_drop = df.shape
print("target variable drop from training data, shape of training data {}".format(shape_after_drop))

# Handle Categorical variables

## Identify text & categorical features

In [None]:
# Name of the single free-text column; every other column is treated as categorical.
text_features = 'claim_description'
# Compare with `!=`: `col not in text_features` on a string is a substring test and
# would also exclude any column whose name is contained in 'claim_description'.
cat_features = [col for col in df.columns if col != text_features]
#cat_features_hash = [col+"_hash" for col in cat_features]

## Clean Categorical Data

In [None]:
# Clean categorical data: only object-dtype columns are touched.
for col in cat_features:
    if df[col].dtypes == 'O':
        # Blank categories become "unk" before cleaning. The result is assigned back to
        # the column; an in-place fillna on `df[col]` works on a selection and is not
        # guaranteed to update `df`.
        df[col] = df[col].fillna("unk").apply(clean_categorical_data)
print("Categorical columns cleaning done")

## Encode Categorical Data

In [None]:
# Replace every categorical column with the integer codes from pd.factorize
for column_name in cat_features:
    codes, _uniques = pd.factorize(df[column_name])
    df[column_name] = codes
print("Categroical features are encoded")

## Clean Claim Description

In [None]:
import time

# Run clean() over every claim description and report the elapsed wall-clock minutes
start = time.time()
df['claim_description'] = df['claim_description'].apply(lambda description: clean(description))
elapsed_minutes = (time.time() - start) / 60
print("Time taken to clean {} mins".format(elapsed_minutes))

## Tokenization

In [None]:
# Array of the factorized categorical columns, one column per entry in cat_features.
# Its second dimension sets the input width of the categorical channel in get_model().
trn_cat = df[cat_features].values #categorical features

In [None]:
def tokenize(text):
    """Convert raw texts into a padded array of integer word indices.

    A new ``Tokenizer`` capped at the module-level ``max_words`` is fitted on
    ``text`` and used to turn those same texts into index sequences, which
    ``pad_sequences`` then pads to the module-level ``max_len``.

    The fitted tokenizer is local to this function and is not returned.
    """
    word_indexer = Tokenizer(num_words=max_words)
    word_indexer.fit_on_texts(text)
    sequences = word_indexer.texts_to_sequences(text)
    return pad_sequences(sequences, maxlen=max_len)


# text feature
trn_text = tokenize(df['claim_description'])
print("Text data shape {}".format(trn_text.shape))

In [None]:
# Target variable: label-encode to integer ids, then one-hot encode
label_ids = lbl.fit(target).transform(target)
encoded_target = tf.keras.utils.to_categorical(label_ids, num_classes=None)
print("Output shape {}".format(encoded_target.shape))

# Modelling

## Model Architecture

In [None]:
def get_model(num_classes=None):
    """Build the two-channel (categorical + text) classification model.

    The categorical channel passes the encoded categorical matrix through three
    dense layers. The text channel embeds the padded token sequences, applies a
    bidirectional GRU, a 1-D convolution, dropout, max-pooling and a flatten.
    Both channels are concatenated and fed through two dense layers into a
    softmax output.

    Input widths are read from the module-level ``trn_cat`` and ``trn_text``
    arrays; ``size_embedding`` and ``dropout_rate`` are module-level parameters.

    Parameters
    ----------
    num_classes : int, optional
        Width of the softmax output. When omitted it is taken from
        ``encoded_target.shape[1]`` so the output layer always matches the
        one-hot encoded target (previously hard-coded to 12).

    Returns
    -------
    Model
        The uncompiled Keras model taking ``[categorical, text]`` inputs.
    """
    if num_classes is None:
        # one output unit per column of the one-hot encoded target
        num_classes = encoded_target.shape[1]

    # categorical channel
    with tf.name_scope("Input_Category"):
        inputs1 = Input(shape=(trn_cat.shape[1],))
    with tf.name_scope('Dense_cat'):
        dense_cat_1 = Dense(256, activation='relu')(inputs1)
        dense_cat_2 = Dense(32, activation='relu')(dense_cat_1)
    with tf.name_scope('Flat_1'):
        flat1 = Dense(32, activation='relu')(dense_cat_2)

    # text channel
    with tf.name_scope('Input_Text'):
        inputs2 = Input(shape=(trn_text.shape[1],))
        embedding2 = Embedding(size_embedding, 50)(inputs2)
    with tf.name_scope('GRU'):
        gru = Bidirectional(
            GRU(256, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
        )(embedding2)
    with tf.name_scope('Convolution'):
        conv1 = Conv1D(filters=32, kernel_size=8, activation='relu')(gru)
    with tf.name_scope('Dropout'):
        drop1 = Dropout(dropout_rate)(conv1)
    with tf.name_scope('MaxPool'):
        pool1 = MaxPooling1D(pool_size=2)(drop1)
    with tf.name_scope('Flat_2'):
        flat2 = Flatten()(pool1)

    # merge
    with tf.name_scope('Merge'):
        merged = concatenate([flat1, flat2])
    with tf.name_scope('Dense'):
        dense1 = Dense(200, activation='relu')(merged)
        dense2 = Dense(20, activation='relu')(dense1)
    with tf.name_scope('Output'):
        outputs = Dense(num_classes, activation='softmax')(dense2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    return model

In [None]:
model = get_model()
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

## Compile

In [None]:
from keras.optimizers import SGD,Adamax,Nadam

In [None]:
# Compile the single-device model inside a CPU device scope
with tf.device('/cpu:0'):
    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(lr=1e-5),
        metrics=['accuracy'],
    )

In [None]:
#### For multi gpus
# First run the model on a single GPU, then on multiple.
# The replica count comes from `num_gpu` in the parameters cell rather than a
# separate hard-coded value, so the two cannot disagree.
parallel_model = multi_gpu_model(model, gpus=num_gpu)
with tf.device('/cpu:0'):
    parallel_model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(lr=1e-5),
        metrics=['accuracy'],
    )

## Training

In [None]:
# Callbacks
# write the model to model_save_dir only when val_loss reaches a new minimum
check_point = ModelCheckpoint(
    model_save_dir,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)
# stop once val_loss has not improved for 5 epochs
early_stop = EarlyStopping(monitor="val_loss", patience=5, mode="min")
# TensorBoard logging into tensorboard_dir
tbCallBack = TensorBoard(
    log_dir=tensorboard_dir,
    histogram_freq=1,
    write_graph=True,
    write_images=True,
)

###  Single gpu/cpu train

In [None]:
# Single-device run (do this before the multi-GPU run).
# The row slice and batch size come from the parameters cell, so the training rows
# stay in step with the `train_size` split used by the prediction cell.
with tf.device('/device:GPU:1'):
    model.fit(
        [trn_cat[:train_size], trn_text[:train_size]],
        encoded_target[:train_size],
        batch_size=batch_size,
        epochs=10,  # kept at 10; the `epochs` parameter (30) is not used here
        validation_split=0.2,
        callbacks=[check_point, early_stop, tbCallBack],
    )

### Multi-gpu train

In [None]:
# Multi-GPU training on the first 60000 rows (run the single-GPU cell first).
# NOTE(review): this slice (60000) is ten times `train_size` (6000), while the prediction
# cell scores rows from `train_size` onward - rows 6000..59999 fall in both; confirm that
# 60000 is intended.
# NOTE(review): `check_point` is the same callback instance used for the single-GPU run and
# writes to the same `model_save_dir`; verify which model ends up in that file.
parallel_model.fit([trn_cat[:60000],trn_text[:60000]], encoded_target[:60000], batch_size=128, epochs=5, validation_split=0.2,callbacks=[check_point,early_stop,tbCallBack])

## Prediction

In [None]:
# Score the rows after the first `train_size` and map the arg-max class back to its label
holdout_inputs = [trn_cat[train_size:size], trn_text[train_size:size]]
preds = model.predict(holdout_inputs, batch_size=batch_size)
preds_classes = preds.argmax(axis=-1)
predictions = lbl.inverse_transform(preds_classes)