In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pickle
import datetime as dt

#Useful packages for building deep neural networks. 
import tensorflow as tf 
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D,Flatten,Dense,Dropout, Reshape,MaxPooling2D,Conv2D, LSTM
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense

#scikit-learn: used for cross-validation splitting and for specific evaluation metrics (e.g. the classification report).
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report

In [None]:
#LOAD THE CLEANED DATASET
import pickle

# The context manager closes the file even when unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open("/content/drive/MyDrive/Studies/Master of AI/Advanced_Analytics/Assignment_1/Test_Data_Cleaned/test_data_minmax_df3","rb") as file:
  test_df3 = pickle.load(file)


In [None]:
#--------- Helper Functions -----------------
import random
from sklearn.utils import shuffle

#Sort the DataFrames first to ensure that each customer is located at the same row index across all tables
def undersample(arr,labels,amount,random_state=None):
  """Randomly remove `amount` non-churner rows (label 0) from `arr` and `labels`.

  Args:
    arr: array whose first axis is aligned with `labels`.
    labels: 1-D array-like of class labels; rows labelled 0 are candidates for
      removal, rows labelled 1 are counted as churners for the printed ratio.
    amount: number of label-0 rows to drop.
    random_state: optional seed for a reproducible draw. With the default
      (None) the global `random` module state is used, as before.

  Returns:
    Tuple (arr, labels) with the sampled rows removed.

  Raises:
    ValueError: if `amount` is negative or larger than the number of label-0 rows.
  """
  labels = np.asarray(labels)
  churn_ratio_old = (labels==1).sum()/(len(labels))

  ind_non_churners = list(np.where(labels==0)[0])
  print(len(ind_non_churners))
  if not 0 <= amount <= len(ind_non_churners):
    raise ValueError('cannot drop {} non-churners: only {} available'.format(amount,len(ind_non_churners)))

  # A dedicated generator keeps seeded runs independent of the global random state
  sampler = random if random_state is None else random.Random(random_state)
  ind_to_drop = sampler.sample(ind_non_churners,amount)
  arr = np.delete(arr,ind_to_drop,axis=0)
  labels = np.delete(labels,ind_to_drop)
  print(arr.shape, labels.shape)

  churn_ratio_new = (labels==1).sum()/(len(labels))
  print('old churn ratio vs new churn ratio: {} vs. {}'.format(churn_ratio_old,churn_ratio_new))
  return arr, labels


def oversample(arr,labels,times): #times = number of times the churners should be re-added to the data
  """Append `times` extra copies of every churner row (label 1) and shuffle.

  Args:
    arr: array whose first axis is aligned with `labels`.
    labels: 1-D numpy array of class labels; rows labelled 1 are duplicated.
    times: how many additional copies of the churner rows to append.

  Returns:
    Tuple (arr, labels), shuffled jointly with a fixed random_state of 42.
  """
  churn_ratio_old = (labels==1).sum()/(len(labels))
  ind_churners = np.where(labels==1)[0]
  # Report only the count: printing every index floods the notebook output.
  print(len(ind_churners))

  # Add the churner copies to the end in a single concatenation
  arr = np.concatenate([arr] + [arr[ind_churners]]*times,axis=0)
  labels = np.concatenate([labels] + [labels[ind_churners]]*times)
  print(arr.shape,labels.shape)

  churn_ratio_new = (labels==1).sum()/(len(labels))
  print('old churn ratio vs new churn ratio: {} vs. {}'.format(churn_ratio_old,churn_ratio_new))

  #Shuffle everything
  arr,labels = shuffle(arr,labels,random_state=42)
  return arr,labels

In [None]:
# ---------------- TRANSFORM DATA TO BE IN CORRECT FORMAT FOR LSTM  ------------------------

def _load_month_sorted(filename_load,month):
  """Unpickle the training DataFrame of one month and return it sorted by client_id."""
  path = "/content/drive/MyDrive/Studies/Master of AI/Advanced_Analytics/Assignment_1/Training_Data_Cleaned/{}_{}".format(filename_load,month)
  # NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
  with open(path,"rb") as file:
    month_df = pickle.load(file)
  #Sort values for consistent row positions per customer across dataframes
  return month_df.sort_values(by=['client_id'])


def lsmt_transform(filename_load,save_suffix):
  """Stack the three monthly training tables into one (customers, 3, features) array and pickle it.

  Reads the pickles `<filename_load>_1`, `_2` and `_3` from Training_Data_Cleaned.
  Each must be a DataFrame with a `client_id` column; month 3 must also
  contain `target`. The float feature tensor is written to
  LSTM_DATA/X_train_lstm_robust_<save_suffix> and the month-3 `target` column to
  LSTM_DATA/y_train_lstm_robust_<save_suffix>.

  Args:
    filename_load: file name prefix of the monthly pickles (without `_<month>`).
    save_suffix: suffix appended to the two output file names.

  Raises:
    ValueError: if the sorted client ids of month 2 or 3 differ from month 1,
      or if the monthly feature tables do not share the same shape.
  """
  train_df1, train_df2, train_df3 = (_load_month_sorted(filename_load,month) for month in (1,2,3))

  # Rows are combined purely by position, so every month must list the same customers in the same order
  client_ids = train_df1['client_id'].to_numpy()
  for month, month_df in ((2,train_df2),(3,train_df3)):
    if not np.array_equal(client_ids,month_df['client_id'].to_numpy()):
      raise ValueError('client ids of month {} are not aligned with month 1'.format(month))

  # Labels come from month 3, in sorted client order
  y_train_lstm = train_df3["target"]

  monthly_features = [
    train_df1.drop(columns=["client_id"]),
    train_df2.drop(columns=["client_id"]),
    train_df3.drop(columns=["client_id","target"]),
  ]

  # Axis 1 is the month (time step): result has shape (customers, 3, features)
  X_train_lstm = np.stack([features.to_numpy(dtype='float') for features in monthly_features],axis=1)

  print(X_train_lstm.shape)

  with open("/content/drive/MyDrive/Studies/Master of AI/Advanced_Analytics/Assignment_1/LSTM_DATA/X_train_lstm_robust_{}".format(save_suffix),'wb') as file:
    pickle.dump(X_train_lstm, file)
  with open("/content/drive/MyDrive/Studies/Master of AI/Advanced_Analytics/Assignment_1/LSTM_DATA/y_train_lstm_robust_{}".format(save_suffix),'wb') as file:
    pickle.dump(y_train_lstm, file)


# Build and store the training tensor from the robust-scaled monthly tables
lsmt_transform(filename_load="train_data_robust", save_suffix="robust")



(61468, 3, 38)


In [None]:
#--------------- CREATE CLASS DICTIONARY + SPLIT VALIDATION SET -----------------
# IF DATA HAS ALREADY BEEN SAVED,JUST LOAD IT
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open("/content/drive/MyDrive/Studies/Master of AI/Advanced_Analytics/Assignment_1/LSTM_DATA/X_train_lstm_robust_robust",'rb') as file:
  X_train_lstm = pickle.load(file)
with open("/content/drive/MyDrive/Studies/Master of AI/Advanced_Analytics/Assignment_1/LSTM_DATA/y_train_lstm_robust_robust",'rb') as file:
  y_train_lstm = np.array(pickle.load(file))


from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Alternative that was tried: SMOTE to generate new examples
# oversample = SMOTE(sampling_strategy=0.2) #Generate new examples
# X_train_lstm,y_train_lstm = oversample.fit_resample(X_train_lstm, y_train_lstm)


# Split off the validation set BEFORE resampling: otherwise copies of the same
# churner end up in both the training and the validation data (leakage).
# replace=False guarantees that every validation row is drawn only once.
rng = np.random.default_rng(42)
n_samples = len(X_train_lstm)
val_indices = rng.choice(n_samples, size = int(n_samples*0.1), replace = False)
X_val_lstm, y_val_lstm = X_train_lstm[val_indices], y_train_lstm[val_indices]
X_train_lstm = np.delete(arr=X_train_lstm,obj=val_indices,axis=0)
y_train_lstm = np.delete(arr=y_train_lstm,obj=val_indices,axis=0)

# Apply sampling for class balancing (training data only)
random.seed(42) # undersample draws from the global random module
X_train_lstm,y_train_lstm = oversample(X_train_lstm,y_train_lstm,times=2)
X_train_lstm,y_train_lstm = undersample(X_train_lstm,y_train_lstm,20000)

#create class weights dictionary from the resampled training labels
classes = np.unique(y_train_lstm)
class_weights_arr = sklearn.utils.class_weight.compute_class_weight(class_weight = 'balanced', classes = classes, y = y_train_lstm)
print(class_weights_arr)
# input to model.fit requires dictionary; pair each label with its weight by
# position so the mapping stays correct even if the labels are not 0..k-1
class_weights_dict = {int(label): float(weight) for label, weight in zip(classes, class_weights_arr)}
print("target attribute weights to handle class imbalance:{}".format(class_weights_dict))


1824
[1, 9, 22, 53, 78, 92, 106, 151, 180, 190, 225, 237, 331, 335, 337, 437, 465, 510, 514, 523, 532, 537, 568, 579, 596, 626, 639, 645, 648, 658, 788, 819, 820, 848, 891, 921, 968, 998, 1014, 1025, 1071, 1082, 1083, 1123, 1249, 1255, 1343, 1369, 1432, 1467, 1491, 1505, 1510, 1535, 1536, 1578, 1617, 1640, 1645, 1674, 1677, 1682, 1712, 1783, 1810, 1819, 1849, 1886, 1936, 1976, 2033, 2059, 2071, 2188, 2242, 2262, 2279, 2313, 2317, 2345, 2347, 2382, 2396, 2418, 2454, 2455, 2457, 2461, 2463, 2473, 2529, 2572, 2628, 2651, 2659, 2671, 2678, 2703, 2747, 2768, 2814, 2818, 2861, 2886, 2893, 2989, 3015, 3038, 3063, 3105, 3114, 3141, 3149, 3276, 3283, 3382, 3412, 3479, 3536, 3552, 3623, 3656, 3790, 3849, 3955, 3982, 4018, 4025, 4191, 4225, 4240, 4254, 4317, 4336, 4418, 4470, 4504, 4534, 4615, 4628, 4681, 4745, 4774, 4786, 4877, 4888, 4918, 4923, 4954, 4973, 4999, 5060, 5091, 5187, 5262, 5266, 5285, 5335, 5353, 5411, 5423, 5474, 5548, 5600, 5601, 5626, 5655, 5709, 5724, 5763, 5840, 5844, 5893, 59

In [None]:
#CUSTOM LOSS FUNCTIONS
#Keras
# Default weights for TverskyLoss: ALPHA scales the false-positive term,
# BETA scales the false-negative term.
ALPHA, BETA = 0.8, 0.2

def TverskyLoss(targets, inputs, alpha=ALPHA, beta=BETA, smooth=1e-6):
    """Tversky loss (1 - Tversky index) computed over the whole batch.

    Args:
        targets: ground-truth labels (any shape, flattened internally).
        inputs: predicted probabilities, same number of elements as `targets`.
        alpha: weight of the false-positive term.
        beta: weight of the false-negative term.
        smooth: small constant added to numerator and denominator to avoid 0/0.

    Returns:
        Scalar float32 tensor `1 - (TP + smooth) / (TP + alpha*FP + beta*FN + smooth)`.
    """
    # Plain TensorFlow ops are used so the loss only depends on the `tf`
    # import at the top of the notebook, not on a backend alias imported later.
    #flatten label and prediction tensors
    inputs = tf.cast(tf.reshape(inputs, [-1]), dtype="float32")
    targets = tf.cast(tf.reshape(targets, [-1]), dtype="float32")

    #True Positives, False Positives & False Negatives (soft counts)
    TP = tf.reduce_sum(inputs * targets)
    FP = tf.reduce_sum((1 - targets) * inputs)
    FN = tf.reduce_sum(targets * (1 - inputs))

    # NOTE(review): with the defaults, false positives weigh 4x more than false
    # negatives - confirm this is intended for the minority (churn) class.
    Tversky = (TP + smooth) / (TP + alpha*FP + beta*FN + smooth)

    return 1 - Tversky

def DiceLoss(targets, inputs, smooth=1e-6):
    """Dice loss (1 - Dice coefficient) computed over the whole batch.

    Args:
        targets: ground-truth labels (any shape, flattened internally).
        inputs: predicted probabilities, same number of elements as `targets`.
        smooth: small constant added to numerator and denominator to avoid 0/0.

    Returns:
        Scalar float32 tensor `1 - (2*intersection + smooth) / (sum(targets) + sum(inputs) + smooth)`.
    """
    #flatten label and prediction tensors; cast so integer labels can be
    #multiplied with float predictions (same as in TverskyLoss)
    inputs = tf.cast(tf.reshape(inputs, [-1]), dtype="float32")
    targets = tf.cast(tf.reshape(targets, [-1]), dtype="float32")

    # Element-wise product: a matrix product is not defined for two 1-D tensors
    intersection = tf.reduce_sum(targets * inputs)
    dice = (2*intersection + smooth) / (tf.reduce_sum(targets) + tf.reduce_sum(inputs) + smooth)
    return 1 - dice
  

In [None]:
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

from keras.layers import GlobalMaxPooling1D,GlobalAveragePooling1D,MaxPooling2D,AveragePooling2D, Flatten
def build_lstm(input_shape):
  """Build the (uncompiled) binary classifier for the monthly sequence data.

  Architecture: Conv1D(256, kernel 3, relu) -> Conv1D(128, kernel 1, linear)
  -> Flatten -> Dense 256 -> Dense 256 -> Dense 128 -> Dense 64 (all relu)
  -> Dense 1 (sigmoid), with Dropout(0.3) after most layers.
  Despite the name, the LSTM stage is currently disabled.

  Args:
    input_shape: shape of one sample without the batch axis, e.g. (3, 38).

  Returns:
    A Keras `Model` mapping an input of `input_shape` to one sigmoid output per sample.
  """
  x_input = Input(input_shape)
  x = x_input
  x = Conv1D(filters = 256, kernel_size=3, activation = 'relu')(x)
  x = Dropout(0.3)(x)
  x = Conv1D(filters = 128, kernel_size=1, activation = 'linear')(x)
  x = Dropout(0.3)(x)
  # Disabled experiments: an extra Conv1D(32, kernel_size=1, linear) + Dropout(0.3)
  # and an LSTM(160, relu) + Dropout(0.3) stage were tried at this point.
  # Instantiate the layer first, then apply it: Flatten(x) would pass the tensor
  # as a constructor argument and return a layer object instead of a tensor.
  x = Flatten()(x)
  x = Dense(units= 256, activation = 'relu')(x)
  x = Dropout(0.3)(x)
  x = Dense(units= 256, activation = 'relu')(x)
  x = Dropout(0.3)(x)
  x = Dense(units= 128, activation = 'relu')(x)
  x = Dropout(0.3)(x)
  x = Dense(units= 64, activation = 'relu')(x)
  x = Dense(1, activation='sigmoid')(x)

  # Create model
  model = Model(inputs = x_input, outputs = x)

  return model

# initialize optimizer
# Only the learning rate deviates from the Adam defaults. The other arguments
# that used to be spelled out here were defaults, and `decay` / `epsilon=None`
# are rejected by newer Keras optimizers.
adm = tf.keras.optimizers.Adam(learning_rate=0.0001)


# initialize and train model
# Derive the input shape (time steps, features) from the data instead of hard-coding it
model = build_lstm(X_train_lstm.shape[1:])
model.compile(loss=TverskyLoss, optimizer=adm, metrics=[tf.keras.metrics.AUC(),'accuracy'])
model.summary()


model.fit(X_train_lstm,
          y_train_lstm,
          validation_data=(X_val_lstm,y_val_lstm),
          epochs=20,
          batch_size=32,
          class_weight = class_weights_dict
          )

#Show predictive performance
predictions_proba = model.predict(X_val_lstm)
# Threshold the predicted probabilities at 0.5 in one vectorised step
predictions = (np.ravel(predictions_proba) > 0.5).astype(int).tolist()

#AUC is threshold-independent, so it complements the fixed 0.5 cut-off used below
print('Validation ROC AUC: {:.4f}'.format(sklearn.metrics.roc_auc_score(y_val_lstm, np.ravel(predictions_proba))))

print('Overall classification report:')
print(classification_report(y_val_lstm,predictions))

print('\nConfusion matrix:')
sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_val_lstm, predictions, normalize = 'true')
plt.show()

Model: "model_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_30 (InputLayer)       [(None, 3, 38)]           0         
                                                                 
 conv1d_36 (Conv1D)          (None, 1, 256)            29440     
                                                                 
 dropout_73 (Dropout)        (None, 1, 256)            0         
                                                                 
 conv1d_37 (Conv1D)          (None, 1, 128)            32896     
                                                                 
 dropout_74 (Dropout)        (None, 1, 128)            0         
                                                                 
 dense_60 (Dense)            (None, 1, 256)            33024     
                                                                 
 dropout_75 (Dropout)        (None, 1, 256)            0  

ValueError: ignored

In [None]:
# --------------- CREATE PREDICTIONS FOR IN CLASS SUBMISSIONS ---------------------------

# # ---------------- TRANSFORM TEST DATA TO BE IN CORRECT FORMAT FOR LSTM  ------------------------

# #LOAD PREPROCESSED DATASETS
# #Load dataset month 1
# file = open("/content/drive/MyDrive/Studies/Master of AI/Advanced_Analytics/Assignment_1/Test_Data_Cleaned/test_data_robust_1","rb")
# test_df1 = pickle.load(file)
# file.close()

# test_df1.sort_values(by=['client_id'],inplace=True) #Sort values for consistent row positions per customer across dataframes
# client_IDs_X_test_lstm = test_df1['client_id']
# test_df1.drop(["client_id"],inplace=True,axis=1) #Drop client id
# test_df1.reset_index(drop=True,inplace=True)

# #Load dataset month 2
# file = open("/content/drive/MyDrive/Studies/Master of AI/Advanced_Analytics/Assignment_1/Test_Data_Cleaned/test_data_robust_2","rb")
# test_df2 = pickle.load(file)
# file.close()

# test_df2.sort_values(by=['client_id'],inplace=True) #Sort values for consistent row positions per customer across dataframes
# test_df2.drop(["client_id"],inplace=True,axis=1) #Drop client id
# test_df2.reset_index(drop=True,inplace=True)

# #Load dataset month 3
# file = open("/content/drive/MyDrive/Studies/Master of AI/Advanced_Analytics/Assignment_1/Test_Data_Cleaned/test_data_robust_3","rb")
# test_df3 = pickle.load(file)
# file.close()

# test_df3.sort_values(by=['client_id'],inplace=True) #Sort values for consistent row positions per customer across dataframes
# test_df3.drop(["client_id"],inplace=True,axis=1) #Drop client id
# test_df3.reset_index(drop=True,inplace=True)


# #create array to accumulate datasets
# X_test_lstm = np.empty((test_df1.shape[0],3,test_df1.shape[1]),dtype='float')
# #Add all observations from test_df1
# for x in range(len(test_df1)):
#   X_test_lstm[x][0] = test_df1.iloc[x,:]
#   #Add all observations from test_df2
# for x in range(len(test_df2)):
#   X_test_lstm[x][1] = test_df2.iloc[x,:]
# #Add all observations from test_df3
# for x in range(len(test_df3)):
#   X_test_lstm[x][2] = test_df3.iloc[x,:]

# print(X_test_lstm.shape)

# # Pickle the constructed files
# file = open("/content/drive/MyDrive/Studies/Master of AI/Advanced_Analytics/Assignment_1/LSTM_DATA/X_test_lstm_robust",'wb')
# pickle.dump(X_test_lstm,file)
# file.close()

# file = open("/content/drive/MyDrive/Studies/Master of AI/Advanced_Analytics/Assignment_1/LSTM_DATA/client_IDs_X_test_lstm_robust",'wb')
# pickle.dump(np.array(client_IDs_X_test_lstm),file)
# file.close()

# #Read out test file in format for LSTM
# file = open("/content/drive/MyDrive/Studies/Master of AI/Advanced_Analytics/Assignment_1/LSTM_DATA/X_test_lstm_robust",'rb')
# X_test_lstm = pickle.load(file)
# file.close()
# #Read out the corresponding client IDs
# file = open("/content/drive/MyDrive/Studies/Master of AI/Advanced_Analytics/Assignment_1/LSTM_DATA/client_IDs_X_test_lstm_robust",'rb')
# client_IDs_X_test_lstm = pickle.load(file)
# file.close()

# predictions_proba = np.squeeze(model.predict(X_test_lstm))

# #Generate data frame to submit
# submission_arr = np.vstack((client_IDs_X_test_lstm,predictions_proba)).T
# submission_df = pd.DataFrame(submission_arr)
# submission_df = submission_df.rename(columns={0: 'ID', 1: 'PROB'})
# print(submission_df.shape, submission_df.head())

# #Write submission dataframe to csv
# from datetime import date
# import time
# today = date.today()
# today = today.strftime('%y%m%d') 
# time_now = int(time.time())
# suffix = str(today) + '_' + str(time_now)
# filepath = "/content/drive/MyDrive/Studies/Master of AI/Advanced_Analytics/Assignment_1/submissions/submission_file_LSTM_{}".format(suffix)
# print("Path for submission file : {}".format(filepath))

# submission_df.to_csv(filepath,sep=',',header=True,index=False)


(27300, 2) <bound method NDFrame.head of                                      ID PROB
0      00005aaefc4ec95eab088bd07b1ba8c7  0.0
1      0000708c331be17d362afbf8ff78401a  0.0
2      000270b865873ac669df19a0a1a05950  0.0
3      0002754fb482f043f1b0392def2a4ab0  0.0
4      00045042cb39e224e80a7bd0388f127e  0.0
...                                 ...  ...
27295  fffa862e87dc37383f3dfe8f2f7388cf  0.0
27296  fffc95ebdc2673e2b23a510faad11cec  0.0
27297  fffd91890215fe1476650ef1b60f2b11  1.0
27298  fffe766990c6552e78f1bef3c33f7f22  0.0
27299  ffff4a236fc06d1c30c6e8b50752f8bf  0.0

[27300 rows x 2 columns]>
Path for submission file : /content/drive/MyDrive/Studies/Master of AI/Advanced_Analytics/Assignment_1/submissions/submission_file_LSTM_220509_1652101542
