# Importing required libraries

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import neural_network
import train_neural_network
import plot

import warnings
warnings.filterwarnings("ignore")

# Reading and dividing the data into train and test

In [2]:
df = pd.read_csv('weighted_webex_nov19_features.csv')
Y = df["ONAed"]
X = df.drop(["Email", "User", "ONAed"], axis =1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

# Training Logistic Regression

In [3]:
lr_model = LogisticRegression(solver='lbfgs', max_iter=10000)
lr_model.fit(X_train.values, Y_train.values)

LogisticRegression(max_iter=10000)

# Converting the dataframe into tensors and training the neural network

In [4]:
train_dataset = neural_network.makeDataset(X_train.values, Y_train.values)
train_loader = torch.utils.data.DataLoader(train_dataset,batch_size=15,shuffle=True)

test_dataset = neural_network.makeDataset(X_test.values, Y_test.values)
test_loader = torch.utils.data.DataLoader(test_dataset,batch_size=1,shuffle=False)

nn_model = neural_network.makeModel(X_train.shape[1], [400, 200, 100, 50], 1)

gpu = True
gpu = gpu and torch.cuda.is_available() # to know if you actually can use the GPU
net, loss_list, index_list = train_neural_network.training_routine(nn_model, train_loader, 10, gpu)

---------------------------------------------------------------------
Epoch:  1
---------------------------------------------------------------------
Running Loss:  1.5063525363419887
---------------------------------------------------------------------
Epoch:  2
---------------------------------------------------------------------
Running Loss:  1.3492705742444566
---------------------------------------------------------------------
Epoch:  3
---------------------------------------------------------------------
Running Loss:  1.2761428277145166
---------------------------------------------------------------------
Epoch:  4
---------------------------------------------------------------------
Running Loss:  1.246924603755107
---------------------------------------------------------------------
Epoch:  5
---------------------------------------------------------------------
Running Loss:  1.235617983153229
---------------------------------------------------------------------
Epoch:  6
--

# Converting numpy array to tensors for explaining predictions

In [5]:
class makeDataset(Dataset):
    def __init__(self, x):
        self.x = x

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx]

# Initializing the Lime Explainer for Tabular Datasets

In [6]:
import lime
import lime.lime_tabular

explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values,
                                                   feature_names=list(X_train.columns))

# Writing a custom function that returns the weighted sum of probabilites from Neural Network and Logistic Regression and passing that function to the Lime Explainer 

In [18]:
def essemble_score(x):
    dataset = makeDataset(x)
    loader = DataLoader(dataset, batch_size=1, shuffle=False)
    lr_pred = lr_model.predict_proba(x)
    for y in loader:
        nn_pred = float(str(net(y.float()).cpu().detach().numpy())[2:-2])

    score = (lr_pred + nn_pred) / 2
    
    if score[:,1][0] >=0.5:
        return score
    elif score[:,1][0] >= 0.04:
        return 1 - score
    else:
        return score

fitted_results = lambda x: essemble_score(x)
exp = explainer.explain_instance(X_test.values[1], fitted_results)

# Modify the labels based on the threshold we want to choose and change the labels accordingly

In [23]:
def check_prob(prob):
    if prob[1] > 0.1:
        prob[1] = prob[1]
        prob[0] = 1 - prob[1]
        return prob
    else:
        prob[0] = 1 - prob[1]
        return prob
    
def plot_para(prob, l):
    if prob[0] < prob[1]:
        pos = l[l[1] > 0]
        neg = l[l[1] < 0]
    else:
        pos = l[l[1] > 0]
        neg = l[l[1] < 0]
    return pos, neg

In [24]:
prob = exp.predict_proba
prob = check_prob(prob)
prob

array([0.13451405, 0.86548595])

In [20]:
l = pd.DataFrame(exp.as_list())
l

Unnamed: 0,0,1
0,SLisDL <= 0.00,0.079201
1,weighted <= 0.00,0.064824
2,SLisSL > 0.00,-0.045456
3,DLisDL > 0.00,-0.024363
4,DirectMember <= 0.00,0.0171
5,DirectLeader <= 0.00,0.014703
6,duration_six_eight <= 0.00,-0.009478
7,duration_one_five <= 0.00,0.005294


# Modifying the labels based on the probability score 

In [21]:
probability = {}
probability['neg'] = prob[0]
probability['pos'] = prob[1]

pos, neg = plot_para(prob, l)

# Plotting the results 

In [25]:
plot.plot_details(neg, pos)
