In [None]:
# Pandas for table processing
import pandas as pd
import re
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Packages for auxiliary data science tasks: splitting the dataset into train and test sets and generating metric summaries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, accuracy_score

In [None]:
# Load the glass-type dataset from a CSV file located in the current working directory.
data = pd.read_csv("Glass_Type.csv")

In [None]:
# Display the freshly loaded frame for a quick visual check.
data

In [None]:
# Due to LightGBMError: Do not support special JSON characters in feature name.
# Solution found in: https://stackoverflow.com/questions/60582050/lightgbmerror-do-not-support-special-json-characters-in-feature-name-the-same
# Every run of characters other than ASCII letters, digits and underscores is removed from each column name.
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [None]:
# Number of rows per value of the Type column (class balance check).
data.Type.value_counts()

In [None]:
# Labels encoding from categories to Integers
# NOTE(review): LabelEncoder numbers the classes in sorted order of the original labels;
# the mapping listed below is the author's record of the result - confirm it against
# the fitted encoder's `classes_` before relying on "5 == tableware".
data['Type'] = LabelEncoder().fit_transform(data['Type'])
# 0 -> 'build wind float'
# 1 -> 'build wind non-float'
# 2 -> 'vehic wind float'
# 3 -> containers
# 4 -> headlamps
# 5 -> tableware

In [None]:
def binarize(x):
    """Collapse an encoded class label into a binary flag.

    Returns 0 when ``x`` equals 5 and 1 for every other value.
    """
    return 0 if x == 5 else 1
#data['Type'] = data['Type'].map(binarize)

In [None]:
# Display the frame again, now with sanitized column names and an integer-encoded Type column.
data

In [None]:
# get all data of the omitted class "tableware"
# (the rows whose encoded Type equals 5)
omitted_class = data.loc[data['Type'] == 5]

In [None]:
# Complement of the omitted class: every row whose encoded Type is not 5.
data_without_omitted_class = data.loc[data['Type'] != 5]

In [None]:
# Display the frame that excludes the omitted class.
data_without_omitted_class

In [None]:
# Same display as the previous cell (the frame is unchanged in between).
data_without_omitted_class

In [None]:
# Stratified 95% / 5% train/test split of the data without the omitted class;
# features are every column except "Type", the target is "Type" itself.
X_train, X_test, Y_train, Y_test = train_test_split(
    data_without_omitted_class.drop(columns=["Type"]),
    data_without_omitted_class["Type"],
    train_size=0.95,
    random_state=42,
    stratify=data_without_omitted_class["Type"],
)

In [None]:
# Wrap the training split in LightGBM's Dataset container
d_train = lgb.Dataset(X_train, label=Y_train)

# Hyper-parameters of the multi-class gradient-boosted tree model
params = {
    'learning_rate': 0.03,
    'boosting_type': 'gbdt',      # GradientBoostingDecisionTree
    'objective': 'multiclass',    # multi-class target feature
    'metric': 'multi_logloss',    # metric for multi-class
    'max_depth': 10,
    # Six outputs (encoded labels 0-5) are kept even though the rows of class 5
    # were filtered out of the data this model is trained on.
    'num_class': 6,
}

# Fit the booster for 100 boosting rounds
clf = lgb.train(params, d_train, 100)

# Score the test split; the result holds one row of per-class scores per sample
y_pred = clf.predict(X_test)

In [None]:
# Turn the per-class scores into hard labels: index of the largest score in each row.
# The scores are taken straight from the model instead of from `y_pred`, so re-running
# this cell cannot apply argmax to labels that were already converted (which would
# silently produce all zeros).
y_pred = list(np.argmax(clf.predict(X_test), axis=1))

In [None]:
# Display the predicted labels for the test split.
y_pred

In [None]:
# Macro-averaged precision: per-class precision followed by an unweighted mean.
# scikit-learn's signature is precision_score(y_true, y_pred); with the arguments
# swapped the value computed is the macro-averaged recall instead.
precision_score(Y_test, y_pred, average=None).mean()

In [None]:
# Per-class precision / recall / F1 summary for the test split.
print(classification_report(Y_test, y_pred))

In [None]:
# Score the test split again to keep the raw per-class scores under their own name
# (`y_pred` holds hard labels by this point).
classification_result = clf.predict(X_test)

In [None]:
# Confidence score (CS) of the winning class for every test sample: the row-wise
# maximum (axis=1) of the score matrix, for the case where the trained classifier
# was tested on the dataset without the omitted class.
winningClassCs = classification_result.max(axis=1)
print(winningClassCs)
print(winningClassCs.shape[0])

In [None]:
# Overall accuracy on the test split.
accuracy_score(Y_test, y_pred)

In [None]:
# Test on omitted class
# Features and label are selected by column name, exactly as for the train/test
# split, instead of assuming that "Type" is the last column of the frame.
X_test_omitted = omitted_class.drop(columns=["Type"])  # all feature columns
Y_test_omitted = omitted_class[["Type"]]               # label kept as a single-column frame
y_pred_omitted = clf.predict(X_test_omitted)

In [None]:
# Turn the per-class scores into hard labels: index of the largest score in each row.
# The scores are taken straight from the model instead of from `y_pred_omitted`, so
# re-running this cell cannot apply argmax to labels that were already converted.
y_pred_omitted = list(np.argmax(clf.predict(X_test_omitted), axis=1))

In [None]:
# Classification summary for the omitted-class rows only.
# NOTE(review): every true label here is the omitted class, which the model was not
# trained on - the per-class metrics are presumably only illustrative; confirm intent.
print(classification_report(Y_test_omitted, y_pred_omitted))

In [None]:
# Score the omitted-class rows again to keep the raw per-class scores under their own name
# (`y_pred_omitted` holds hard labels by this point).
classification_result_omitted = clf.predict(X_test_omitted)

In [None]:
# Get the max value from each row of the score matrix (axis=1 selects the row-wise max).
# These are the confidence scores (CS) of the winning classes for the case where
# the trained classifier was tested on the omitted class dataset.
winningClassCsOmitted = np.amax(classification_result_omitted, axis=1)


In [None]:
# Histogram (with KDE overlay) of the winning-class confidence scores on the
# non-omitted test data.
# The former `sns.load_dataset("iris")` call was dropped: its result was never used
# and it fetched an unrelated dataset on every run.
sns.set(style="darkgrid")  # grey background (use sns.set_theme() if seaborn version 0.11.0 or above)
plt.figure(figsize = (15,8))

sns.histplot(data=winningClassCs, color="skyblue", label="non-omitted", kde=True)

plt.xlabel("Winning-class confidence score")
plt.legend()
plt.show()

In [None]:
# Histogram (with KDE overlay) of the winning-class confidence scores on the
# omitted-class data.
# The former `sns.load_dataset("iris")` call was dropped: its result was never used
# and it fetched an unrelated dataset on every run.
sns.set(style="darkgrid")  # grey background (use sns.set_theme() if seaborn version 0.11.0 or above)
plt.figure(figsize = (15,8))

sns.histplot(data=winningClassCsOmitted, color="red", label="omitted", kde=True)

plt.xlabel("Winning-class confidence score")
plt.legend()
plt.show()

In [None]:
# Overlay of both confidence-score histograms (non-omitted vs. omitted class) on one
# figure so the two distributions can be compared directly.
# The former `sns.load_dataset("iris")` call was dropped: its result was never used
# and it fetched an unrelated dataset on every run.
sns.set(style="darkgrid")  # grey background (use sns.set_theme() if seaborn version 0.11.0 or above)
plt.figure(figsize = (15,8))

sns.histplot(data=winningClassCs, color="skyblue", label="non-omitted", kde=True)
sns.histplot(data=winningClassCsOmitted, color="red", label="omitted", kde=True)

plt.xlabel("Winning-class confidence score")
plt.legend()
plt.show()

In [None]:
import numpy as np
import scipy.stats


def mean_confidence_interval(data, confidence=0.95):
    """Return the sample mean with a two-sided Student-t confidence interval.

    Parameters
    ----------
    data : array-like of numbers
        The sample.
    confidence : float, optional
        Confidence level of the interval (0.95 by default).

    Returns
    -------
    tuple
        ``(mean, lower_bound, upper_bound)``.
    """
    samples = 1.0 * np.array(data)
    center = np.mean(samples)
    std_err = scipy.stats.sem(samples)
    # Critical t value for a two-sided interval with len(samples) - 1 degrees of freedom
    t_crit = scipy.stats.t.ppf((1 + confidence) / 2., len(samples) - 1)
    half_width = std_err * t_crit
    return center, center - half_width, center + half_width


In [None]:
# Mean and 95% confidence interval (the function's default level) of the non-omitted winning-class scores.
mean_confidence_interval(winningClassCs)

#  Drift simulation

In [None]:
# Number of non-omitted scores after capping at 9200, and number of omitted-class scores.
len(winningClassCs[:9200]), len(winningClassCsOmitted)

In [None]:
# Split the winning scores of non omitted data into batches of length n.
# The number of batches is derived from the data and n instead of being hard-coded:
# at most the first 9200 scores are used (460 batches of 20, as before), and scores
# that do not fill a complete batch are dropped. With fewer than 9200 scores the
# cell still works instead of failing in reshape.
n=20
max_scores = 9200
n_batches = min(len(winningClassCs), max_scores) // n
winningClassCsInBatches = winningClassCs[:n_batches * n].reshape(n_batches, n)

In [None]:
def sudden_quarter(clean_batches=None, omitted_scores=None, changepoint=230, drifted_per_batch=5):
    """Simulate a sudden drift that replaces part of every batch after a changepoint.

    Batches before ``changepoint`` are returned untouched. In every later batch the
    trailing ``drifted_per_batch`` scores are replaced by consecutive scores of the
    omitted class. With the defaults and batches of 20 scores this swaps 5 of 20
    scores (a quarter) from batch 230 onwards.

    Parameters
    ----------
    clean_batches : 2-D array, optional
        Confidence scores of non-omitted data, one batch per row. Defaults to the
        notebook-level ``winningClassCsInBatches``.
    omitted_scores : 1-D array, optional
        Confidence scores of the omitted class. Defaults to the notebook-level
        ``winningClassCsOmitted``.
    changepoint : int, optional
        Index of the first drifted batch.
    drifted_per_batch : int, optional
        Number of scores per drifted batch taken from ``omitted_scores``.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``clean_batches``.

    Raises
    ------
    ValueError
        If ``drifted_per_batch`` does not fit into a batch, or if there are fewer
        omitted scores than the drifted batches require.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    if clean_batches is None:
        clean_batches = winningClassCsInBatches
    if omitted_scores is None:
        omitted_scores = winningClassCsOmitted
    clean_batches = np.asarray(clean_batches)
    omitted_scores = np.asarray(omitted_scores)

    batch_size = clean_batches.shape[1]
    if not 0 <= drifted_per_batch <= batch_size:
        raise ValueError(
            "drifted_per_batch must be between 0 and the batch size (%d), got %d"
            % (batch_size, drifted_per_batch)
        )

    #Lists of before and after the changepoint
    befor_cp = clean_batches[:changepoint]
    after_cp = clean_batches[changepoint:]

    needed = drifted_per_batch * len(after_cp)
    if len(omitted_scores) < needed:
        raise ValueError(
            "need %d omitted-class scores, only %d available" % (needed, len(omitted_scores))
        )
    omittedToBeStacked = omitted_scores[:needed].reshape(len(after_cp), drifted_per_batch)
    print(befor_cp.shape)

    # Keep the leading clean scores of each drifted batch and append the omitted ones.
    x = np.concatenate((after_cp[:, :batch_size - drifted_per_batch], omittedToBeStacked), axis=1)
    print(x.shape)
    x = np.concatenate([befor_cp, x])
    return x
# Build the drifted score matrix and print it (the function itself also prints two intermediate shapes).
print(sudden_quarter())

In [None]:
# 461 is the length of clean batches
# d = 230: the batch index of the changepoint
# d <= 230: no drift
# d > 230: drift with proportion p

'''
sudden_quarter = for i in winningClassCsInBatches[:230]
sudden_half = 
sudden_full = 
'''

In [None]:
# Check that rpy2 is importable and report its installed version.
import rpy2
print(rpy2.__version__)

In [None]:
# Load two R packages through rpy2 and bind them to Python names.
from rpy2.robjects.packages import importr
# import R's "base" package
base = importr('base')

# import R's "utils" package
utils = importr('utils')

In [None]:
# import rpy2's package module
import rpy2.robjects.packages as rpackages

# import R's utility package (rebinds `utils` to the same R package imported in the previous cell)
utils = rpackages.importr('utils')

# select a mirror for R packages
utils.chooseCRANmirror(ind=1) # select the first mirror in the list

In [None]:
# R package names
# A one-element tuple needs the trailing comma: ('cpm') is just the string 'cpm', and
# iterating over it would test/install the single characters 'c', 'p' and 'm'.
packnames = ('cpm',)

# R vector of strings
from rpy2.robjects.vectors import StrVector

# Install only the packages that are not already present in the R library.
names_to_install = [x for x in packnames if not rpackages.isinstalled(x)]
if len(names_to_install) > 0:
    utils.install_packages(StrVector(names_to_install))

In [None]:
# Load the R package "cpm" and bind it to a Python name.
cpm = rpackages.importr("cpm")

In [None]:
# Small hand-written sample (10 values) converted to an R numeric vector.
from rpy2.robjects import FloatVector
ctl = FloatVector([4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14])

In [None]:
# Run cpm's detectChangePoint on the sample vector with the "Student" statistic, ARL0=500 and startup=20.
# NOTE(review): `ctl` holds only 10 observations, fewer than startup=20 - presumably a
# smoke test of the R bridge rather than a meaningful detection; confirm.
a = cpm.detectChangePoint(ctl,"Student",ARL0=500,startup=20)

In [None]:
# Convert the R result to a NumPy array, wrap it in a DataFrame and display it.
dataset = pd.DataFrame(np.asarray(a))
dataset

In [None]:
# Keep the simulated drift matrix for the following cells.
x = sudden_quarter()

In [None]:
# Shape of the drift matrix returned by sudden_quarter().
x.shape

In [None]:
# Length of the same data flattened into a single 1-D sequence.
x.flatten().shape