# Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import scipy.io
from scipy.spatial.distance import pdist
from scipy.linalg import cholesky
import matlab.engine as engi
import matlab as mat
import math
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from scipy.io import loadmat

# Start the MATLAB engine

In [2]:
# Launch a MATLAB engine session; `eng` is the handle later cells use to call MATLAB functions.
eng = engi.start_matlab()

# variables

In [3]:
# CSV files read by load_data() for the source and the target project.
source_data_path = 'data/Relink/Apache/Apache.csv'
target_data_path = 'data/Jureczko/ant/ant-1.7.csv'
# NOTE(review): result_path, repeats, ratio and reg are not referenced in the
# visible cells — presumably an output file, the number of experiment
# repetitions, a sampling ratio and a regularisation constant; confirm.
result_path = 'result/result.csv'
repeats = 20
ratio = 0.1
# NOTE(review): presumably the rank handed to km_kernel_icd(..., lrank) — confirm.
lrank = 70
reg = 1E-5

# Data loading and normalization

In [4]:
def load_data(path):
    """Read a CSV file and strip '$', '>' and '<' from every column name.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded table with the cleaned column names.
    """
    frame = pd.read_csv(path)
    # One translation table removes all three unwanted characters in a single pass.
    unwanted = str.maketrans('', '', '$><')
    frame.columns = [name.translate(unwanted) for name in frame.columns]
    return frame

def normalize_data(df,target_class):
    """Split a table into z-scored features and its label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns plus the label column.
    target_class : str
        Name of the label column.

    Returns
    -------
    tuple
        ``(features, labels)``: ``features`` is a new DataFrame of the
        z-scored feature columns (built from a plain array, so it carries a
        fresh default index); ``labels`` is ``df[target_class]`` unchanged.
    """
    labels = df[target_class]
    features = df.drop(target_class, axis=1)
    # scipy's zscore standardises along axis 0 by default, i.e. per column.
    scaled = stats.zscore(features.values.tolist())
    return pd.DataFrame(scaled, columns=features.columns), labels

In [5]:
# Load the source and target projects (in that order), then split each one
# into z-scored features and its label column.
source_df, target_df = (
    load_data(csv_path) for csv_path in (source_data_path, target_data_path)
)
source_X, source_y = normalize_data(source_df, 'Defects')
target_X, target_y = normalize_data(target_df, 'bug')

# Sanity check: (rows, feature columns) of both feature tables.
print(source_X.shape, target_X.shape)

(194, 26) (745, 20)


# set up kernel

In [6]:
def set_kernel(df):
    """Build the Gaussian-kernel settings for a data set.

    The kernel parameter is the reciprocal of the mean pairwise distance
    between the rows of ``df`` (as computed by ``pdist``), rounded to four
    decimals.

    Parameters
    ----------
    df : array-like
        Samples, one per row.

    Returns
    -------
    dict
        ``{'kernel_type': 'gauss', 'kernel_parameter': <float>}``.
    """
    mean_distance = np.mean(pdist(df))
    return dict(kernel_type='gauss', kernel_parameter=round(1/mean_distance, 4))

In [7]:
# Derive one Gaussian-kernel setting per data set from its own feature table.
source_df_kernel, target_df_kernel = set_kernel(source_X), set_kernel(target_X)

# decomposing kernel correlation metrics

## Python substitution

In [None]:
def km_kernel(X1,X2,kernel_type,kernel_paramater):
    """Evaluate a Gaussian kernel between the rows of ``X1`` and ``X2``.

    Parameters
    ----------
    X1, X2 : array-like, 2-D
        Samples, one per row. For ``'gauss-diag'`` both must have the same
        shape (or be broadcastable against each other).
    kernel_type : str
        ``'gauss'`` for the full kernel matrix, ``'gauss-diag'`` for the
        row-by-row kernel values only (the diagonal of the full matrix when
        ``X1`` and ``X2`` hold the same number of rows).
    kernel_paramater : float
        Kernel width ``sgm`` in ``exp(-||x - y||^2 / (2 * sgm**2))``.

    Returns
    -------
    numpy.ndarray
        Shape ``(len(X1), len(X2))`` for ``'gauss'``; shape ``(len(X1),)``
        for ``'gauss-diag'``.

    Raises
    ------
    ValueError
        If ``kernel_type`` is not one of the supported names.
    """
    X1 = np.asarray(X1, dtype=float)
    X2 = np.asarray(X2, dtype=float)
    sgm = kernel_paramater  # kernel width

    if kernel_type == 'gauss':
        # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, evaluated for all pairs at once.
        norms1 = np.sum(X1**2, axis=1)
        norms2 = np.sum(X2**2, axis=1)
        distmat = norms1[:, np.newaxis] + norms2[np.newaxis, :] - 2.0*np.dot(X1, X2.T)
        # Rounding can push an exact zero slightly negative; a squared distance never is.
        distmat = np.maximum(distmat, 0.0)
        return np.exp(-distmat/(2*sgm**2))

    if kernel_type == 'gauss-diag':
        # Keep the exponent as a float: truncating it to int distorts every kernel value.
        return np.exp(-np.sum((X1 - X2)**2, axis=1)/(2*sgm**2))

    raise ValueError('unsupported kernel type: %r' % (kernel_type,))


def km_kernel_icd(X,kernel,lrank):
    """Pivoted incomplete Cholesky decomposition of the kernel matrix of ``X``.

    Builds ``G`` column by column so that ``G.dot(G.T)`` approximates the
    kernel matrix ``K`` with ``K[a, b] = k(X[a], X[b])``, always pivoting on
    the sample with the largest remaining residual diagonal.

    Parameters
    ----------
    X : array-like, 2-D
        Samples, one per row.
    kernel : dict
        Must provide ``'kernel_type'`` and ``'kernel_parameter'`` (the format
        produced by ``set_kernel``).
    lrank : int
        Maximum number of columns of ``G``; values above the number of
        samples are clamped to it.

    Returns
    -------
    G : numpy.ndarray
        Shape ``(n_samples, r)`` with ``r <= lrank``; rows are in the same
        order as the rows of ``X``. ``r`` is smaller than ``lrank`` when the
        residual trace drops to the precision threshold early.
    subset : numpy.ndarray of int
        Row indices into ``X`` of the pivots, in the order they were chosen.
    """
    kernel_type = kernel['kernel_type']
    kernel_paramater = kernel['kernel_parameter']
    X = np.asarray(X, dtype=float)
    n = X.shape[0]
    rank = min(lrank, n)
    precision = 10**-6

    d = np.zeros((n,))  # diagonal of the residual kernel matrix
    G = np.zeros((n, rank))
    subset = np.zeros((rank,), dtype=int)
    perm = np.arange(n)  # perm[k] is the row of X currently held at position k

    for i in range(rank):
        x = X[perm[i:]]
        # Residual diagonal = kernel diagonal minus what the first i columns already explain.
        d[i:] = (km_kernel(x, x, kernel_type + '-diag', kernel_paramater)
                 - np.sum(G[i:, :i]**2, axis=1))

        dtrace = np.sum(d[i:])

        if dtrace <= 0:
            print('Negative diagonal entry', dtrace)

        if dtrace <= precision:
            # Nothing left to approximate: drop the unused columns and stop.
            G = G[:, :i]
            subset = subset[:i]
            break

        # Pivot on the largest residual diagonal entry.
        j = i + int(np.argmax(d[i:]))
        m1 = d[j]**0.5
        subset[i] = perm[j]

        # Bring the pivot to position i (fancy indexing copies the right-hand side first).
        perm[[i, j]] = perm[[j, i]]
        G[[i, j], :i] = G[[j, i], :i]
        G[i, i] = m1

        # Fill column i for all positions that have not been pivots yet.
        z1 = km_kernel(X[perm[i:i + 1]], X[perm[i + 1:]], kernel_type, kernel_paramater)
        z2 = np.dot(G[i + 1:, :i], G[i, :i])
        G[i + 1:, i] = (z1.ravel() - z2)/m1

    # Undo the pivoting so that row k of G belongs to row k of X.
    G = G[np.argsort(perm)]
    return G, subset
        
        
def conKernelMatrix(source_X,target_X,source_df_kernel,target_df_kernel,lrank):
    """Unfinished stub: only reads the row counts of both inputs.

    The kernel settings and ``lrank`` are accepted but not used yet, and the
    function currently returns ``None``.
    """
    source_X_size, target_X_size = source_X.shape[0], target_X.shape[0]
    

    
    

## MATLAB integration

In [13]:
# Hand both complete tables (label column included) to the MATLAB routine
# CTKCCA. Each table is transposed first, so MATLAB receives one column per row
# of the DataFrame.
mat_source_df = mat.double(source_df.values.T.tolist())
mat_target_df = mat.double(target_df.values.T.tolist())

# nargout=4 makes the engine return a tuple of four outputs; convert each one
# to a NumPy array instead of unpacking only the first output.
# NOTE(review): the output order (train_X, train_y, test_X, test_y) is assumed
# from the variable names — confirm against the CTKCCA.m signature.
X = eng.CTKCCA(mat_source_df, mat_target_df, nargout=4)
train_X, train_y, test_X, test_y = (np.array(out) for out in X)

In [None]:
# Fit a logistic-regression classifier on the source-side features and labels.
# NOTE(review): G1 is not assigned in any visible cell — presumably the
# projected source features; confirm where it comes from.
clf = LogisticRegression()
clf.fit(G1,source_y)

In [None]:
# Predict labels for the target-side features and print per-class metrics
# against the target labels.
# NOTE(review): G2 is not assigned in any visible cell — presumably the
# projected target features; confirm where it comes from.
predicted = clf.predict(G2)
print(classification_report(target_y, predicted))

In [None]:
# Separate the raw (un-normalised) source table into features and labels.
source_df_X = source_df.drop(columns='Defects')
source_df_y = source_df['Defects']

In [None]:
# Show the source table as a nested list after transposing it (one inner list
# per DataFrame column) — the same form that is passed to mat.double.
source_df.values.T.tolist()