# Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import scipy.io
from scipy.spatial.distance import pdist
from scipy.linalg import cholesky
import matlab.engine as engi
import matlab as mat
import math
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from scipy.io import loadmat
import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

ModuleNotFoundError: No module named 'SMOTE'

# Start matlab service

In [None]:
# Start a MATLAB engine session; later cells call MATLAB functions through `eng`.
eng = engi.start_matlab()
# Add the local MATLAB source folders to the engine's search path
# (presumably where the CTKCCA and HDP_KS functions called below live).
# nargout=0 tells the engine that no return value is requested.
eng.addpath(r'matlab_CTKCCA/',nargout=0)
eng.addpath(r'matlab_KS/',nargout=0)

# Variables

In [5]:
# Input CSVs (relative paths). The "source" file is loaded as the training
# project and the "target" file as the test project in the cells below.
source_data_path = 'data/1385/converted/aff4.csv'
target_data_path = 'data/1385/converted/scite-ru.csv'

# NOTE(review): none of the settings below are referenced by the cells visible
# in this notebook, so their intended meaning cannot be confirmed from here.
result_path = 'result/result.csv'
repeats = 20
ratio = 0.1
lrank = 70
reg = 1E-5

# Data loading and Normalizing Data

In [6]:
def load_data(path,source):
    """Read a metrics CSV and return the metric columns plus a boolean 'Buggy' label.

    Steps, in order: drop the project-metadata columns, drop every row that
    still contains a missing value, keep the metric columns listed below (in
    that order, with 'Buggy' last) and convert the 'Buggy' text labels to
    booleans.

    Parameters
    ----------
    path : str
        Location of the CSV file; passed straight to ``pd.read_csv``.
    source : bool
        When truthy, the cleaned frame is passed through ``apply_smote`` and
        that result is returned instead.

    Returns
    -------
    pandas.DataFrame
        The selected metric columns followed by 'Buggy'
        (True for 'buggy', False for 'clean').

    Raises
    ------
    ValueError
        If a remaining row has a 'Buggy' value other than 'buggy' or 'clean'.
    KeyError
        Raised by pandas if a metadata or metric column is missing.
    """
    metadata_columns = ['Host','Vcs','Project','File','PL','IssueTracking']
    selected_columns = ['TLOC', 'TNF', 'TNC', 'TND', 'LOC', 'CL', 'NStmt', 'NFunc',
    'RCC', 'MNL', 'avg_WMC', 'max_WMC', 'total_WMC', 'avg_DIT', 'max_DIT',
    'total_DIT', 'avg_RFC', 'max_RFC', 'total_RFC', 'avg_NOC', 'max_NOC',
    'total_NOC', 'avg_CBO', 'max_CBO', 'total_CBO', 'avg_DIT.1',
    'max_DIT.1', 'total_DIT.1', 'avg_NIV', 'max_NIV', 'total_NIV',
    'avg_NIM', 'max_NIM', 'total_NIM', 'avg_NOM', 'max_NOM', 'total_NOM',
    'avg_NPBM', 'max_NPBM', 'total_NPBM', 'avg_NPM', 'max_NPM', 'total_NPM',
    'avg_NPRM', 'max_NPRM', 'total_NPRM', 'avg_CC', 'max_CC', 'total_CC',
    'avg_FANIN', 'max_FANIN', 'total_FANIN', 'avg_FANOUT', 'max_FANOUT',
    'total_FANOUT', 'NRev', 'NFix', 'avg_AddedLOC', 'max_AddedLOC',
    'total_AddedLOC', 'avg_DeletedLOC', 'max_DeletedLOC',
    'total_DeletedLOC', 'avg_ModifiedLOC', 'max_ModifiedLOC',
    'total_ModifiedLOC','Buggy']
    label_map = {'buggy': True, 'clean': False}

    df = pd.read_csv(path)
    df = df.drop(labels = metadata_columns,axis=1)
    # Missing values are removed before the column selection, so a gap in any
    # non-metadata column discards the whole row.
    df = df.dropna()
    df = df[selected_columns]

    # Series.map turns every label that is not a key of label_map into NaN.
    # Because dropna() has already run, such NaN labels would silently reach
    # the downstream steps, so reject them here with a clear message instead.
    labels = df['Buggy']
    unexpected = labels[~labels.isin(list(label_map))].unique()
    if len(unexpected) > 0:
        raise ValueError(
            "Unexpected 'Buggy' label(s) in {!r}: {}; expected one of {}".format(
                path, sorted(str(v) for v in unexpected), sorted(label_map)))

    df['Buggy'] = labels.map(label_map)
    if source:
        df = apply_smote(df)
    return df
def apply_smote(df):
    """Run the project's ``SMOTE.smote`` helper on ``df`` and restore its column labels.

    The helper lives in the local ``SMOTE`` module (not shown here; presumably
    it oversamples the minority class -- confirm against that module). The
    frame returned by its ``run()`` method is relabelled with the column
    labels ``df`` had on entry.
    """
    # Capture the labels first so they are the ones seen before the helper runs.
    original_columns = df.columns
    resampled = SMOTE.smote(df).run()
    resampled.columns = original_columns
    return resampled

In [7]:
# Both frames are loaded with source=False, so load_data does not call
# apply_smote here; apply_smote is instead applied to the transformed
# training data further down.
source_df = load_data(source_data_path,False)
target_df = load_data(target_data_path,False)

FileNotFoundError: [Errno 2] File b'data/1385/converted/aff4.csv' does not exist: b'data/1385/converted/aff4.csv'

# Matlab integration
## Matlab integration - CTKCCA

In [25]:
def transform_data(source_df,target_df):
    """Pass the source and target frames through the MATLAB ``CTKCCA`` function.

    Each frame's values are transposed, so every inner list handed to MATLAB
    holds one DataFrame column. Four outputs are requested from the engine and
    returned as ``(train_X, train_y, test_X, test_y)``: the feature outputs as
    NumPy arrays, the label outputs as plain lists.

    NOTE(review): a later cell redefines ``transform_data`` to call ``HDP_KS``;
    whichever definition ran last is the one in effect.
    """
    source_matrix = mat.double(source_df.values.T.tolist())
    target_matrix = mat.double(target_df.values.T.tolist())
    outputs = eng.CTKCCA(source_matrix, target_matrix, nargout=4)

    train_X = np.array(outputs[0])
    test_X = np.array(outputs[2])
    # Only the first row of each label output is kept
    # (assumes MATLAB returns the labels as a 1 x n matrix -- TODO confirm).
    train_y = np.array(outputs[1]).tolist()[0]
    test_y = np.array(outputs[3]).tolist()[0]
    return train_X, train_y, test_X, test_y
# Transform source/target with the transform_data currently in scope.
trasformed_train_X,trasformed_train_y,trasformed_test_X,trasformed_test_y = transform_data(source_df,target_df)
# Re-attach the labels as a 'Buggy' column so features and labels go through
# apply_smote together; only the training split is passed to apply_smote.
train_df = pd.DataFrame(trasformed_train_X)
train_df['Buggy'] = trasformed_train_y
train_df = apply_smote(train_df)
# Split the frame returned by apply_smote back into labels and features.
trasformed_train_y = train_df.Buggy
trasformed_train_X = train_df.drop('Buggy',axis = 1)

In [28]:
# Fit a logistic regression (scikit-learn defaults) on the data currently held in
# trasformed_train_X / trasformed_train_y, predict on trasformed_test_X and print
# the per-class precision/recall/F1 report.
# NOTE(review): these variable names are reused by the KS section below, so the
# data used here depends on which transform cell was executed last.
clf = LogisticRegression()
clf.fit(trasformed_train_X,trasformed_train_y)
predicted = clf.predict(trasformed_test_X)
print(classification_report(trasformed_test_y, predicted))

              precision    recall  f1-score   support

         0.0       0.66      0.67      0.67        52
         1.0       0.15      0.14      0.15        21

    accuracy                           0.52        73
   macro avg       0.41      0.41      0.41        73
weighted avg       0.51      0.52      0.52        73



## Matlab integration - KS

In [29]:
def transform_data(source_df,target_df):
    """Pass the source and target frames through the MATLAB ``HDP_KS`` function.

    Both frames are converted to ``mat.double`` matrices built from their
    transposed values (one inner list per DataFrame column). Four outputs are
    requested from the engine and returned as
    ``(train_X, train_y, test_X, test_y)``: features as NumPy arrays, labels
    as plain lists.

    NOTE(review): this definition replaces the ``transform_data`` defined in
    an earlier cell, which calls ``CTKCCA``.
    """
    def as_matlab_matrix(frame):
        # One inner list per DataFrame column.
        return mat.double(frame.values.T.tolist())

    result = eng.HDP_KS(as_matlab_matrix(source_df), as_matlab_matrix(target_df), nargout=4)
    feature_blocks = [np.array(result[position]) for position in (0, 2)]
    # Keep only the first row of each label output
    # (assumes a 1 x n label matrix from MATLAB -- TODO confirm).
    label_lists = [np.array(result[position]).tolist()[0] for position in (1, 3)]
    return feature_blocks[0], label_lists[0], feature_blocks[1], label_lists[1]
# Transform source/target with the transform_data currently in scope; this
# overwrites the trasformed_* variables produced by the CTKCCA section.
trasformed_train_X,trasformed_train_y,trasformed_test_X,trasformed_test_y = transform_data(source_df,target_df)
# Re-attach the labels as a 'Buggy' column so features and labels go through
# apply_smote together; only the training split is passed to apply_smote.
train_df = pd.DataFrame(trasformed_train_X)
train_df['Buggy'] = trasformed_train_y
train_df = apply_smote(train_df)
# Split the frame returned by apply_smote back into labels and features.
trasformed_train_y = train_df.Buggy
trasformed_train_X = train_df.drop('Buggy',axis = 1)

In [32]:
# Fit a logistic regression (scikit-learn defaults) on the data currently held in
# trasformed_train_X / trasformed_train_y, predict on trasformed_test_X and print
# the per-class precision/recall/F1 report.
# NOTE(review): the same variable names are written by the CTKCCA section, so
# the data used here depends on which transform cell was executed last.
clf = LogisticRegression()
clf.fit(trasformed_train_X,trasformed_train_y)
predicted = clf.predict(trasformed_test_X)
print(classification_report(trasformed_test_y, predicted))

              precision    recall  f1-score   support

         0.0       0.70      0.31      0.43        52
         1.0       0.28      0.67      0.39        21

    accuracy                           0.41        73
   macro avg       0.49      0.49      0.41        73
weighted avg       0.58      0.41      0.42        73



# Testing using original data

## Get train/test data

In [None]:
def get_train_test_data(source_df,target_df):
    """Split the source and target frames into features and 'Buggy' labels.

    The source frame supplies the training split and the target frame the
    test split. Neither input frame is modified.

    Returns
    -------
    tuple
        ``(train_X, train_y, test_X, test_y)`` where each ``*_y`` is the
        'Buggy' column and each ``*_X`` is the frame without that column.
    """
    def split_features_labels(frame):
        # Read the label column first, then build the feature frame without it.
        labels = frame.Buggy
        features = frame.drop('Buggy', axis=1)
        return features, labels

    train_X, train_y = split_features_labels(source_df)
    test_X, test_y = split_features_labels(target_df)
    return train_X, train_y, test_X, test_y

In [None]:
# Baseline without any MATLAB transform: split the loaded source/target frames
# directly into features and labels.
train_X,train_y,test_X,test_y = get_train_test_data(source_df,target_df)

In [None]:
# Baseline classifier: logistic regression (scikit-learn defaults) fitted on the
# original source metrics and evaluated on the original target metrics.
clf = LogisticRegression()
clf.fit(train_X,train_y)
predicted = clf.predict(test_X)
print(classification_report(test_y, predicted))

In [None]:
# Class balance of the training labels: shapes of the True (buggy) and False (clean) subsets.
train_y[train_y == True].shape,train_y[train_y == False].shape

In [None]:
# Class balance of the test labels: shapes of the True (buggy) and False (clean) subsets.
test_y[test_y == True].shape,test_y[test_y == False].shape