In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import scipy.io
from scipy.spatial.distance import pdist
from scipy.linalg import cholesky
import matlab.engine as engi
import matlab as mat
import math
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from scipy.io import loadmat
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
from sys import stdout
from sklearn.metrics import pairwise_kernels
from sklearn.cluster import DBSCAN

import platform
from os import listdir
from os.path import isfile, join
from glob import glob
from pathlib import Path
import sys
import os
import copy
import traceback
from pathlib import Path

eng = engi.start_matlab()
eng.addpath(r'matlab_CTKCCA/',nargout=0)

In [2]:
data_source = 'data/All'
if platform.system() == 'Darwin' or platform.system() == 'Linux':
    _dir = data_source + '/'
else:
    _dir = data_source + '\\'

datasets = [(join(_dir, f)) for f in listdir(_dir) if  not Path(join(_dir, f)).is_dir()]

In [3]:
def load_data(path):
    df = pd.read_csv(path)
    print(path)
    df_columns = df.columns
    _temp = []
    for col in df_columns:
        col = col.replace('$','')
        col = col.replace('>','')
        col = col.replace('<','')
        _temp.append(col)
    df.columns = _temp
    defect_values = df[df.columns[-1]].unique()
    d = {}
    if 'F' in defect_values:
        d = {'T': True, 'F': False}
        df[df.columns[-1]] = df[df.columns[-1]].map(d)
    return df

def transform_data(source_df,target_df):
    mat_source_df = mat.double(source_df.values.T.tolist())
    mat_target_df = mat.double(target_df.values.T.tolist())
    X = eng.CTKCCA(mat_source_df,mat_target_df,nargout=4)
    train_X,train_y = np.array(X[0]),np.array(X[1]).tolist()[0]
    test_X,test_y = np.array(X[2]),np.array(X[3]).tolist()[0]
    return train_X,train_y,test_X,test_y

In [None]:
results_dist = {}
results_p = {}
results = []
for s_project in datasets:
    results_dist[s_project] = {}
    results_p[s_project] = {}
    for d_project in datasets:
        if s_project == d_project:
            continue
        source_df = load_data(s_project)
        target_df = load_data(d_project)
        if source_df.shape[1] != target_df.shape[1]:
            continue
        source_df = source_df[source_df.columns[:len(source_df.columns)-1]]
        target_df = target_df[target_df.columns[:len(target_df.columns)-1]]
        dist = pairwise_distances(source_df,target_df,metric='cosine')
        x = []
        for _dist in dist:
            x.append(np.median(_dist))
        x = np.median(x)
        results_dist[s_project][d_project] = x
df = pd.DataFrame.from_dict(results_dist,orient='index')

In [5]:
results_dist = pd.read_csv('results/MMD/MMD_100_full.csv',index_col=0)
df_MMD_updated = results_dist.abs()
df_MMD_updated.values[[np.arange(df_MMD_updated.shape[0])]*2] = 0

  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
from sklearn.cluster import DBSCAN
clustering = DBSCAN(eps=0.2, min_samples=5,metric='precomputed').fit(df_MMD_updated)
clustering.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0, -1,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0])

In [7]:
from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering(n_clusters=None,affinity='precomputed',
                                     distance_threshold=0.6,linkage='average').fit(df_MMD_updated)
clustering.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [8]:
from sklearn.cluster import SpectralClustering
clustering = SpectralClustering(n_clusters=4,affinity='precomputed').fit(df_MMD_updated)
clustering.labels_

array([3, 1, 1, 3, 0, 1, 1, 1, 1, 1, 1, 1, 3, 0, 1, 1, 3, 1, 1, 3, 3, 3,
       1, 1, 1, 2, 3, 0, 0, 0, 3, 1, 3, 3, 1, 3, 3, 0, 1, 0, 0, 1, 3, 1,
       1, 3, 0, 0, 1, 1, 3, 1, 3, 3, 0, 3, 3, 0, 1, 0, 3, 0, 1, 3, 1, 3,
       0, 3, 3, 3, 0, 0, 1, 0, 0, 1, 1, 3, 3, 1, 3, 3, 1, 1, 1, 1, 3, 0,
       0, 3, 1, 3, 1, 0, 3, 0, 3, 0, 1, 1, 1, 0, 3, 1, 1, 1, 1, 1, 1, 3,
       1, 0, 0, 1, 0, 3, 1, 1, 1, 1, 3], dtype=int32)

In [16]:
clusters = zip(df_MMD_updated.index.tolist(),clustering.labels_.tolist())

In [17]:
result = {}
for cluster in clusters:
    if cluster[1] not in result.keys():
        result[cluster[1]] = []
    result[cluster[1]].append(cluster[0])

In [18]:
result

{0: ['ActionBarSherlock',
  'Android-Orma',
  'DaggerMock',
  'Digital',
  'Discord4J',
  'Easer',
  'JSONassert',
  'MaterialScrollBar',
  'Much-Assembly-Required',
  'OpenRefine',
  'Sudachi',
  'android-test',
  'android-transcoder',
  'androidannotations',
  'app-icon',
  'arara',
  'archaius',
  'chunky',
  'dcevm',
  'digdag',
  'druid',
  'easygcm',
  'facebook-android-sdk',
  'fdb-record-layer',
  'freeline',
  'getdown',
  'guice',
  'intellij-elixir',
  'intellij-plugin-save-actions',
  'jBrowserDriver',
  'jboss-eap-quickstarts',
  'jedis',
  'metacat',
  'nakadi',
  'nokogiri',
  'nzbhydra2',
  'openrouteservice',
  'org.alloytools.alloy',
  'pacbot',
  'phpinspectionsea',
  'pojobuilder',
  'qpython',
  'react-native-background-job',
  'roboguice',
  's3proxy',
  'scoop',
  'sofa-tracer',
  'swipe-button',
  'tikxml'],
 1: ['Android-ObservableScrollView',
  'EclipseCodeFormatter',
  'Elephant',
  'FreeBuilder',
  'Hystrix',
  'RustDT',
  'SuperListview',
  'amidst',
  'and