In [5]:
import csv
import json
import os
import math
import openpyxl
import re

import pandas as pd
import numpy as np
from collections import Counter
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist


In [6]:
def readCSVFile(filePath):
    try:
        with open(filePath, 'r', newline='') as csvfile:
            has_header = csv.Sniffer().has_header(csvfile.read(1024))
            csvfile.seek(0)  # Rewind.
            dialect = csv.Sniffer().sniff(csvfile.readline(), [',',';'])
            csvfile.seek(0) 
            reader = csv.reader(csvfile, dialect)
            if(has_header):
                next(reader)  # Skip header row.
            dataset = pd.DataFrame(reader)
        return dataset
        #print(filePath)
    except:
        print("Could not read CSV file",filePath)

In [7]:
def readExcel(filePath):
    dataset = pd.read_excel(filePath)
    return dataset

In [8]:
def custom_csv(fname):
    if fname.endswith((".data", ".csv")):
        return readCSVFile(fname)
    elif fname.endswith((".xlsx")):
        return readExcel(fname)

In [9]:
def getLabels(filePath):
    try:
        flag = 0
        dataset = custom_csv(filePath)
        n = dataset.iloc[:, -1].nunique(dropna=False)
        perc = dataset.iloc[:, -1].value_counts(normalize=True)*100
        if(len(perc) > len(dataset.iloc[:, 0].value_counts(normalize=True)*100)):  #checking whether 1st column is label
            n=dataset.iloc[:, 0].nunique(dropna=False)
            flag = 1
        if(flag == 1):
            return dataset.iloc[:, 0]
        else:
            return dataset.iloc[:,-1]
    except:
        print("Can not read last column items for", filePath)

In [10]:
def computeClassEntropy(filePath):
    dataset = custom_csv(filePath)
    classLabel = getLabels(filePath)
    entropy=0
    rows = readRows(filePath)
    uc = countUniqueLabels(filePath)
    values, counts = np.unique(classLabel, return_counts=True)
    for i in range(len(values)):
        p = counts[i] / rows
        entropy -= p * math.log(p,uc)
    return entropy

In [11]:
def readRows(filePath):
    try:
        dataset = custom_csv(filePath)
        n = len(dataset.axes[0])
        return n
    except:
        print("Can not read rows for",filePath)

In [12]:
def readColumns(filePath):
    try:
        dataset = custom_csv(filePath)
        n = len(dataset.axes[1])
        return n
    except:
        print("Can not read columns for",filePath)

In [13]:
def countUniqueLabels(filePath):
    try:
        dataset = custom_csv(filePath)
        n = dataset.iloc[:, -1].nunique(dropna=False)
        perc = dataset.iloc[:, -1].value_counts(normalize=True)*100
        if(len(perc) > len(dataset.iloc[:, 0].value_counts(normalize=True)*100)):  #checking whether 1st column is label
            n=dataset.iloc[:, 0].nunique(dropna=False)
        return n
    except:
        print("Can not read unique items for", filePath)

In [36]:
def computeCorrelation(filePath):
    sp=p=sn=n=0
    try:
        dataset = custom_csv(filePath)
        rows, cols = dataset.shape
        corr1 = dataset.corr() #Compute pairwise correlation of columns, excluding NA/null values.

        c1 = corr1.unstack()

        for i in c1:
            if(i==1):
                sp+=1
            elif(i==-1):
                sn+=1
            elif(i>0):
                p+=1
            elif(i<=0):
                n+=1
        corrDict = {}
        sp=sp/(cols*(cols-1))
        corrDict['spCorr'] = sp 
        p=p/(cols*(cols-1))
        corrDict['pCorr'] = p
        sn=sn/(cols*(cols-1))
        corrDict['snCorr'] = sn 
        n=n/(cols*(cols-1))
        corrDict['nCorr'] = n
        
        return corrDict

    except:
        print("Can not compute correlation for", filePath)

In [15]:
def computeClassOverlap(filePath):
    m = 0
    s = 0 
    count = 0
    outlier = 0
    flag = 0
    dataset = custom_csv(filePath)
    print("Coming here")
    km = KMeans(n_clusters = countUniqueLabels(filePath))
    clusters = km.fit_predict(dataset)
    # points array will be used to reach the index easy
    points = np.empty((0,len(dataset.axes[1])), float)
    # distances will be used to calculseetate outliers
    distances = np.empty((0,len(dataset.axes[0])), float)   
        # getting points and distances
    centroids = km.cluster_centers_
    for i, center_elem in enumerate(centroids):
            # cdist is used to calculate the distance between center and other points
        distances = np.append(distances, cdist([center_elem],dataset[clusters == i], 'euclidean')) 
        points = np.append(points, dataset[clusters == i], axis=0)
        
    cluster_distance_d = {'cluster':clusters, 'distance':distances}
    cluster_distance = pd.DataFrame(cluster_distance_d)

    grouped = cluster_distance.groupby(['cluster'], as_index = False)
    cluster_statistics = grouped[['distance']].agg([np.mean, np.std]) 
    
    for i in range(len(cluster_distance)):#
        for j in range(len(cluster_statistics)):
            if(cluster_statistics.index[j]==cluster_distance.iloc[i,0]):
                m = cluster_statistics.iloc[j,0]
                s =cluster_statistics.iloc[j,1]
                flag=1
                break
            if(flag==1):
                if(cluster_distance.iloc[i,1] > (m + 3 * s)):
                    outlier+=1
                    for k in range(len(cluster_statistics)):
                        if(cluster_statistics.index[k]!=cluster_distance.iloc[i,0]):
                            dist = cdist([points[i]], [centroids[k]], 'euclidean')
                            m1 = cluster_statistics.iloc[k,0]
                            s1 = cluster_statistics.iloc[k,1]
                            if(dist <= (m1 + 3 * s1)):
                                count+=1
        
    #print(count)
    #print(outlier)
    return [count/(dataset.shape[0] * dataset.shape[1]), outlier/(dataset.shape[0] * dataset.shape[1])]

In [16]:
def completeness(filePath):
    dataset = custom_csv(filePath)
    totalMissing = dataset.isnull().sum().sum()
    return (totalMissing/(len(dataset.axes[1]) * len(dataset.axes[0])))

In [17]:
def classimbalanceRatio(filePath):
    dataset = custom_csv(filePath)
    totalClasses = countUniqueLabels(filePath)
    perc = dataset.iloc[:, -1].value_counts(normalize=True)*100
    if(len(perc) > len(dataset.iloc[:, 0].value_counts(normalize=True)*100)):
        perc=dataset.iloc[:, 0].value_counts(normalize=True)*100
    count = 0
    for idx, item in enumerate(perc):
        for j in perc[idx+1:]:
            if(abs(item-j) > 30):
                count+=abs(item-j)
    #print("count",(count))
    return (count/(dataset.shape[0]*dataset.shape[1]))

In [18]:
def conciseness(filePath):
    dataset = custom_csv(filePath)
    uniques = dataset.drop_duplicates(keep='first')
    return (1 - (uniques.shape[0] * uniques.shape[1]) /(dataset.shape[0] * dataset.shape[1]))

In [19]:
def typeCheck(singleCol):
    ci=cs=co=cf=cd=cu=0
    intType = re.compile(r"^\d+$")
    dateType1 = re.compile(r"[0-9]{4}[-/][0-9]?[0-9]?[-/][0-9]?[0-9]?")
    dateType2 = re.compile(r"[0-9]?[0-9]?[-/][0-9]?[0-9]?[-/][0-9]{4}")
    stringType = re.compile("^[a-zA-Z]+.*\s*[a-zA-Z]*$")
    floatType = re.compile(r"[-+]?[0-9]*\.?[0-9]*")
    uriType = re.compile(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")

    for i in range(len(singleCol)):
        if((uriType.match(str(singleCol[i])))):
            cu+=1
        elif(stringType.match(str(singleCol[i]))):
            cs+=1
        elif((intType.match(str(singleCol[i]) ))):
            ci+=1
        elif(dateType1.match(str(singleCol[i]) or dateType2.match(str(singleCol[i])))):
            cd+=1
        elif(floatType.match(str(singleCol[i]))):
            cf+=1
        else:
            co+=1
    daConsidered=['int','str','float','date','uri','other']
    #overall=[ci,cs,cf,cd,cu,co]
    if(cf > ci):             #column with float values, int gets assigned to ci, coverting it to cf
        cf = cf+ci
        ci=0
    #return overall.index(max(overall))
    overall=[ci,cs,cf,cd,cu,co]

    return max(overall)

In [20]:
def syntaxAccuracy(filePath):
    dataset = custom_csv(filePath)
    count = 0
    invalid = 0
    for i in range((dataset.shape[1])):
        flag=0
        if(dataset.iloc[:, i].dtype == "object"):
            count = typeCheck(dataset.iloc[:, i])
            if(count != dataset.shape[0]):
                invalid+=1
    return (invalid/dataset.shape[1])

In [37]:
listofFiles={}
for path, subdirs, files in os.walk(os.getcwd()+'/datasets/numeric datasets'):
    for name in files:
        if name.endswith((".data", ".csv", ".xlsx")):
            listofFiles[name]=os.path.join(path, name)
       # elif name.endswith((".xls", ".xlsx")):
        #    listofExcelFiles[name]=os.path.join(path, name)

#for key in listofCSVFiles:
 #   readCSVFile(listofCSVFiles[key])

#for key in listofExcelFiles:
 #   readExcel(listofExcelFiles[key])

corrDict = {}
dataCharQuality = {}
count = []
for eachFile in listofFiles:
    dataCharQuality[eachFile] = {}
    print(eachFile)
    dataCharQuality[eachFile]['instances'] = readRows(listofFiles[eachFile])
    dataCharQuality[eachFile]['attributes'] = readColumns(listofFiles[eachFile])
    dataCharQuality[eachFile]['uniqueClasses'] = countUniqueLabels(listofFiles[eachFile])
    dataCharQuality[eachFile]['entropy'] = computeClassEntropy(listofFiles[eachFile])
    corrDict = computeCorrelation(listofFiles[eachFile])
    #print(dataCharacteristics[eachFile]['entropy'])
   # entropyDataframe = groupByColumnEntropy(listofDataFiles[eachFile])
    if(corrDict):
        dataCharQuality[eachFile].update(corrDict)

    count = computeClassOverlap(listofFiles[eachFile])
    dataCharQuality[eachFile]['classOverlap'] =  count[0]
    dataCharQuality[eachFile]['outlierDetection'] = count[1]
    dataCharQuality[eachFile]['completeness'] = completeness(listofFiles[eachFile])
    dataCharQuality[eachFile]['imbalanceRatio'] = classimbalanceRatio(listofFiles[eachFile])
    dataCharQuality[eachFile]['conciseness'] = conciseness(listofFiles[eachFile])
    dataCharQuality[eachFile]['syntaxAccuracy'] = syntaxAccuracy(listofFiles[eachFile])


data_banknote_authentication.csv
Can not compute correlation for /home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/bank note authentication/data_banknote_authentication.csv
Coming here
wine.data
Can not compute correlation for /home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/wine/wine.data
Coming here
heart_failure_clinical_records_dataset.csv
Can not compute correlation for /home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/heart failure/heart_failure_clinical_records_dataset.csv
Coming here
divorce.xlsx
Coming here


In [25]:
print((listofFiles))

{'data_banknote_authentication.csv': '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/bank note authentication/data_banknote_authentication.csv', 'wine.data': '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/wine/wine.data', 'heart_failure_clinical_records_dataset.csv': '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/heart failure/heart_failure_clinical_records_dataset.csv', 'divorce.xlsx': '/home/d19125691/Documents/Experiments/ontologyDCQ/onto-DCQ-FS/datasets/numeric datasets/divorce/divorce.xlsx'}


In [30]:
with open("dataquality.json","w") as f:
    json.dump(dataCharQuality,f)

In [7]:
#Not used this
def kmeans(X,k,max_iterations=100):
    '''
    X: multidimensional data
    k: number of clusters
    max_iterations: number of repetitions before clusters are established
    
    Steps:
    1. Convert data to numpy aray
    2. Pick indices of k random point without replacement
    3. Find class (P) of each data point using euclidean distance
    4. Stop when max_iteration are reached of P matrix doesn't change
    
    Return:
    np.array: containg class of each data point
    '''
    if isinstance(X, pd.DataFrame):X = X.values
    idx = np.random.choice(len(X), k, replace=False)
    centroids = X[idx, :]
    P = np.argmin(distance.cdist(X, centroids, 'euclidean'),axis=1)
    for _ in range(max_iterations):
        centroids = np.vstack([X[P==i,:].mean(axis=0) for i in range(k)])
        tmp = np.argmin(distance.cdist(X, centroids, 'euclidean'),axis=1)
        if np.array_equal(P,tmp):break
        P = tmp
    return P