In [1]:
import pandas as pd
import numpy as np

import os
import ntpath
import re
import time

from stat import *
from rdflib import *
from rdflib.namespace import *
from langdetect import detect
from datetime import datetime
from collections import Counter
from langdetect import DetectorFactory, detect  #to enforce consistent results


import networkx as nx
import matplotlib.pyplot as plt

In [2]:
#Namespace declaration
prefix = URIRef("http://www.csv2rdf.org/2020/dq#")
csvw = URIRef("http://www.w3.org/ns/csvw#")

In [3]:
#datatype RE
intType = re.compile(r"^\d+$")
dateType1 = re.compile(r"[0-9]{4}[-/][0-9]?[0-9]?[-/][0-9]?[0-9]?")
dateType2 = re.compile(r"[0-9]?[0-9]?[-/][0-9]?[0-9]?[-/][0-9]{4}")
stringType = re.compile("^[a-zA-Z]+.*\s*[a-zA-Z]*$")
floatType = re.compile(r"[-+]?[0-9]*\.?[0-9]*")
uriType = re.compile(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")

In [18]:
#Predcting column datatype based on highest occurance of value
def typeCheck(singleCol):
    ci=cs=co=cf=cd=cu=0
    singleCol.fillna("$#", inplace = True)    #replace all NA with special characters
    for i in range(len(singleCol)):
        if((uriType.match(str(singleCol[i])))):
            cu+=1
        elif(stringType.match(str(singleCol[i]))):
            cs+=1
        elif((intType.match(str(singleCol[i]) ))):
            ci+=1
        elif(dateType1.match(str(singleCol[i]) or dateType2.match(str(singleCol[i])))):
            cd+=1
        elif(floatType.match(str(singleCol[i])) and singleCol[i]!='$#' ):
            cf+=1
        else:
            co+=1
    daConsidered=['int','str','float','date','uri','other']
    overall=[ci,cs,cf,cd,cu,co]
    di=zip(daConsidered, overall)
    #actDatatype=max(di)[0]
    if cf > ci :             #column with float values, int gets assigned to ci, coverting it to cf
        cf = cf+ci
        ci=0
    return overall.index(max(overall))

In [5]:
#Detecting language in case the column is string
def detectLang(singleCol):
    DetectorFactory.seed = 0
    lang = []   
    for r in singleCol:
        lang.append(detect(r))
    c=Counter(lang)
    totalItems=len(lang)
    return(c.most_common(1)[0][0],(c.most_common(1)[0][1]/totalItems)*100)

In [6]:
#No typos in date
def is_valid_date(year, month, day):
    day_count_for_month = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    if year%4==0 and (year%100 != 0 or year%400==0):
        day_count_for_month[2] = 29
    return (1 <= month <= 12 and 1 <= day <= int(day_count_for_month[month]))

In [7]:
#Count all NA's
def countNA(singleCol):
    resultedCounter = Counter(singleCol)
    return (resultedCounter[' ']+singleCol.isna().sum())

In [8]:
#Detecting valid date 
def checkDate(singleCol):
    year = pd.DatetimeIndex(singleCol).year  
    month = pd.DatetimeIndex(singleCol).month
    day = pd.DatetimeIndex(singleCol).day
    validDate = []
    for i in range(len(year)):
        validDate.append(is_valid_date(year[i], month[i], day[i]))
    trueCount = sum(validDate)
    if(trueCount == len(year)):
        return True
    else:
        return False

In [9]:
#Calculating mean and std
def truncate(n, decimals=0):
    multiplier = 10 ** decimals
    return int(n * multiplier) / multiplier
def meanStd(columnValue):
    return truncate(np.mean(columnValue),2),truncate(np.std(columnValue),2)

In [122]:
#inclusion of blank node & typed literals
g = Graph()

flag=0
countlongURI = 0 #counting Long URI if any 
count=0   #count triples
completeness = 0  #population completeness in the file
completeFlag = 0

dataset = prefix+"dataset"
fileSize = prefix+"fileSize"
fileMod = prefix+"fileModified"
fformat = DC.format
hasValue = prefix+"hasValue"
hasDatatype = prefix+"hasDatatype"
hasColumn = prefix+"hasColumn"
uriLength = prefix+"urilength" #URI Length 

#change file name for all other input files
data = pd.read_csv(add file path) 

rowclass=prefix+"rows"
columnClass=prefix+"columns"

#declaring all rows as ROW class
for k in range((data.shape[0])):                 
    rowprefix=prefix+"R"+str(k)
    g.add((rowprefix, RDF.type, rowclass))
    count+=1
    
#adding column datatype & declaring all columns as column class
for i in range((data.shape[1])):
    flag=0
    columnName = prefix + data.columns[i]
    completeness = countNA(data.iloc[:,i])
    
    if(completeness!=0):
        pc = (1 - completeness / data.shape[0])
        g.add((columnName, prefix+"populationCompleteness", Literal(pc, datatype=XSD.decimal)))
        count+=1
        completeFlag = 1
    coldt = typeCheck(data.iloc[:, i])
    
    if(coldt==0):
        datatype = XSD.integer
        mean,std = meanStd(data.iloc[:, i])
        flag=1
    elif(coldt==1):
        datatype = XSD.string
        flag=2
    elif(coldt==2):
        datatype = XSD.decimal
    elif(coldt==3):
        datatype = XSD.dateTime
        flag=3
    elif(coldt==4):
        datatype=XSD.anyURI
    else:
        datatype = XSD.Literal

    g.add((columnName, hasDatatype, datatype))
    g.add((columnName, RDF.type, columnClass))
    count=count+2   
    
    if(flag==1):
        g.add((columnName, prefix+"mean",Literal(mean, datatype=XSD.decimal) ))
        g.add((columnName, prefix+"stdDev",Literal(std, datatype=XSD.decimal) ))
        count+=2
    if(flag==2 and completeFlag != 1):
        lang, percentage = detectLang(data.iloc[:,i])
        g.add((columnName, prefix+"language", Literal(lang, datatype=XSD.token)))
        count+=1
    if(flag==3):
        validDate = checkDate(data.iloc[:,i])
        if(validDate != True):
            g.add((columnName, prefix+"invalidDate", Literal(True, datatype=XSD.boolean)))
            count+=1

for i in range(len(data)):
    rowprefix=prefix+"R"+str(i)
    for k in range(len(data.columns)):
        value=data.iloc[i][k]
        columnName = prefix + data.columns[k]
        anode = BNode()
        if(len(rowprefix)>60):
            countlongURI = countlongURI+1
            g.add((rowprefix, uriLength, Literal(len(rowprefix), datatype=XSD.integer)))
            count=count+1
        if(len(columnName)>60):
            countlongURI = countlongURI+1
            g.add((columnName, uriLength, Literal(len(columnName), datatype=XSD.integer)))
            count=count+1
        g.add((rowprefix, columnName, anode))
        count=count+1
        if(value=="$#"):
            value=RDF.nil
            g.add((anode, hasValue, Literal(value)))
            count=count+1
        elif((intType.match(str(value)))):
            g.add((anode, hasValue, Literal(value, datatype=XSD.integer)))
            count=count+1
        elif((dateType1.match(str(value))) or (dateType2 .match(str(value)))):
            g.add((anode, hasValue, Literal(value, datatype=XSD.datetime)))
            count=count+1
        elif((uriType.match(str(value)))):
            g.add((anode, hasValue, Literal(value, datatype=XSD.anyURI)))
            count=count+1
        elif((stringType.match(str(value)))):
            g.add((anode, hasValue, Literal(value, datatype=XSD.string)))
            count=count+1
        elif((floatType.match(str(value)))):
            g.add((anode, hasValue, Literal(value, datatype=XSD.decimal)))
            count=count+1
        else:
            g.add((anode, hasValue, Literal(value, datatype=XSD.Literal)))
            count=count+1
          

#change file name 
try:
    st = os.stat("add file path")
except IOError:
    print("failed to get information about")
else: 
    g.add((dataset, fileSize, Literal(st[ST_SIZE])))
    g.add((dataset, fileMod, Literal(time.asctime(time.localtime(st[ST_MTIME])))))
    fform = "text/csv"
    g.add((dataset, DC[format], Literal('text/csv') ))
    count=count+3

        
#detection of long URI - RC1
if(countlongURI > 0):
    g.add((dataset, longURI, Literal(countlongURI, datatype=XSD.integer)))
    count=count+1
    
#change file name
g.serialize(addfilename.rdf, format="n3")
#o.serialize("onto.rdf", format="n3")
print(g.serialize(format='n3').decode("utf-8"))

275
13
9
@prefix ns1: <http://www.csv2rdf.org/2020/dq#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

ns1:Address1 a ns1:columns ;
    ns1:hasDatatype xsd:string ;
    ns1:language "en"^^xsd:token .

ns1:Address2 a ns1:columns ;
    ns1:hasDatatype xsd:string ;
    ns1:language "pl"^^xsd:token .

ns1:Address3 a ns1:columns ;
    ns1:hasDatatype xsd:string ;
    ns1:populationCompleteness 0.6153846153846154 .

ns1:Authority a ns1:columns ;
    ns1:hasDatatype xsd:string ;
    ns1:language "en"^^xsd:token .

ns1:BranchName a ns1:columns ;
    ns1:hasDatatype xsd:string ;
    ns1:language "en"^^xsd:token .

ns1:Email a ns1:columns ;
    ns1:hasDatatype xsd:string .

ns1:OBJECTID a ns1:columns ;
    ns1:hasDatatype xsd:integer ;
    ns1:mean 7.23 ;
    ns1:stdDev 4.06 .

ns1:R0 a ns1:rows ;
    ns1:Address1 [ ns1: