In [1]:
import pandas as pd
import numpy as np

import os
import ntpath
import re
import time

from stat import *
from rdflib import *
from rdflib.namespace import *
from langdetect import detect
from datetime import datetime
from collections import Counter
from langdetect import DetectorFactory, detect  #to enforce consistent results

import csv
import re
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
# Matches any single character that is not an ASCII letter or digit;
# pattern.sub('', name) therefore keeps only the alphanumeric characters.
pattern = re.compile(r'[^a-zA-Z0-9]')

In [3]:
# Regular expressions used to guess the datatype of a single cell value.
# All of them are applied with .match(), i.e. anchored at the start of the text.

# Unsigned whole numbers only, e.g. "42".
intType = re.compile(r"^\d+$")
# Year-first dates such as "2021-03-15" or "2021/3/15" (prefix match, not validated).
dateType1 = re.compile(r"[0-9]{4}[-/][0-9]?[0-9]?[-/][0-9]?[0-9]?")
# Year-last dates such as "15-03-2021" or "3/15/2021" (prefix match, not validated).
dateType2 = re.compile(r"[0-9]?[0-9]?[-/][0-9]?[0-9]?[-/][0-9]{4}")
# Free text: anything that starts with an ASCII letter.
# Raw string so that "\s" reaches the regex engine instead of being treated
# as an (invalid) Python string escape.
stringType = re.compile(r"^[a-zA-Z]+.*\s*[a-zA-Z]*$")
# Decimal numbers with optional sign, fraction and exponent ("3.14", "-7", ".5", "1e-05").
# At least one digit is required and the whole text must match; the previous
# pattern made every part optional and therefore matched *any* string.
floatType = re.compile(r"^[-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?$")
# URLs: http(s)://..., www...., or bare "domain.tld/..." forms.
uriType = re.compile(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")

In [4]:
#Predicting the column datatype from the most frequently occurring value type
def typeCheck(singleCol):
    """Predict a column's datatype from the most common type among its values.

    Every value is converted to text and classified by the first matching
    module-level pattern, in this order: URI, string, integer, date, float.
    Missing values are represented by the sentinel "$#" and counted as
    'other'. The caller's column is not modified.

    Parameters
    ----------
    singleCol : pandas.Series (or any iterable of scalar cell values)

    Returns
    -------
    int
        Index into ['int', 'str', 'float', 'date', 'uri', 'other'] of the
        category with the highest count (the first one wins on a tie; an
        empty column therefore yields 0).
    """
    ci = cs = co = cf = cd = cu = 0
    # Iterate positionally so a non-default index cannot break the lookup,
    # and substitute the sentinel locally instead of fillna(inplace=True).
    for value in singleCol:
        text = "$#" if pd.isna(value) else str(value)
        if uriType.match(text):
            cu += 1
        elif stringType.match(text):
            cs += 1
        elif intType.match(text):
            ci += 1
        # Both date layouts are tried; previously a misplaced parenthesis
        # meant dateType2 was never consulted.
        elif dateType1.match(text) or dateType2.match(text):
            cd += 1
        elif floatType.match(text) and text != "$#":
            cf += 1
        else:
            co += 1
    # In a predominantly float column, whole-number values were tallied as
    # int; fold them into the float count. This must happen before the
    # counts are collected, otherwise it has no effect on the result.
    if cf > ci:
        cf = cf + ci
        ci = 0
    # Order corresponds to ['int', 'str', 'float', 'date', 'uri', 'other'].
    overall = [ci, cs, cf, cd, cu, co]
    return overall.index(max(overall))

In [5]:
#Detecting language in case the column is string
def detectLang(singleCol):
    """Return the dominant language of a text column and its share in percent.

    Each entry of ``singleCol`` is passed to ``detect``; the result is a
    tuple ``(language_code, percentage)`` where ``percentage`` is the share
    of entries (0-100) assigned to the most frequent language.
    """
    # Fixed seed so repeated runs give the same detection results.
    DetectorFactory.seed = 0
    detected = [detect(text) for text in singleCol]
    tally = Counter(detected)
    top_language, top_count = tally.most_common(1)[0]
    share = (top_count / len(detected)) * 100
    return (top_language, share)

In [6]:
#Check that a (year, month, day) triple is a real calendar date
def is_valid_date(year, month, day):
    """Tell whether ``year``/``month``/``day`` form a real calendar date.

    February has 29 days in leap years (divisible by 4, except centuries
    that are not divisible by 400). The result is falsy when the month is
    outside 1..12 or the day is outside the month's length.
    """
    is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    # Position 0 is January, position 11 is December.
    month_lengths = (31, 29 if is_leap else 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
    month_in_range = 1 <= month <= 12
    # The lookup only happens once the month is known to be in range.
    return month_in_range and 1 <= day <= int(month_lengths[month - 1])

In [7]:
#Count all NA's
def countNA(singleCol):
    """Count the cells of ``singleCol`` that are missing or a single space.

    A cell counts when pandas reports it as NA (``isna``) or when it is
    exactly the one-character string ``' '``.
    """
    blank_cells = Counter(singleCol)[' ']
    missing_cells = singleCol.isna().sum()
    return (blank_cells + missing_cells)

In [8]:
#Detecting valid date 
def checkDate(singleCol):
    """Return True when every entry of ``singleCol`` is a valid date.

    The values are parsed once with ``pd.DatetimeIndex``. An entry that
    cannot be parsed (for example "2021-02-30") makes the parser raise
    ``ValueError``; that is reported as ``False`` instead of propagating,
    since detecting such entries is the purpose of this check. Missing
    entries parse to NaT and also yield ``False``. An empty column yields
    ``True``.

    Parameters
    ----------
    singleCol : pandas.Series or other sequence of date-like values

    Returns
    -------
    bool
    """
    try:
        parsed = pd.DatetimeIndex(singleCol)
    except ValueError:
        # Unparseable or impossible date somewhere in the column.
        return False
    # Every successfully parsed timestamp is a real calendar date by
    # construction, so only NaT entries can still make the column invalid.
    return not parsed.isna().any()

In [9]:
#Calculating mean and std
def truncate(n, decimals=0):
    """Cut ``n`` off after ``decimals`` decimal places without rounding.

    The value is scaled by ``10 ** decimals``, its fractional part is
    dropped with ``int`` (towards zero), and the result is scaled back.
    """
    scale = 10 ** decimals
    whole_part = int(n * scale)
    return whole_part / scale
def meanStd(columnValue):
    """Return ``(mean, std)`` of ``columnValue``, each truncated to 2 decimals."""
    mean_value = np.mean(columnValue)
    std_value = np.std(columnValue)
    return truncate(mean_value, 2), truncate(std_value, 2)

In [10]:
# Namespace for terms under http://www.w3.org/ns/csvw# (used below as
# CSVW.Column, CSVW.Row, CSVW.Cell, CSVW.datatype, CSVW.rownum).
CSVW = Namespace("http://www.w3.org/ns/csvw#")
# Namespace for the csvtordf terms used below (hasValue, contains, mean,
# stdDev, language, populationCompleteness, invalidDate).
CSVTORDF = Namespace("https://purl.archive.org/domain/csvtordf#")

# Graph that collects every triple added by the following cells.
g = Graph()

# Load the csvtordf.owl file into the graph before any data triples are added.
# NOTE(review): absolute, machine-specific path - the notebook will not run
# elsewhere without editing it.
g.parse("/home/d19125691/Experiments/Experiments/csv2rdf/csv2rdfworking/csvtordf.owl")
# Register the prefixes "csvw" and "csvtordf" for the two namespaces above.
g.bind("csvw", CSVW)
g.bind("csvtordf", CSVTORDF)

In [11]:
def identify_header(path, n=5, th=0.9):
    """Guess whether the CSV file at ``path`` starts with a header row.

    The first ``n`` rows are read twice, once letting pandas infer a header
    and once with no header. If the fraction of columns whose dtype is the
    same in both readings is below ``th``, the first row is taken to be a
    header.

    Returns 1 when a header is detected, 0 otherwise.
    """
    inferred_types = pd.read_csv(path, header='infer', nrows=n).dtypes.values
    headerless_types = pd.read_csv(path, header=None, nrows=n).dtypes.values
    agreement = (inferred_types == headerless_types).mean()
    if agreement < th:
        return 1
    return 0


def getSchema(csv_file):
    """Return ``(nocolumnheader, columnNames)`` for the CSV file ``csv_file``.

    ``nocolumnheader`` is 1 when ``identify_header`` reports a header row and
    0 otherwise (note that, despite its name, 1 means a header IS present).
    ``columnNames`` holds the header names when there is a header; otherwise
    it is a list of generated names "Attr1", "Attr2", ... - one per column.
    """
    if identify_header(csv_file):
        nocolumnheader = 1
        # nrows=0 parses only the header line; the data is not needed here.
        columnNames = pd.read_csv(csv_file, nrows=0).columns
    else:
        # Previously left unassigned on this path, which raised
        # UnboundLocalError at the return statement for header-less files.
        nocolumnheader = 0
        # One row is enough to learn the number of columns.
        totalcolumns = pd.read_csv(csv_file, header=None, nrows=1).shape[1]
        columnNames = ["Attr" + str(i + 1) for i in range(totalcolumns)]

    return nocolumnheader, columnNames

In [12]:
# CSV file to convert.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
csv_file = "/home/d19125691/Experiments/Experiments/csv2rdf/csv2rdfworking/datasets/agri.csv"

# Despite its name, nocolumnheader is truthy when the file HAS a header row
# (it is used below to decide whether the first CSV line must be skipped).
nocolumnheader = 0
nocolumnheader, columnNames = getSchema(csv_file)

# Read the frame with the same header decision that is applied to `rows`
# below, so that a header-less file does not lose its first data row.
df = pd.read_csv(csv_file, header=0 if nocolumnheader else None)

# Keep only ASCII letters and digits in the column names; they are later
# embedded in URIs.
cleanedNames = [pattern.sub('', cN) for cN in columnNames]
columnNames = cleanedNames

# newline='' lets the csv module handle line endings itself, which is needed
# for quoted fields that contain line breaks.
with open(csv_file, mode='r', newline='') as file:
    reader = csv.reader(file)
    if nocolumnheader:
        header = next(reader)  # skip the header line
    rows = [row for row in reader]  # Remaining rows are data

file_name = os.path.basename(csv_file)

# File name without its extension (previously only the first character of
# the name was taken).
fileName = os.path.splitext(file_name)[0]

In [13]:
# Emit one csvw:Column description per column of `df`: predicted datatype,
# population completeness (only when cells are missing), mean/stdDev for
# integer columns, language for complete string columns, and an invalidDate
# marker for date columns that fail validation.
count = 0          # running tally of triples, as counted by this notebook
flag = 0           # per-column marker: 1 = integer, 2 = string, 3 = date
countlongURI = 0   # counting long URIs if any (not updated in this cell)
completeness = 0   # number of missing cells in the current column
completeFlag = 0   # 1 when the current column has missing cells

for i in range(df.shape[1]):
    flag = 0
    # Reset per column; otherwise a single incomplete column would disable
    # language detection for every column that follows it.
    completeFlag = 0
    column_uri = CSVW.Column+"#"+columnNames[i]
    completeness = countNA(df.iloc[:, i])

    if completeness != 0:
        # Fraction of populated cells, rounded to two decimals.
        pc = round((1 - completeness / df.shape[0]), 2)
        g.add((column_uri, CSVTORDF.populationCompleteness, Literal(pc, datatype=XSD.float)))
        count += 1
        completeFlag = 1
    # typeCheck returns an index into ['int','str','float','date','uri','other'].
    coldt = typeCheck(df.iloc[:, i])

    if coldt == 0:
        datatype = XSD.integer
        mean, std = meanStd(df.iloc[:, i])
        flag = 1
    elif coldt == 1:
        datatype = XSD.string
        flag = 2
    elif coldt == 2:
        datatype = XSD.decimal
    elif coldt == 3:
        datatype = XSD.dateTime
        flag = 3
    elif coldt == 4:
        datatype = XSD.anyURI
    else:
        datatype = XSD.string

    g.add((column_uri, CSVW.datatype, datatype))
    g.add((column_uri, RDF.type, CSVW.Column))
    count = count + 2

    if flag == 1:
        g.add((column_uri, CSVTORDF.mean, Literal(mean, datatype=XSD.float)))
        g.add((column_uri, CSVTORDF.stdDev, Literal(std, datatype=XSD.float)))
        count += 2
    # Language detection is skipped for columns with missing cells.
    if flag == 2 and completeFlag != 1:
        lang, percentage = detectLang(df.iloc[:, i])
        g.add((column_uri, CSVTORDF.language, Literal(lang)))
        count += 1
    if flag == 3:
        validDate = checkDate(df.iloc[:, i])
        if not validDate:
            g.add((column_uri, CSVTORDF.invalidDate, Literal(True, datatype=XSD.boolean)))
            count += 1

In [14]:
# Emit one csvw:Row per data row and one csvw:Cell per field. Each non-empty
# cell gets a csvtordf:hasValue literal whose datatype is guessed from the
# text by the first matching pattern: integer, date, URI, string, float.
for rownum, row in enumerate(rows, start=1):
    row_uri = CSVW.Row+f"={rownum}"
    g.add((row_uri, RDF.type, CSVW.Row))
    g.add((row_uri, CSVW.rownum, Literal(rownum, datatype=XSD.int)))
    for colnum, value in enumerate(row, start=1):
        cell_uri = CSVW.Cell+f"={rownum}-"+columnNames[colnum-1]
        g.add((cell_uri, RDF.type, CSVW.Cell))
        g.add((row_uri, CSVTORDF.contains, cell_uri))
        text = str(value)
        if value == "":
            cell_datatype = None  # empty cell: no hasValue triple is added
        elif intType.match(text):
            cell_datatype = XSD.integer
        elif dateType1.match(text) or dateType2.match(text):
            # XSD.dateTime (capital T) is the datatype also used for date
            # columns; "XSD.datetime" is not an XML Schema datatype.
            cell_datatype = XSD.dateTime
        elif uriType.match(text):
            cell_datatype = XSD.anyURI
        elif stringType.match(text):
            cell_datatype = XSD.string
        elif floatType.match(text):
            cell_datatype = XSD.decimal
        else:
            cell_datatype = XSD.string
        if cell_datatype is not None:
            g.add((cell_uri, CSVTORDF.hasValue, Literal(value, datatype=cell_datatype)))
        # The tally is incremented once per cell, including empty ones.
        count = count + 1

In [15]:
#print(g.serialize(format='n3').decode("utf-8"))

In [16]:
# Write the whole graph as RDF/XML to "agri.rdf" in the working directory.
g.serialize(destination="agri.rdf", format="xml")

In [16]:
# Show the graph in N3 notation. Older rdflib releases return bytes from
# serialize() while rdflib >= 6 returns str, so decode only when needed.
n3_output = g.serialize(format='n3')
print(n3_output.decode("utf-8") if isinstance(n3_output, bytes) else n3_output)

@prefix : <https://purl.archive.org/domain/csvtordf#> .
@prefix csvtordf: <https://purl.archive.org/domain/csvtordf#> .
@prefix csvw: <http://www.w3.org/ns/csvw#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://www.w3.org/ns/csvw#Column#Location_Comment> a csvw:Column ;
    csvw:datatype xsd:string ;
    csvtordf:populationCompleteness 0.17 .

<http://www.w3.org/ns/csvw#Column#Name> a csvw:Column ;
    csvw:datatype xsd:string ;
    csvtordf:language "en" .

<http://www.w3.org/ns/csvw#Column#OBJECTID> a csvw:Column ;
    csvw:datatype xsd:integer ;
    csvtordf:mean 9.5 ;
    csvtordf:stdDev 5.18 .

<http://www.w3.org/ns/csvw#Column#Streetview_Link> a csvw:Column ;
    csvw:datatype xsd:anyURI .

<http://www.w3.org/ns/csvw#Column#Type> a csvw:Column ;
    c