### **Import Dataset**

In [0]:
! mkdir datasets
! wget https://uncloud.univ-nantes.fr/index.php/s/r6W7oixMM48P59k/download
! mv download datasets/catalogue.tsv

--2019-12-06 21:02:50--  https://uncloud.univ-nantes.fr/index.php/s/r6W7oixMM48P59k/download
Resolving uncloud.univ-nantes.fr (uncloud.univ-nantes.fr)... 193.52.104.60, 2001:660:7220:386:193:52:104:60
Connecting to uncloud.univ-nantes.fr (uncloud.univ-nantes.fr)|193.52.104.60|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 72971322 (70M) [application/octet-stream]
Saving to: ‘download’


2019-12-06 21:02:55 (19.5 MB/s) - ‘download’ saved [72971322/72971322]



### **Explore Dataset**

In [0]:
import pandas as pd
import numpy as np

In [0]:
list_parser = lambda x: x[1:-1].split(',')
df = pd.read_csv("datasets/catalogue.tsv",
                        sep="\t",
                        converters={'keywords': list_parser,
                                    "concepts": list_parser})
# This is added in case initial dataset hasn't the right columns names:
df.columns = ['id', 'title', 'language', 'type', 'keywords', 'concepts']
df.head(5)

Unnamed: 0,id,title,language,type,keywords,concepts
0,1228,"Data, information, design and traffic injuries...",en,mp4,"[design, know, information design, people, ...",['http://en.wikipedia.org/wiki/Information_des...
1,4796,Uncertain Allies,en,pdf,"[north korea, korea, china, north, pyongya...","['http://en.wikipedia.org/wiki/North_Korea', ..."
2,6930,Classification of Web Documents Using a Graph-...,en,mp4,"[subgraph, graph, document, contrast, clas...","['http://en.wikipedia.org/wiki/Hello', 'http:..."
3,7867,Cell and Molecular Neurobiology,en,pdf,"[edition, academic press, molecular, 2nd ed...",['http://en.wikipedia.org/wiki/Massachusetts_I...
4,8160,Advanced Fluid Dynamics of the Environment,en,pdf,"[fluid, eddy viscosity, result velocity, fl...","['http://en.wikipedia.org/wiki/Homework', 'ht..."


In [0]:
def MeanAndMedianArrayLength(df, colname):
    mean = 0
    median_tab = []
    for i in range(0, len(df)):
        add = len(df.loc[i,colname])
        mean += add
        median_tab.append(add)
    mean = mean / len(df)
    print("mean of " + colname + " numbers : " + str(mean))
    median_tab = np.sort(np.array(median_tab))
    middle = int(len(median_tab)/2)
    if len(median_tab) % 2 == 0:
        print("median of " + colname + " numbers : " + str((median_tab[middle-1] + median_tab[middle]) / 2) + "\n")
    else:
        print("median of " + colname + " numbers : " + str(median_tab[middle]) + "\n")

# mean concept and keywords number per document
MeanAndMedianArrayLength(df, "concepts")
MeanAndMedianArrayLength(df, "keywords")

mean of concepts numbers : 10.03375
median of concepts numbers : 10.0

mean of keywords numbers : 19.80065
median of keywords numbers : 20.0



In [0]:
# Eliminate languages different than "en"
non_english = df[df.language != "en"].index.tolist()
columns = df.columns
new_df = []
for i in range(0, len(df)):
  if i not in non_english:
    new_df.append(df.loc[i].tolist())
df = pd.DataFrame(new_df, columns = columns) 

In [0]:
df.head(-5)

Unnamed: 0,id,title,language,type,keywords,concepts
0,1228,"Data, information, design and traffic injuries...",en,mp4,"[design, know, information design, people, ...",['http://en.wikipedia.org/wiki/Information_des...
1,4796,Uncertain Allies,en,pdf,"[north korea, korea, china, north, pyongya...","['http://en.wikipedia.org/wiki/North_Korea', ..."
2,6930,Classification of Web Documents Using a Graph-...,en,mp4,"[subgraph, graph, document, contrast, clas...","['http://en.wikipedia.org/wiki/Hello', 'http:..."
3,7867,Cell and Molecular Neurobiology,en,pdf,"[edition, academic press, molecular, 2nd ed...",['http://en.wikipedia.org/wiki/Massachusetts_I...
4,8160,Advanced Fluid Dynamics of the Environment,en,pdf,"[fluid, eddy viscosity, result velocity, fl...","['http://en.wikipedia.org/wiki/Homework', 'ht..."
...,...,...,...,...,...,...
95191,102981,2.5 Motion Equations for Constant Acceleration...,en,html,"[size, rsub size, rsub, size rsub, rsup, ...","['http://en.wikipedia.org/wiki/Bracket', 'htt..."
95192,103078,Test Prep for AP® Courses,en,html,"[adhesion, cohesion, water, water molecule,...","['http://en.wikipedia.org/wiki/Canning', 'htt..."
95193,110586,Practice Test,en,html,"[near tenth, tenth, inch, near, foot, ang...",['http://en.wikipedia.org/wiki/Practice_of_law...
95194,113136,Working the Net,en,html,"[manufacturers, search engine, supplier, se...","['http://en.wikipedia.org/wiki/Use', 'http://..."


In [0]:
df['title'] = df['title'].apply(lambda x : x.replace("\n", " "))
df['title'] = df['title'].apply(lambda x : x.replace("\"", "\\n"))

In [0]:
df[df.id == 1]['title'].iloc[0]

'Slovenian Cinematheque, Vinegar Syndrome (On Problems of Film Tape Preservation) '

### **How to request the fuseki's API**

First install the module SPARQLWrapper with anaconda's terminal :
- *Choose the right environnement*
- *Write this piece of code : 'pip install SPARQLWrapper'*
- *Documentation : https://rdflib.github.io/sparqlwrapper/*

Next you need to deploy the Fuseki's server :
- *Go to "/fuseki_server/" folder*
- *Launch the "/fuseki-server.bat*

In [0]:
!pip install SPARQLWrapper



In [0]:
from SPARQLWrapper import SPARQLWrapper, JSON

server_URL = "http://185.157.246.81:3030/x5gon/"

# function which create

sparql = SPARQLWrapper(server_URL)

In [0]:

string_constructor = "PREFIX dcterms: <http://purl.org/dc/terms/>\nINSERT DATA{"
for i in range(0, len(df)):

  # id
  string_constructor += "\n   <http://x5gon/bjk/{}> dcterms:identifier {} ;".format(df.loc[i, "id"],df.loc[i, "id"])
  # type
  string_constructor += "\n       dcterms:format {} ;".format('"' + df.loc[i, "type"] + '"')
  # titre
  string_constructor += "\n       dcterms:title {} ;".format('"' + df.loc[i, "title"] + '"')
  # keywords
  keywords = df.loc[i, "keywords"]
  for j in range(0, len(keywords)):
    if(j > 0):
      string_constructor += "\n       dcterms:subject {} ;".format('"' + keywords[j][1:] + '"')
    else:
      string_constructor += "\n       dcterms:subject {} ;".format('"' + keywords[j] + '"')
  # concept
  concepts = df.loc[i, "concepts"]
  for j in range(0, len(concepts)):
    URI = concepts[j].split("/")
    URI = URI[len(URI)-1]
    URI = URI[0:len(URI)-1]
    string_constructor += "\n       dcterms:concept {} ;".format('<' + "http://dbpedia.org/resource/" + URI + '>')
  # language
  string_constructor += "\n       dcterms:language {} .".format('"' + df.loc[i, "language"] + '"')
  if(i%200 == 0):
    string_constructor += "\n}"
    #print(string_constructor)
    sparql.setQuery(string_constructor)
    sparql.setReturnFormat(JSON)

    results = sparql.query().convert()
    string_constructor = "PREFIX dcterms: <http://purl.org/dc/terms/>\nINSERT DATA{"
  



In [0]:
print(string_constructor[:10000])

PREFIX dcterms: <http://purl.org/dc/terms/>
INSERT DATA{


In [0]:
sparql.setQuery(string_constructor)
sparql.setReturnFormat(JSON)

results = sparql.query().convert()

for result in results["results"]["bindings"]:
    print(result["label"]["value"])



QueryBadFormed: ignored