### 

In [1]:
import pandas as pd
from sklearn.model_selection import KFold
from rdflib import Graph

In [2]:
path = '../dataset/'
#path = '../'

In [3]:
g = Graph()
g.parse(path + "TKG_1.ttl", format="ttl")

<Graph identifier=Nc8873d369fac4b099337170ea8469ee3 (<class 'rdflib.graph.Graph'>)>

In [4]:
from rdflib.plugins.sparql.processor import SPARQLResult

def sparql_results_to_df(results: SPARQLResult) -> pd.DataFrame:
    """
    Export results from an rdflib SPARQL query into a `pandas.DataFrame`,
    using Python types. See https://github.com/RDFLib/rdflib/issues/1179.
    """
    return pd.DataFrame(
        data=([None if x is None else x.toPython() for x in row] for row in results),
        columns=[str(x) for x in results.vars],
    )

In [5]:
query = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX ex: <http://example/#> 
PREFIX treatment_drug: <http://example/Treatment_Drug#>
    
select distinct ?treatment
where {
    #?treatment rdf:type ex:Treatment .
    ?treatment ex:belong_to ex:effective .
    }
    """

qres = g.query(query)
df_effective = sparql_results_to_df(qres)
df_effective['predicate'] = 'ex:belong_to'
df_effective['object'] = 'ex:effective .'

In [6]:
df_effective

Unnamed: 0,treatment,predicate,object
0,http://example/Treatment/treatment399,ex:belong_to,ex:effective .
1,http://example/Treatment/treatment400,ex:belong_to,ex:effective .
2,http://example/Treatment/treatment401,ex:belong_to,ex:effective .
3,http://example/Treatment/treatment402,ex:belong_to,ex:effective .
4,http://example/Treatment/treatment403,ex:belong_to,ex:effective .
...,...,...,...
144,http://example/Treatment/treatment543,ex:belong_to,ex:effective .
145,http://example/Treatment/treatment544,ex:belong_to,ex:effective .
146,http://example/Treatment/treatment545,ex:belong_to,ex:effective .
147,http://example/Treatment/treatment546,ex:belong_to,ex:effective .


In [7]:
query = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX ex: <http://example/#> 
PREFIX treatment_drug: <http://example/Treatment_Drug#>
    
select distinct ?treatment
where {
    #?treatment rdf:type ex:Treatment .
    ?treatment ex:belong_to ex:low_effect .
    }
    """

qres = g.query(query)
df_decrease_effectiveness = sparql_results_to_df(qres)
df_decrease_effectiveness['predicate'] = 'ex:belong_to'
df_decrease_effectiveness['object'] = 'ex:low_effect .'

In [8]:
df_category = pd.concat([df_effective, df_decrease_effectiveness])
df_category['treatment'] = '<' + df_category['treatment'].astype(str) + '>'
display(df_category.head(2), df_category.shape)

Unnamed: 0,treatment,predicate,object
0,<http://example/Treatment/treatment399>,ex:belong_to,ex:effective .
1,<http://example/Treatment/treatment400>,ex:belong_to,ex:effective .


(548, 3)

In [9]:
df_category.reset_index(inplace=True)
df_category.drop(columns=['index'], inplace=True)
display(df_category.head(2), df_category.shape)

Unnamed: 0,treatment,predicate,object
0,<http://example/Treatment/treatment399>,ex:belong_to,ex:effective .
1,<http://example/Treatment/treatment400>,ex:belong_to,ex:effective .


(548, 3)

# Cross-Validation

In [10]:
def cross_validation(dataset):
    test_set = []
    training_set = []
    X = dataset
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    kf.get_n_splits(X)
    
    split = 1
    for train_index, test_index in kf.split(X):
        test = X.iloc[test_index]
        test['object'] = test['object'].str.replace(' .', '')
        test.to_csv(path + 'test_'+str(split)+'.ttl', sep='\t', index=False, header=False)
        create_training_set(split)
        split+=1

In [11]:
def create_training_set(split):
    with open(path + 'test_'+str(split)+'.ttl', "r") as f:
        lines_test = f.readlines()
    with open(path + "TKG_1.ttl", "r") as f:
        lines_data = f.readlines()
    with open(path + 'train_'+str(split)+'.ttl', "w") as f:
        for line in lines_data:
            line = line.replace(' .', '')
            if line in  lines_test or '@prefix' in line:
                continue
            else:                
                f.write(line)

In [12]:
cross_validation(df_category)

  test['object'] = test['object'].str.replace(' .', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['object'] = test['object'].str.replace(' .', '')
