In [233]:
from rdflib import Graph, Literal, RDFS, RDF, URIRef, XSD
import pandas
from SPARQLWrapper import SPARQLWrapper, JSON
import os
from utils import conv2list_train, conv2list_test
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [219]:
def prune(graph):
    """
    Builds a copy of the graph that keeps only triples whose object is not a literal.
    Args:
        graph: iterable of (subject, predicate, object) triples, e.g. an rdflib Graph
    Returns:
        A new Graph; the input graph is left untouched.
    """
    pruned = Graph()
    # Lazily select the triples to keep so the input is only walked once.
    non_literal_triples = (
        triple for triple in graph if not isinstance(triple[2], Literal)
    )
    for triple in non_literal_triples:
        pruned.add(triple)
    return pruned

In [220]:
def generate_query_template(j):
    """
    Builds the SPARQL templates used to fetch predicate paths.

    One template is produced for every path length from 1 up to j, so the
    returned list has j entries (empty when j is 0); entry i-1 describes a
    path of length i running from ?v0 to ?v<i>.
    Args:
        j: maximum length of the path.
    Returns:
        list of query strings, ordered by increasing path length.
    eg:
        entry for length 1
            SELECT ?p1
            WHERE{
                ?v0 ?p1 ?v1 .
            }
        entry for length 2
            SELECT ?p1 ?p2
            WHERE{
                ?v0 ?p1 ?v1 .
                ?v1 ?p2 ?v2 .
            }
    """
    templates = []

    for length in range(1, j + 1):
        hops = range(1, length + 1)
        projection = "SELECT" + "".join(f" ?p{hop}" for hop in hops)
        pattern = "".join(
            f"\t?v{hop - 1} ?p{hop} ?v{hop} .\n\t" for hop in hops
        )
        # Whitespace is spelled out explicitly so the layout of the generated
        # text does not depend on how this function is indented.
        templates.append(
            "\n"
            f"        {projection}\n"
            "        WHERE {\n"
            f"            {pattern}\n"
            "        }\n"
            "        "
        )

    return templates


In [221]:
def prune_result(results):
    """
    Drops every retrieved path that uses a basic schema predicate
    (subClassOf, range, domain or type) and stringifies the remaining ones.
    Args:
        results: Result of the SPARQL query; each row holds the predicates of one path
    Returns:
        list of paths, each path being a list of predicate strings
    """
    schema_predicates = (RDFS.subClassOf, RDFS.range, RDFS.domain, RDF.type)
    return [
        [str(term) for term in row]
        for row in results
        # keep the row only when none of its predicates is a schema predicate
        if all(term not in schema_predicates for term in row)
    ]

In [222]:
def path_finding(graph,s,o,k=3):
    """
    Finds the predicate paths of length 1..k between subject and object.

    Earlier revisions overwrote the result on every query, so only the paths
    of the longest length survived, and shorter templates were re-run with the
    wrong end variable substituted (leaving the object unbound). Each template
    is now executed exactly once with both endpoints bound, and the paths of
    all lengths are accumulated.
    Args:
        graph : Reference graph; only its query(sparql_string) method is used
        s: target subject, already written as a SPARQL term (e.g. "<http://...>")
        o: target object, written the same way
        k: maximum path length to search (default 3)
    Returns:
        list of paths as returned by prune_result (lists of predicate strings);
        empty when no path is found or k is 0.
    """
    paths = []
    # generate_query_template(k) yields one template per length 1..k, in order,
    # and the template of a given length ends at the variable ?v<length>.
    for length, query_template in enumerate(generate_query_template(k), start=1):
        query = query_template.replace("?v0", f"{s}")
        query = query.replace(f"?v{length}", f"{o}")
        paths.extend(prune_result(graph.query(query)))
    return paths

In [223]:
# Creating Graph
# Load the reference knowledge graph from the N-Triples file, report its size,
# then drop every triple whose object is a literal (see prune) and report again.
graph = Graph()
graph.parse("reference-kg.nt", format="nt")
print(f"Len Before Pruning {len(graph)}")
# Rebinds the module-level `graph`; train() and test() read this pruned graph.
graph = prune(graph)
print(f"Len After Pruning {len(graph)}")

Len Before Pruning 675859
Len After Pruning 660000


In [224]:
def train():
    """
    Turns every row of the training file into a path-count dictionary.

    For each row the subject (column 2) and object (column 4) are wrapped in
    angle brackets and the paths between them are looked up in the
    module-level graph.
    returns:
        X: a list of dictionaries mapping a path (tuple of predicates) to the
           number of times it was found
    """
    features = []
    for record in conv2list_train():
        subject_term, object_term = f"<{record[2]}>", f"<{record[4]}>"
        found_paths = path_finding(graph, subject_term, object_term)
        # lists are unhashable, so each path becomes a tuple before counting
        features.append(dict(Counter(tuple(path) for path in found_paths)))
    return features



In [225]:
# Features: one path-count dictionary per training row.
X_train = train()
# Labels: column 1 of each training row (presumably the truth value; verify
# against conv2list_train).
y_train = [i[1] for i in conv2list_train()]
# Sanity check that there is one label per feature dictionary. This used to
# print len(X), but X only exists inside train() and is undefined on a fresh
# kernel, so the cell failed under Restart & Run All.
print(len(y_train), len(X_train))

1000 1000


In [226]:
class Ensemble:
    """
    Majority-vote ensemble of 3 models, each fed a different encoding of the
    same path-count dictionaries.

    Model / feature extraction pairs:
        Logistic Regression  <- DictVectorizer
        KNeighbors           <- MultiLabelBinarizer
        RandomForest         <- CountVectorizer

    Args:
        max_iter: iteration budget for LogisticRegression. The library default
            was too small for the solver to converge on this data (it emitted
            "TOTAL NO. of ITERATIONS REACHED LIMIT"), so a larger budget is
            used by default.
        random_state: seed passed to RandomForestClassifier so that repeated
            runs of the notebook produce the same forest and predictions.
    """
    def __init__(self, max_iter=1000, random_state=42):
        self.max_iter = max_iter
        self.random_state = random_state

    @staticmethod
    def _label_sets(X):
        """Returns, per sample, the list of path keys (MultiLabelBinarizer input)."""
        return [list(d.keys()) for d in X]

    @staticmethod
    def _documents(X):
        """
        Returns, per sample, one string in which every path is joined with "_"
        and paths are separated by spaces (CountVectorizer input).
        """
        # NOTE(review): CountVectorizer is used with its default tokenizer, which
        # presumably splits these URI-based strings further - confirm intended.
        return [" ".join(["_".join(key) for key in d.keys()]) for d in X]

    def fit(self,X,y):
        """
        Transforms the data accordingly for all 3 feature extraction methods
        and implements fit() method for all the models
        X : training independent features (list of path-count dictionaries)
        y : training dependent feature
        """
        self.vec = DictVectorizer()
        self.lr = LogisticRegression(max_iter=self.max_iter)
        self.lr.fit(self.vec.fit_transform(X), y)

        self.encoder = MultiLabelBinarizer()
        self.kn = KNeighborsClassifier()
        self.kn.fit(self.encoder.fit_transform(self._label_sets(X)), y)

        self.vectorizer = CountVectorizer()
        self.rf = RandomForestClassifier(random_state=self.random_state)
        self.rf.fit(self.vectorizer.fit_transform(self._documents(X)), y)

    def voting(self, predictions=None):
        """
        Takes votes from all models and calculates the final prediction based on
        count: a sample is labelled 1 when more models voted 1 than 0,
        otherwise 0.
        predictions : optional sequence with one prediction sequence per
            model; defaults to the predictions stored by the last predict()
            call (AttributeError if predict() has not been called yet).
        """
        if predictions is None:
            predictions = self.predictions
        preds = []
        # zip(*...) walks the models' outputs sample by sample
        for votes in zip(*predictions):
            votes = list(votes)
            preds.append(1 if votes.count(1) > votes.count(0) else 0)
        return preds

    def predict(self,X):
        """
        predicts the dependent feature with every model, stores the individual
        predictions in self.predictions and returns the majority vote.
        X : list of path-count dictionaries, same format as used in fit()
        """
        pred_lr = self.lr.predict(self.vec.transform(X))
        pred_kn = self.kn.predict(self.encoder.transform(self._label_sets(X)))
        pred_rf = self.rf.predict(self.vectorizer.transform(self._documents(X)))

        self.predictions = [pred_lr,pred_kn,pred_rf]
        return self.voting()

In [227]:
# Fitting our model
# Ensemble.fit trains all three underlying classifiers on the same
# path-count dictionaries (X_train) and labels (y_train).
model = Ensemble()
model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [246]:
def test():
    """
    Turns every row of the testing file into a path-count dictionary.

    For each row the subject (column 1) and object (column 3) are wrapped in
    angle brackets and the paths between them are looked up in the
    module-level graph.
    returns:
        X: a list of dictionaries mapping a path (tuple of predicates) to the
           number of times it was found
        statements: column 0 of every row, in the same order as X
    """
    records = conv2list_test()
    features = []
    for record in records:
        subject_term, object_term = f"<{record[1]}>", f"<{record[3]}>"
        found_paths = path_finding(graph, subject_term, object_term)
        # lists are unhashable, so each path becomes a tuple before counting
        features.append(dict(Counter(tuple(path) for path in found_paths)))
    return features, [record[0] for record in records]

def save_result(pred, statements, destination="test_result.ttl"):
    """
    Creates a Graph with one hasTruthValue triple per statement and saves it
    in Turtle format.
    Args:
        pred : predictions from our model, one per statement
        statements: rdf:statements from our testing turtle file, in the same
            order as pred
        destination: file the result is written to (default test_result.ttl)
    Raises:
        ValueError: if pred and statements differ in length. Previously a
            surplus of predictions was silently ignored and a shortage failed
            with a bare IndexError part-way through.
    """
    print(len(pred), len(statements))
    # Validate before building anything so no partial result file is written.
    if len(pred) != len(statements):
        raise ValueError(
            f"got {len(pred)} predictions for {len(statements)} statements"
        )
    test_graph = Graph()
    predicate = URIRef("http://swc2017.aksw.org/hasTruthValue")
    for statement, truth_value in zip(statements, pred):
        test_graph.add((statement, predicate, Literal(truth_value, datatype=XSD.double)))

    test_graph.serialize(destination, format='ttl')

In [247]:
# Making prediction
# test() returns the feature dictionaries together with the statement column
# of the test rows, in matching order, so predictions line up with statements.
X_test, statements = test()
# Majority vote of the three fitted classifiers, one 0/1 label per test row.
pred = model.predict(X_test)





In [248]:
# Storing the result
# Writes one hasTruthValue triple per test statement to test_result.ttl and
# prints the number of predictions and statements.
save_result(pred, statements)

500 500
