## Forge Configuration

In [None]:
# conda activate kgforge
import getpass
import yaml
import pandas as pd
import numpy as np
import nexussdk as nxs
from kgforge.core import KnowledgeGraphForge
from kgforge.core import Resource

In [None]:
# Nexus Fusion organization to work in (input your own Fusion org).
ORGANIZATION = "fusion-adulbrich"
# Fusion project holding the tutorial data (input your newly created project).
PROJECT = "Flower-Classification"
# Base URL of the Nexus deployment the notebook talks to.
DEPLOYMENT = "https://staging.nexus.ocp.bbp.epfl.ch/v1"

In [None]:
# Read the Nexus access token interactively (input is not echoed), so the
# secret is never written into the notebook itself.
TOKEN = getpass.getpass(prompt="Password: ")

In [None]:
# JSON-LD context resource for the project. It declares the default vocabulary,
# the `schema` and `prov` prefixes, and the short terms used on the resources
# registered below (`name`, `description`, `used`, `generated`).
context = {
    "@id": "https://context.org",
    "@context": {
        "@vocab": "https://staging.nexus.ocp.bbp.epfl.ch/v1/vocabs/",
        "schema": "http://schema.org/",
        # The PROV namespace ends in "prov#"; this keeps the prefix consistent
        # with the fully expanded `used` / `generated` IRIs declared below.
        "prov": "http://www.w3.org/ns/prov#",
        "description": {
            "@id": "http://schema.org/description",
        },
        "name": {
            "@id": "http://schema.org/name",
        },
        "used": {
            "@id": "http://www.w3.org/ns/prov#used",
        },
        "generated": {
            "@id": "http://www.w3.org/ns/prov#generated",
        },
    },
}

In [None]:
# Configure the Nexus SDK: target deployment first, then the user's token.
nxs.config.set_environment(DEPLOYMENT)
nxs.config.set_token(TOKEN)
# First run only: uncomment the next line to create the JSON-LD context
# resource in your project if no context exists there yet.
# nxs.resources.create(ORGANIZATION, PROJECT, context)

In [None]:
# Forge configuration:
# - Model: an RdfModel whose context ("https://context.org") is read from the
#   store, in the ORGANIZATION/PROJECT bucket.
# - Store: Blue Brain Nexus at DEPLOYMENT, with the template used to build
#   versioned ids and the mapping applied to attached files.
config = dict(
    Model=dict(
        name="RdfModel",
        origin="store",
        source="BlueBrainNexus",
        context=dict(
            iri="https://context.org",
            bucket=f"{ORGANIZATION}/{PROJECT}",
        ),
    ),
    Store=dict(
        name="BlueBrainNexus",
        endpoint=DEPLOYMENT,
        versioned_id_template="{x.id}?rev={x._store_metadata._rev}",
        file_resource_mapping="https://raw.githubusercontent.com/BlueBrain/nexus-forge/master/examples/configurations/nexus-store/file-to-resource-mapping.hjson",
    ),
)

In [None]:
# Create the forge session from the configuration above, authenticated with
# the user's token and scoped to the ORGANIZATION/PROJECT bucket.
forge = KnowledgeGraphForge(
    config,
    token=TOKEN,
    bucket=f"{ORGANIZATION}/{PROJECT}",
)

## Step 1: Data Analysis

In [None]:
import pandas
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
# Load the Iris table from the CSV file next to the notebook and display it.
dataset = pandas.read_csv(filepath_or_buffer="iris.csv")
dataset

In [None]:
# Box plots of the dataset columns, drawn as separate subplots on a 2x2 grid
# without shared axes, then saved so the figure can be registered later.
dataset.plot(
    kind="box",
    subplots=True,
    layout=(2, 2),
    sharex=False,
    sharey=False,
)
plt.savefig("boxplot.png")

In [None]:
# Histograms of the dataset columns, saved to disk for later registration.
dataset.hist()
plt.savefig("histogram.png")

In [None]:
# Pairwise scatter plots of the dataset columns, saved to disk for later
# registration.
scatter_matrix(dataset)
plt.savefig("scatter_matrix.png")

### Register Datasets and Activity for Step 1

In [None]:
# Input of steps 1, 2 and 3: the raw Iris CSV, attached as the distribution of
# a schema.org Dataset resource and registered through the forge.
myfile = forge.attach("./iris.csv")
mydataset = Resource(
    type="http://schema.org/Dataset",
    name="Iris Dataset 1",
    distribution=myfile,
)
forge.register(mydataset)

In [None]:
# Output of step 1: the saved box plot figure, registered as a Dataset.
myboxplot = forge.attach("./boxplot.png")
myboxplotdataset = Resource(
    type="http://schema.org/Dataset",
    name="Iris Boxplot",
    distribution=myboxplot,
)
forge.register(myboxplotdataset)

In [None]:
# Output of step 1: the saved histogram figure, registered as a Dataset.
myhistogram = forge.attach("./histogram.png")
myhistogramdataset = Resource(
    type="http://schema.org/Dataset",
    name="Iris Histogram",
    distribution=myhistogram,
)
forge.register(myhistogramdataset)

In [None]:
# Output of step 1: the saved scatter-matrix figure, registered as a Dataset.
myscatter = forge.attach("./scatter_matrix.png")
myscatterdataset = Resource(
    type="http://schema.org/Dataset",
    name="Iris Scatter Matrix",
    distribution=myscatter,
)
forge.register(myscatterdataset)

In [None]:
# Activity of step 1 (to be linked manually in Fusion): it used the Iris
# dataset and generated the three figures registered above.
myfirstactivity = Resource(
    type="http://www.w3.org/ns/prov#Activity",
    name="Data Analysis",
    used=mydataset,
    generated=[myboxplotdataset, myhistogramdataset, myscatterdataset],
)
# NOTE (original author): does not work if the resource has an array of
# type [..., ...].
forge.register(myfirstactivity)

## Step 2: Build Models

### Train and Test Sets

In [None]:
# Share of the rows held out for validation, and the seed that fixes the split.
validation_size = 0.20
seed = 7

# Columns 0-3 become X and column 4 becomes Y (the target handed to the
# classifiers below).
array = dataset.values
X = array[:, :4]
Y = array[:, 4]

X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)

### Models Definition and Running

In [None]:
# Metric passed to cross_val_score when comparing the candidate models.
scoring = "accuracy"

In [None]:
%%capture cap --no-stderr
models = []
models.append(('LR', LogisticRegression(solver='lbfgs', max_iter=500)))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
 kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)
 cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
 results.append(cv_results)
 names.append(name)
 msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
 print(msg)

In [None]:
# Persist the stdout captured in `cap` (the per-model accuracy report).
with open("models.txt", mode="w") as f:
    f.write(cap.stdout)

### Register Datasets and Activity for Step 2

In [None]:
# Input of step 2: the settings that drive model building, recorded as
# properties of a Dataset resource (no file attached).
myconfigdataset = Resource(
    type="http://schema.org/Dataset",
    name="Models Configuration",
    seed=seed,
    validation_size=validation_size,
    scoring=scoring,
)
forge.register(myconfigdataset)

In [None]:
# Output of step 2 and input of step 3: the model accuracy report file.
mymodels = forge.attach("./models.txt")
mymodelsdataset = Resource(
    type="http://schema.org/Dataset",
    name="Model Accuracies",
    distribution=mymodels,
)
forge.register(mymodelsdataset)

In [None]:
# Activity of step 2 (to be linked manually in Fusion): it used the Iris
# dataset and the models configuration, and generated the accuracy report.
mysecondactivity = Resource(
    type="http://www.w3.org/ns/prov#Activity",
    name="Models Generation",
    used=[mydataset, myconfigdataset],
    generated=mymodelsdataset,
)
forge.register(mysecondactivity)

## Step 3: Validation

In [None]:
%%capture cap --no-stderr
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

In [None]:
# Persist the stdout captured in `cap` (the kNN validation report).
with open("knn-results.txt", mode="w") as f:
    f.write(cap.stdout)

In [None]:
%%capture cap --no-stderr
lr = LogisticRegression(solver='lbfgs', max_iter=500)
lr.fit(X_train, Y_train)
predictions = lr.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

In [None]:
# Persist the stdout captured in `cap` (the logistic regression report).
with open("lr-results.txt", mode="w") as f:
    f.write(cap.stdout)

In [None]:
%%capture cap --no-stderr
svn = SVC()
svn.fit(X_train, Y_train)
predictions = svn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

In [None]:
# Persist the stdout captured in `cap` (the SVC validation report).
with open("svn-results.txt", mode="w") as f:
    f.write(cap.stdout)

### Register Datasets and Activity for Step 3

In [None]:
# Output of step 3: the kNN validation report, registered as a Dataset.
myknn = forge.attach("./knn-results.txt")
myknndataset = Resource(
    type="http://schema.org/Dataset",
    name="kNN Results",
    distribution=myknn,
)
forge.register(myknndataset)

In [None]:
# Output of step 3: the logistic regression validation report, registered as
# a Dataset.
mylr = forge.attach("./lr-results.txt")
mylrdataset = Resource(
    type="http://schema.org/Dataset",
    name="Logistic Regression Results",
    distribution=mylr,
)
forge.register(mylrdataset)

In [None]:
# Output of step 3: the SVC validation report, registered as a Dataset.
# The resource is named "SVM Results" to match the 'SVM' label this classifier
# carries in the model comparison; the former "SVN" was a typo. The variable
# names and the file path are unchanged because other cells refer to them.
mysvn = forge.attach("./svn-results.txt")
mysvndataset = Resource(
    type="http://schema.org/Dataset",
    name="SVM Results",
    distribution=mysvn,
)
forge.register(mysvndataset)

In [None]:
# Activity of step 3 (to be linked manually in Fusion): it used the Iris
# dataset, the models configuration and the accuracy report, and generated
# the three validation reports.
mythirdactivity = Resource(
    type="http://www.w3.org/ns/prov#Activity",
    name="Results of Best Models",
    used=[mydataset, myconfigdataset, mymodelsdataset],
    generated=[myknndataset, mylrdataset, mysvndataset],
)
forge.register(mythirdactivity)

In [None]:
# Replace the ID below with the ID of your own dataset in Nexus Fusion: open
# your workflow step, go to the input tab, find the dataset uploaded earlier
# and click "copy id".
temp = forge.retrieve(
    id="https://staging.nexus.ocp.bbp.epfl.ch/v1/resources/fusion-adulbrich/Flower-Classification/_/9f6f0f6c-7331-4023-a0cb-aa3561846794"
)
temp
# Download what the resource's distribution content URL points to into the
# current directory.
forge.download(
    temp,
    "http://schema.org/distribution.http://schema.org/contentUrl",
    "./",
)