<a href="https://colab.research.google.com/github/arangoml/arangopipe/blob/master/examples/R_Example_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Overview

This notebook provides an overview of using Arangopipe with your R projects. It walks through a simple, illustrative example of using the arangopipe package to store meta-data about model development activity done in R. To run this notebook locally, first install the R kernel for Jupyter using:
```conda install -c r r-irkernel```

The cells below provide step-by-step instructions for developing a regression model for the California housing dataset in R and then using Arangopipe to store the meta-data about the results.

In [None]:
# Install the readr and RCurl packages.
# The CRAN mirror is addressed over HTTPS so that the package index and the
# package archives are not downloaded over an unencrypted connection.
cran_mirror <- "https://cran.rstudio.com/"
install.packages("readr", repos = cran_mirror)
install.packages("RCurl", repos = cran_mirror)

### Load the libraries and read the data file

In [None]:
library(readr)
library(RCurl)
# Location of the housing CSV inside the arangopipe GitHub repository.
fp <- "https://raw.githubusercontent.com/arangoml/arangopipe/master/arangopipe/tests/CItests/cal_housing.csv"

# read.csv() accepts a URL; the file is loaded into the data frame `df`.
df <- read.csv(file = fp)

### List the data types

In [None]:
# Display the structure of the data frame: one line per column with its type.
str(df)

### Transform the response variable (don't run the next cell twice!)

In [None]:
# Overwrite the response column with its natural logarithm.
# Run this cell only once: executing it again would take the log of values
# that have already been log-transformed.
df$medianHouseValue <- log(df$medianHouseValue)

### Generate the test and train datasets

In [None]:
# Fix the RNG seed so that the same rows are drawn on every run.
set.seed(123)

# 66.7% of the rows (rounded down) are used for training.
smp_size <- floor(nrow(df) * 0.667)
train_ind <- sample(seq_len(nrow(df)), size = smp_size)

# The sampled rows form the training set; every remaining row is a test row.
df.train <- df[train_ind, ]
df.test <- df[-train_ind, ]

### Inspect the training dataset

In [None]:
# Display the first rows of the training partition.
head(df.train)

### Develop the linear model

In [None]:
# Fit a linear model of medianHouseValue on every other column of the
# training partition.
lm.housing <- lm(formula = medianHouseValue ~ ., data = df.train)

### Generate the test and training predictions

In [None]:
# Predict the response on both partitions with the fitted model.
trng.pred <- predict(lm.housing, newdata = df.train)
test.pred <- predict(lm.housing, newdata = df.test)

# Root-mean-squared error: square each residual, average the squares, then
# take the square root. Squaring the *sum* of the residuals instead lets
# positive and negative errors cancel (for a least-squares fit with an
# intercept the training residuals sum to ~0), which is not an RMSE.
rmse.trng <- sqrt(mean((df.train$medianHouseValue - trng.pred)^2))
rmse.test <- sqrt(mean((df.test$medianHouseValue - test.pred)^2))

### Summarize the model developed

In [None]:
# Display the fit summary of the linear model.
summary(lm.housing)

### Set up to save the model meta-data to Arangopipe by installing the reticulate library

In [None]:
# Install reticulate, which is loaded in the following cell.
# The CRAN mirror is passed explicitly, as in the first install cell, so the
# installation does not depend on a mirror having been configured for the
# session.
install.packages("reticulate", repos = "https://cran.rstudio.com/")


In [None]:
library("reticulate")

# Make sure reticulate's Miniconda installation exists and is current.
# miniconda_update() can only update an existing installation, so on a fresh
# machine Miniconda is installed first (install_miniconda() also updates it).
mc_path <- miniconda_path()
if (dir.exists(mc_path)) {
  miniconda_update(path = mc_path)
} else {
  install_miniconda(path = mc_path)
}


1. Load the library
2. Set up a Python environment for this project (Miniconda)
3. Install Arangopipe and its dependencies in the environment

In [None]:
# Create the conda environment that will hold the Python dependencies.
conda_create("r-reticulate")

# Install Arangopipe and its dependencies into that environment with pip.
# The environment argument of py_install() is `envname`; it is spelled out in
# full instead of relying on partial argument matching of `env`.
py_install(
  packages = c(
    "arangopipe==0.0.70.0.0", "python-arango", "pandas",
    "PyYAML==5.1.1", "sklearn2", "yapf", "autopep8"
  ),
  envname = "r-reticulate",
  pip = TRUE
)

In [None]:
# Fetch the r_example_arangopipe branch of the arangopipe repository, which
# contains the Python helper script sourced further below.
# The clone is skipped when the checkout already exists (e.g. the cell is
# re-run), and a non-zero git exit status is reported instead of ignored.
if (!dir.exists("arangopipe")) {
  clone_status <- system(
    "git clone -b r_example_arangopipe https://github.com/arangoml/arangopipe.git"
  )
  if (clone_status != 0) {
    stop(
      "git clone of arangopipe failed with exit status ", clone_status,
      call. = FALSE
    )
  }
}

### Use a Python connector to set up an Arangopipe connection

In [None]:
# Connection settings handed to the Python connector: service host, end point,
# service name, port and protocol (all passed as strings).
conn_params <- list(
  DB_service_host = "arangoml.arangodb.cloud",
  DB_end_point = "createDB",
  DB_service_name = "createDB",
  DB_service_port = "8529",
  conn_protocol = "https"
)

In [None]:
# Display the connection settings for a visual check.
conn_params

In [None]:
# Run the Python helper script from the cloned repository.
# NOTE(review): presumably this script defines conn_arangopipe(), which is
# called in the next cell - confirm against the script.
source_python('arangopipe/examples/arangopipe_conn.py', convert = TRUE)

In [None]:
# Open the Arangopipe connection with the settings assembled in conn_params.
apcon <- conn_arangopipe(conn_params)

In [None]:
# Pull the two handles out of the connection result: `ap` is used below to
# register datasets, featuresets and models and to log the run; `ap_admin` is
# used to register the project.
ap <- apcon$ap
ap_admin <- apcon$ap_admin

In [None]:
# Project meta-data: only the project name is supplied.
proj_info <- list(name = "R_Arangopipe_Connection_Test")

# Register the project through the admin handle and keep the response.
proj_reg <- ap_admin$register_project(proj_info)

In [None]:
# Display the response returned by the project registration.
proj_reg

### Register the dataset

In [None]:
# Dataset meta-data. The current time is appended to the name so that each
# execution of this cell produces a distinct dataset name.
# The description is stored with the dataset, so the misspelling of
# "California" is corrected here.
ds_info <- list(
  "name" = paste("california-housing-dataset", Sys.time(), sep = "-"),
  "description" = "This dataset lists median house prices in California. Various house features are provided",
  "source" = "UCI ML Repository"
)

In [None]:
# Register the dataset meta-data; the response's `_key` is used later to link
# the featureset and the run to this dataset.
ds_reg <- ap$register_dataset(ds_info)

In [None]:
# Display the response returned by the dataset registration.
ds_reg

### Generate the featureset meta-data 

In [None]:
# Describe the featureset by the (primary) class of every column.
# vapply() always yields a named character vector with exactly one entry per
# column; sapply(df, class) would silently return a list instead if any
# column carried more than one class.
f.info <- vapply(df, function(column) class(column)[1L], character(1))

In [None]:
# Add a "name" entry for the featureset; the current time is appended so each
# execution produces a distinct name.
f.info["name"] <- paste("logTransformedFeatureset", Sys.time(),sep="-")


In [None]:
# Convert the named vector into a named list before passing it to Arangopipe.
f.info <- as.list(f.info)

### Register the featureset

In [None]:
# Register the featureset, passing the key of the dataset registered earlier.
fs_reg <- ap$register_featureset(f.info, ds_reg$`_key`)

### Generate the model meta-data

In [None]:
# Model meta-data: a single "name" entry, made distinct per execution by
# appending the current time.
model_info <- list(
  name = paste("R_Linear_Regression_Model_Housing_Data", Sys.time(), sep = "-")
)


### Register the model meta-data

In [None]:
# Register the model meta-data under the project registered earlier.
model_reg <- ap$register_model(model_info, project = "R_Arangopipe_Connection_Test")

### Set up the data structures to capture modeling meta-data summary

In [None]:
# Empty container that the following cells fill with the run's meta-data.
run_info <- list()

In [None]:
# The run identifier is the whole number of minutes elapsed between a fixed
# reference date (b1) and now (b2), stored as a string.
b1 <- ISOdate(2020, 11, 13)
b2 <- Sys.time()
elapsed_mins <- difftime(b2, b1, units = "mins")
uuid <- as.character(as.integer(elapsed_mins))
run_info[["run_id"]] <- uuid

In [None]:
# Parameters describing the fitted model.
model.params.data <- list(
  name = "Linear_Model",
  Intercept = "True"
)

# Wrap the parameters together with the identifier of this run.
model.params <- list(
  run_id = uuid,
  model_params = model.params.data
)

In [None]:
# Collect the goodness-of-fit figures reported by summary() for this run.
ms <- summary(lm.housing)
model.perf.summary <- list(
  run_id = uuid,
  r.squared = ms$r.squared,
  adj.r.squared = ms$adj.r.squared
)
# The timestamp is added with `[<-` exactly as before so that the stored
# value has the same form as in the original cell.
model.perf.summary["timestamp"] <- Sys.time()

In [None]:
# Display the performance summary for a visual check.
model.perf.summary

In [None]:
# Link the run to the registered dataset, featureset and model via their keys.
run_info["dataset"] <- ds_reg$`_key`
run_info["featureset"] <- fs_reg$`_key`
run_info["model"] <- model_reg$`_key`

# Attach the nested parameter and performance lists.
run_info$`model-params` <- model.params
run_info$`model-perf` <- model.perf.summary

# Tag and project both carry the name of the project registered earlier.
run_info["tag"] <- "R_Arangopipe_Connection_Test"
run_info["project"] <- "R_Arangopipe_Connection_Test"

### Log the model meta-data

In [None]:
# Send the assembled run meta-data to Arangopipe and keep the response.
ri <- ap$log_run(run_info)

### We are done!  You can set up your R projects to use Arangopipe in a similar manner.