### A360AI
Ensure that the A360AI object is loaded

In [None]:
# Evaluating the bare name displays the object; a NameError here means `a360ai` is not defined in this kernel.
a360ai

This is the project associated with the workspace.

In [None]:
# Name of the project associated with this workspace.
a360ai.project_name

This is the workspace flavor. It can be any one of 'spark', 'scipy', 'pytorch', 'tensorflow', 'huggingface', or 'xgboost'.

In [None]:
import os

# A360_IMAGE is read as a dash-separated image name. The flavor is "huggingface"
# when that is the second-to-last token, otherwise it is the last token.
# A missing A360_IMAGE variable still raises KeyError, as before.
image_flavor = os.environ["A360_IMAGE"].split("-")

# Guard the [-2] lookup: an image name without any dash splits into a single
# token, and indexing [-2] on it would raise IndexError.
if len(image_flavor) >= 2 and image_flavor[-2] == "huggingface":
    flavor = "huggingface"
else:
    flavor = image_flavor[-1]
flavor

### Data Repos
Data Repos are A360AI objects used to read and write datasets. A project can be associated with multiple datarepos but a datarepo is always associated with a single project.

List the datarepos associated with the current project.

In [None]:
datarepos = a360ai.list_datarepos()

# A listing with zero rows means the project has no datarepo to work with,
# so stop here with an explicit message instead of failing later.
if not datarepos.shape[0]:
    raise Exception(
        f"No data repos associated with the project {a360ai.project_name}. Please add a datarepo."
    )
datarepos

#### Select a datarepo
Please set the *datarepo_name* variable to a valid string. The ``set_default_datarepo`` method needs to be used in order to access the datasets stored in the specific datarepo.

In [None]:
### Replace <DATAREPO_NAME> with the name of one of the datarepos listed above, as a quoted string.
### Example: "Health Byte - Dev"

datarepo_name = <DATAREPO_NAME> 
# Make this datarepo the default one for the dataset calls that follow.
a360ai.set_default_datarepo(datarepo_name)

List the datasets stored in the current datarepo.

In [None]:
datasets = a360ai.list_datasets()

# Nothing can be loaded from a datarepo whose listing has zero rows,
# so fail early with an explicit message.
if not datasets.shape[0]:
    raise Exception(
        f"No datasets associated with the datarepo {datarepo_name}. Please add a dataset."
    )
datasets

#### Select the model type

Please specify the type of task to be performed. Options are limited to "classification" and "regression".

In [None]:
### Replace <MODEL_TYPE> with "classification" or "regression" (quoted string).
### Later cells branch on model_type == "classification"; any other value takes the regression path.
model_type = <MODEL_TYPE> 

#### Select the dataset
Please set the *dataset_name* variable to a valid string. 

In [None]:
### Replace <DATASET_NAME> with the name of one of the datasets listed above, as a quoted string.
### For huggingface, "clickbait_data_200.csv" (classification) or
### "essay_scoring.csv" (regression) can be used.

dataset_name = <DATASET_NAME> 

# Load the selected dataset; later cells use `df` through `.columns` and `.drop`.
df = a360ai.load_dataset(dataset_name)

#### Specify the target column 
Please choose the target column from the columns listed below.

In [None]:
# Column labels of the loaded dataset as a plain Python list.
df.columns.tolist()

In [None]:
### Replace <TARGET_COLUMN> with the name of the column to predict, as a quoted string.
### "clickbait" if using "clickbait_data_200.csv"
### "score" if using "essay_scoring.csv"

target_column = <TARGET_COLUMN> 

#### Please provide the number of classes
If the model_type is classification, please specify the number of classes present. An integer should be assigned to the variable ``num_classes``.

In [None]:
### Replace <NUM_CLASSES> with an integer: the number of distinct classes in the target column.
### Only relevant when model_type is "classification".
### 2 if using "clickbait_data_200.csv"

num_classes = <NUM_CLASSES>

#### Split the dataset into train/test data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Separate the label column from the remaining (feature) columns.
y = df[target_column]
X = df.drop(columns=target_column)

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Number of feature columns in the training split.
num_features = X_train.shape[1]

### A360AI Model

Create/retrieve an A360AI model. The following arguments need to be passed to the ``get_or_create_model`` method: model_name, version and model_type (default is "classification").

In [None]:
from uuid import uuid4
# Six-character random suffix, so each execution of this cell uses a distinct model name.
unique = uuid4().hex[-6:]

# Fetch the model with this name/version if it exists, otherwise create it.
model = a360ai.get_or_create_model(
    version="1.0",
    model_type=model_type,
    model_name=f"Sample Notebook {flavor} {model_type} - {unique}",
)

### A360AI Experiment

Create/retrieve an A360AI experiment. The following arguments need to be passed to the ``get_or_create_experiment`` method of the Model object: experiment_name, model_flavor, train_features and train_target. 

For huggingface models, the following additional parameters need to be provided: pipeline_name, framework and tokenizer. The only framework currently supported for huggingface is "pt" (pytorch).

In [None]:
import pandas as pd

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# DataFrame versions of the training split, kept under their own names.
X_train_df = pd.DataFrame(X_train)
y_train_df = pd.DataFrame(y_train)

# For classification every split is handed on as a DataFrame;
# for any other model type the splits stay exactly as they were.
if model_type == "classification":
    X_train, y_train = X_train_df, y_train_df
    X_test, y_test = pd.DataFrame(X_test), pd.DataFrame(y_test)

# After the branch above, X_train / y_train are what the experiment is registered with.
train_features, train_target = X_train, y_train

# Fetch the experiment with this name if it exists, otherwise create it.
experiment = model.get_or_create_experiment(
    experiment_name=f"sample_notebook_experiment{unique}",
    model_flavor=flavor,
    train_features=train_features,
    train_target=train_target,
    pipeline_name="text-classification",
    tokenizer=tokenizer,
    framework="pt",
)

#### View the experiments associated with the model

In [None]:
# Show the experiments recorded for `model`.
model.list_experiments()

#### Choose the candidate model to be trained.

This model will be trained on the train dataset and evaluated on the test dataset. The candidate model will also be logged later after it has been trained.

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import DistilBertForSequenceClassification
    
class Data(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Parameters
    ----------
    X : mapping
        Encodings keyed by field name; each value must be indexable by sample
        position (lists or tensors both work).
    y : sequence
        One label per sample, in the same order as the encodings.
    """

    def __init__(self, X, y):
        self.x = X
        self.y = y

    def __getitem__(self, index):
        """Return sample ``index`` as a dict of tensors, with its label under ``'labels'``."""
        # as_tensor accepts lists, numpy values and tensors alike. Unlike
        # torch.tensor it does not warn about copy-constructing when the
        # stored encodings are already tensors.
        item = {key: torch.as_tensor(val[index]) for key, val in self.x.items()}
        item['labels'] = torch.as_tensor(self.y[index])
        return item

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.y)

# Only the first feature column is tokenized.
column_name = X_train.columns[0]

# The texts and labels get their own names instead of overwriting
# X_train / X_test / y_train / y_test, so this cell can be re-run without
# `X_train.columns` failing on an already-converted list.
train_texts = X_train[column_name].tolist()
test_texts = X_test[column_name].tolist()

# Flatten the targets (Series or single-column DataFrame) into plain Python
# scalars: ints for class ids, floats for regression targets.
# NOTE(review): Python floats become float32 tensors in the Dataset, which is
# presumably the dtype the regression loss needs — confirm on a regression run.
label_cast = int if model_type == "classification" else float
train_labels = [label_cast(label) for label in y_train.values.ravel()]
test_labels = [label_cast(label) for label in y_test.values.ravel()]

# Both splits are encoded the same way (plain lists); the Dataset converts
# each item to tensors on access.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

train_dataset = Data(train_encodings, train_labels)
test_dataset = Data(test_encodings, test_labels)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Classification heads need one output per class, so honour the user-supplied
# `num_classes`; a single output is used for regression.
if model_type == "classification":
    num_labels = int(num_classes)
else:
    num_labels = 1

candidate_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

candidate_model.to(device)
candidate_model.train()

### Create an A360AI Run

Use the Run to log the trained model, metadata, metrics and hyperparameters.

In [None]:
import numpy as np
import torch
from sklearn.metrics import accuracy_score, mean_squared_error
from torch.utils.data import DataLoader
# NOTE(review): newer transformers releases deprecate their own AdamW in favour of
# torch.optim.AdamW — confirm against the version installed in this workspace.
from transformers import AdamW

optimizer = AdamW(candidate_model.parameters(), lr=5e-5)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# The score is computed over every test batch, so shuffling the test set is unnecessary.
test_loader = DataLoader(test_dataset, batch_size=10, shuffle=False)

# Loss of the last batch of each epoch, stored as plain floats.
train_loss = []

num_epochs = 2

with experiment.run_experiment() as run:

    # --- Training -----------------------------------------------------------
    candidate_model.train()
    for epoch in range(num_epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            train_outputs = candidate_model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = train_outputs[0]
            loss.backward()
            optimizer.step()
        # .item() keeps a float rather than a tensor that holds on to the autograd graph.
        epoch_loss = loss.item()
        train_loss.append(epoch_loss)
        print("epoch  {}\t loss : {}".format(epoch, epoch_loss))

    # --- Evaluation ---------------------------------------------------------
    # Collect predictions and labels from ALL test batches, so the score is not
    # computed on the final batch alone.
    candidate_model.eval()  # disable dropout while scoring
    y_true_batches = []
    y_pred_batches = []
    with torch.no_grad():  # no gradients are needed for scoring
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            test_outputs = candidate_model(input_ids, attention_mask=attention_mask)
            # Move to CPU before converting: .numpy() is not available on CUDA tensors.
            y_pred_batches.append(test_outputs.logits.cpu().numpy())
            y_true_batches.append(batch['labels'].cpu().numpy().reshape(-1))

    y_true = np.concatenate(y_true_batches)
    y_pred_outputs = np.concatenate(y_pred_batches)

    if model_type == "classification":
        # Predicted class = index of the largest logit.
        y_prediction = np.argmax(y_pred_outputs, axis=1)
        test_score = accuracy_score(y_true, y_prediction)
    else:
        test_score = mean_squared_error(y_true, y_pred_outputs.reshape(-1))

    metrics = {
        "test_score": test_score,
    }

    # --- Logging ------------------------------------------------------------
    run.log_metadata({
        "notes": "This model was generated with extreme care."
    })
    run.log_metrics(metrics)
    run.log_hyperparameters({
        "foo": 1
    })
    run.log_model(candidate_model)

#### View the runs associated with the experiment. 
If the code above has been executed successfully, there will be a new entry in the dataframe returned by the ``list_runs`` method.

In [None]:
# Show the runs recorded for `experiment`; a successful run above adds a new entry.
experiment.list_runs()