### TODO - CNN portion of project

1. Update metrics to match those in the paper (F1 score, precision, recall, AUC)
2. Integrate phenotype dictionary to make selecting the phenotype for the Y-value in an experiment easier (i.e., not using a hard-coded integer)
3. Run experiments across all 10 phenotypes used in paper with default parameters
4. Repose this code
5. Add README
6. Create figure to compare F1 scores across phenotypes for the CNN

### Setup

In [None]:
# Mount Google Drive at /content/drive so the project files stored there are
# reachable from this runtime. The google.colab module is only available when
# running inside Colab.

from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Add the project's src directory to the module search path.
# NOTE(review): the local imports further down use the `src.` package prefix
# (e.g. `src.CNN.run_model`), which presumably resolves through the working
# directory set by os.chdir(root) rather than through this entry — confirm
# whether this path is still needed.
import sys
sys.path.append('/content/drive/MyDrive/Project/src')

In [None]:
# Project root on the mounted Drive; later cells chdir into it, and the
# relative data path in the sweep config is resolved against it.
root = '/content/drive/MyDrive/Project'

In [None]:
# Make the project root the working directory so relative paths (such as the
# HDF5 file in the sweep config) resolve against it, then echo the directory.
import os
os.chdir(root)
%pwd

In [None]:
# Detect PY file updates and reload
%load_ext autoreload
%autoreload 0.5

In [None]:
# List the contents of the current working directory (sanity check after chdir).
%ls

In [None]:
# Print the current working directory.
%pwd

### Installations

In [None]:
# Install the Weights & Biases client into the runtime; -qqq silences pip output.
!pip install wandb -qqq

In [None]:
# Authenticate this runtime with Weights & Biases; required before the sweep
# and run calls made in the Experiment section.
import wandb
wandb.login()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import logging
import time
import h5py
from platform import python_version
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import Adam, Adadelta
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from platform import python_version
from torch.utils import data

# Local imports
import src.CNN.CNN_NLP as cnn_model
from src.CNN.data_load import get_data
from src.CNN.run_model import run_model

In [None]:
# Choose the compute device once: CUDA when a GPU is visible, otherwise CPU.
# `device` is consumed by the data loading and model cells below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of the first one.
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

### Experiment

In [None]:
import torch.optim as optim

# Weights & Biases sweep definition consumed by wandb.sweep().
# Under 'parameters', a 'value' entry is a single fixed setting, while a
# 'values' entry lists the candidates the sweep chooses from for each run.
sweep_config = {
    'method': 'random',  # grid, random
    'metric': {'name': 'val_accuracy', 'goal': 'maximize'},
    'parameters': {
        # Path is relative to the project root (the working directory).
        'h5py_file': {'value': 'src/phenotyping/their-embeddings/data-nobatch.h5'},
        'batch_size': {'values': [32, 64, 128]},
        # One num_filters entry per filter size.
        'filter_sizes': {'value': [2, 3, 4, 5]},
        'num_filters': {'value': [100, 100, 100, 100]},
        'num_classes': {'value': 2},
        'dropout': {'values': [0.3, 0.5]},
        'learning_rate': {'values': [1e-1, 1e-2, 1e-3]},
        # Hard-coded integer selecting the target phenotype (see TODO item 2).
        'phenotype': {'value': 0},
        'epochs': {'values': [1]},
        # Any value other than 'adam' makes run() fall back to Adadelta.
        'opt': {'values': ['ada']},
        'rho': {'values': [0.9, 0.95]},
        'freeze_embeddings': {'values': [True]},
    },
}

def run():
    """Execute a single sweep trial: build data, model and optimizer, then train.

    Intended to be passed as ``function=`` to ``wandb.agent``. Takes no
    arguments; every hyperparameter is read from ``wandb.config`` after
    ``wandb.init`` has started the run. Relies on the notebook-level names
    ``device``, ``get_data``, ``cnn_model`` and ``run_model``.

    Raises:
        KeyError: presumably, if a required hyperparameter is missing from the
            sweep configuration (depends on ``wandb.config`` lookup behaviour).
    """
    with wandb.init(project="cs6250-project", entity="cs7643-teamscam"):
        config = wandb.config

        # Hyperparameters chosen by the sweep for this trial
        h5py_file = config["h5py_file"]
        batch_size = config["batch_size"]
        filter_sizes = config["filter_sizes"]
        num_filters = config["num_filters"]
        num_classes = config["num_classes"]
        dropout = config["dropout"]
        learning_rate = config["learning_rate"]
        rho = config["rho"]
        phenotype = config["phenotype"]
        epochs = config["epochs"]
        freeze_embeddings = config["freeze_embeddings"]

        # Get Train and Validation DataLoader (plus the pretrained embeddings)
        train_dataloader, val_dataloader, embeddings_tensor = get_data(
            h5py_file, device, batch_size, phenotype
        )

        # Instantiate CNN model. The swept dropout value is passed through;
        # previously a hard-coded 0.5 was used, so sweeping dropout had no effect.
        model = cnn_model.CNN_NLP(
            pretrained_embedding=embeddings_tensor,
            freeze_embedding=freeze_embeddings,
            vocab_size=None,
            embed_dim=300,
            filter_sizes=filter_sizes,
            num_filters=num_filters,
            num_classes=num_classes,
            dropout=dropout,
        )

        # Send model to `device` (GPU/CPU)
        model.to(device)

        # Instantiate Optimizer: Adam only when explicitly requested, otherwise
        # Adadelta (the sweep's 'ada' setting takes this branch).
        if config['opt'] == 'adam':
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        else:
            optimizer = optim.Adadelta(model.parameters(), lr=learning_rate, rho=rho)

        # Specify loss function
        loss_fn = nn.CrossEntropyLoss()

        # Wrap everything in the project's training driver. A distinct local
        # name avoids shadowing this function's own name.
        trainer = run_model(model, optimizer, loss_fn, device)

        # Run the train/validation loop
        trainer.train(train_dataloader, val_dataloader, epochs)

# Register the sweep with W&B, then let a local agent call run() `count` times,
# each time with a hyperparameter combination drawn from sweep_config.
count = 1  # number of runs to execute
sweep_id = wandb.sweep(
    sweep_config,
    entity="cs7643-teamscam",
    project="cs6250-project",
)
wandb.agent(sweep_id, count=count, function=run)




In [None]:
# Get Train and Validation DataLoader (and the pretrained embeddings tensor)
# for a single manual run outside the sweep.
# NOTE(review): h5py_file, batch_size and phenotype are not defined in any
# visible cell — they must be set by hand before running this cell.
train_dataloader, val_dataloader, embeddings_tensor = get_data(h5py_file, device, batch_size, phenotype)

In [None]:
import torch.optim as optim

# Instantiate CNN model for a single manual (non-sweep) run.
# The instance is bound to `model`, not `cnn_model`: rebinding `cnn_model`
# overwrote the imported module alias, so `cnn_model.CNN_NLP` failed on any
# re-run of this cell or of the sweep's run() function.
# NOTE(review): freeze_embedding, vocab_size, filter_sizes, num_filters and
# learning_rate are not defined in any visible cell — set them before running.
model = cnn_model.CNN_NLP(pretrained_embedding=embeddings_tensor,
                          freeze_embedding=freeze_embedding,
                          vocab_size=vocab_size,
                          embed_dim=300,
                          filter_sizes=filter_sizes,
                          num_filters=num_filters,
                          num_classes=2,
                          dropout=0.5)

# Send model to `device` (GPU/CPU)
model.to(device)

# Instantiate Adadelta optimizer
optimizer = optim.Adadelta(model.parameters(), lr=learning_rate, rho=0.95)

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

# Instantiate the model run. This rebinds the global name `run` (previously
# the sweep entry-point function); the next cell calls `run.train(...)`.
run = run_model(model, optimizer, loss_fn, device)

In [None]:
# Start the new run: train and validate using the `run` object built in the
# previous cell.
# NOTE(review): `epochs` is not defined in any visible cell — set it before
# running this cell.
run.train(train_dataloader, val_dataloader, epochs)