In [None]:
!pip install sklearn

In [None]:
## Mount Google Drive into the Colab runtime at /content/drive
# NOTE(review): the dataset paths configured in this notebook are relative
# ('drive/My Drive/...'), so they presumably rely on the working directory
# being /content -- confirm before running outside a default Colab session.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import numpy as np
import pandas as pd
import random

In [None]:
# ---------------------------------------------------------------------------
# Dataset location
# ---------------------------------------------------------------------------
# Folder that holds the CSV splits. Keep the trailing slash: the file names
# below are appended to it directly.
data_folder = 'drive/My Drive/Colab Notebooks/deepALForCalibration/datasets/binary/chemicals_disease/'
dataToTrain = '3_train_indexed_chemicals_disease_binary.csv'   # training split
dataToVal = '3_val_indexed_chemicals_disease_binary.csv'       # validation split
dataToTest = '3_test_indexed_chemicals_disease_binary.csv'     # test split

# Full paths to each split
unlabeled_data_dir = f'{data_folder}{dataToTrain}'
validation_data_dir = f'{data_folder}{dataToVal}'
test_data_dir = f'{data_folder}{dataToTest}'

# ---------------------------------------------------------------------------
# CSV column names (adapt these to your own data)
# ---------------------------------------------------------------------------
iID = 'itemID'              # unique ID of each item; used during active learning
goldLabel = 'crowd_label'   # column holding the gold label of each item
txt = 'text'                # column holding the text/content of each item

# ---------------------------------------------------------------------------
# Hyper-parameter grid to search over
# ---------------------------------------------------------------------------
parameter_space = dict(
    hidden_layer_sizes=[(50, 50, 50), (50, 100, 50), (100,)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant', 'adaptive'],
)

# ---------------------------------------------------------------------------
# TF-IDF settings
# ---------------------------------------------------------------------------
maxFeat = 1024          # maximum number of features kept in the tf-idf vectors
nGramRange = (1, 3)     # n-gram range

# ---------------------------------------------------------------------------
# Task parameters
# ---------------------------------------------------------------------------
num_labels = 2                  # number of classes in the data
mClass = [0, 1]                 # all possible class labels
minimum_training_items = 86     # minimum number of training items before the first model is trained

In [None]:
## Feature Preparation
def prepare_features(X_train, min_df=2, max_features=None, ngram_range=(1, 3)):
    """Fit a word-level TF-IDF vectorizer on ``X_train`` and return dense features.

    Parameters
    ----------
    X_train : iterable of str
        Raw documents; the vectorizer is fitted on exactly these documents.
    min_df : int or float, default 2
        Minimum document frequency forwarded to ``TfidfVectorizer``. An integer
        0 is treated as 1 (see the comment below).
    max_features : int or None, default None
        Upper bound on the vocabulary size, forwarded to ``TfidfVectorizer``.
    ngram_range : tuple of (int, int), default (1, 3)
        Range of n-gram sizes, forwarded to ``TfidfVectorizer``.

    Returns
    -------
    numpy.ndarray
        Dense TF-IDF matrix produced by ``fit_transform(...).toarray()``.
    """
    # Recent scikit-learn releases validate constructor arguments and reject an
    # integer min_df below 1. Every vocabulary term occurs in at least one
    # document, so 0 and 1 select the same vocabulary.
    if isinstance(min_df, int) and min_df == 0:
        min_df = 1

    # use_idf / smooth_idf / sublinear_tf are boolean options: pass real bools,
    # since integer stand-ins are rejected by the same parameter validation.
    tfidf = TfidfVectorizer(min_df=min_df, max_features=max_features,
                strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                ngram_range=ngram_range, use_idf=True, smooth_idf=True, sublinear_tf=True,
                stop_words=None, lowercase=False)

    X_train_tfidf = tfidf.fit_transform(X_train).toarray()
    print("X_train_tfidf.shape", X_train_tfidf.shape)
    return X_train_tfidf

class Data():
    """Pool of items read from a CSV file: raw frame, TF-IDF features, gold labels and item IDs.

    The class relies on notebook-level settings: the column names ``iID``,
    ``goldLabel`` and ``txt``, the TF-IDF settings ``maxFeat`` and
    ``nGramRange``, the class list ``mClass`` and the ``prepare_features``
    helper.
    """

    def __init__(self, filename):
        """Load ``filename`` with pandas and build the pool.

        Attributes set:
            data            -- the DataFrame as read from the CSV
            poolDataEmb     -- TF-IDF feature matrix returned by prepare_features
            poolGoldLabels  -- values of the gold-label column
            poolDataIndices -- values of the item-ID column
        """
        # Never populated by this class; kept so existing readers of the
        # attribute keep working.
        self.poolData = np.array([])

        dt = pd.read_csv(filename)
        # min_df=1 keeps every term (any vocabulary term occurs in at least one
        # document), exactly like the former min_df=0, but is also accepted by
        # scikit-learn releases that require an integer min_df >= 1.
        X = prepare_features(dt[txt].tolist(), min_df=1, max_features=maxFeat, ngram_range=nGramRange)

        self.data = dt
        self.poolDataEmb = X
        self.poolGoldLabels = dt[goldLabel].values
        self.poolDataIndices = dt[iID].values

    def setStartState(self, nStart, classes=None, seed=None):
        ''' Initialise indicesKnown and indicesUnknown: the item IDs that start out labelled (known) and the remaining, still explorable ones (unknown).

        Input:
        nStart  -- number of labelled datapoints; nStart // number-of-classes items are drawn at random from every class so that all classes are represented
        classes -- optional list of class labels to sample from; defaults to the notebook-level mClass
        seed    -- optional seed for a private random generator, for a reproducible draw; when None the global ``random`` module state is used

        Raises:
        ValueError -- if a class holds fewer items than have to be drawn from it
        '''
        self.nStart = nStart
        class_list = list(mClass if classes is None else classes)
        rng = random if seed is None else random.Random(seed)

        labels = np.asarray(self.poolGoldLabels)
        pool_ids = np.asarray(self.poolDataIndices)
        per_class = nStart // len(class_list) if class_list else 0

        # Draw row positions per class, then translate them into item IDs.
        sampled_ids = []
        for cls in class_list:
            positions = np.flatnonzero(labels == cls).tolist()
            if per_class > len(positions):
                raise ValueError(
                    "Cannot draw %d start items for class %r: only %d available"
                    % (per_class, cls, len(positions)))
            sampled_ids.append(pool_ids[rng.sample(positions, per_class)])

        self.indicesKnown = np.concatenate(sampled_ids) if sampled_ids else np.array([])
        # Everything not drawn above stays unlabelled; a single vectorised
        # membership mask keeps the original pool order.
        self.indicesUnknown = pool_ids[~np.isin(pool_ids, self.indicesKnown)]


In [None]:
# Load the training pool and select the initial labelled start set
pool = Data(unlabeled_data_dir)
pool.setStartState(minimum_training_items)

# Convenience handles on the pool contents
poolData = pool.data
poolDataIndices = pool.poolDataIndices
poolDataEmb_train = pool.poolDataEmb
train_labels = np.array(poolData[goldLabel].tolist())

# Cross-validated (cv=3) grid search over parameter_space on the whole pool
model = GridSearchCV(
    estimator=MLPClassifier(max_iter=500),
    param_grid=parameter_space,
    n_jobs=-1,
    cv=3,
)
model.fit(poolDataEmb_train, train_labels)

# Best parameter set
print('Best parameters found:\n', model.best_params_)

# Mean score and two standard deviations for every parameter combination
means = model.cv_results_['mean_test_score']
stds = model.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, model.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))