In [35]:
import os
from getpass import getpass

import optuna
import pandas as pd
import psycopg2
from simpletransformers.classification import ClassificationArgs
from simpletransformers.classification import MultiLabelClassificationModel
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [5]:
# Open the PostgreSQL connection used by the rest of the notebook.
#
# Connection settings are read from environment variables so that no secret is
# stored in (or leaked through) the notebook file. The non-secret settings
# fall back to the values this notebook has always used; the password has no
# fallback and is prompted for interactively when DB_DSA_PASSWORD is unset.
# NOTE(review): the password that used to be hardcoded here should be treated
# as compromised and rotated.
con = psycopg2.connect(
    database=os.environ.get("DB_DSA_NAME", "db_dsa"),
    user=os.environ.get("DB_DSA_USER", "udsa"),
    password=os.environ.get("DB_DSA_PASSWORD") or getpass("Password for db_dsa: "),
    host=os.environ.get("DB_DSA_HOST", "92.242.58.173"),
    port=os.environ.get("DB_DSA_PORT", "1984"),
)

In [6]:
# Load every row of the PROJECTS table into a DataFrame.
#
# NOTE(review): the names below are applied positionally to the result of
# `SELECT *`, so they assume the table has exactly these 13 columns in this
# order -- confirm against the schema.
column_names = [
    "ID", "NUMBER", "VACANCIES", "NAME", "TYPE", "TYPEDESC", "TYPEID",
    "STATUSID", "STATUSDESC", "DIRECTIONHEAD", "HEAD", "DATACREATED",
    "FACULTYID",
]

# The context manager closes the cursor as soon as the rows are fetched,
# instead of leaving it open for the lifetime of the kernel.
with con.cursor() as cur:
    cur.execute("SELECT * from PROJECTS")
    rows = cur.fetchall()

df = pd.DataFrame(rows, columns=column_names)
df.head()

In [7]:
# Build the modelling frame: the project NAME is the input text and the
# whitespace-separated tokens of TYPE form the label set of each project.
papers = pd.DataFrame({
    'title': df['NAME'].tolist(),
    'categories': df['TYPE'].tolist()
})

# Rows without a name or a type cannot be used as training examples (and would
# crash the string handling below), so drop them up front.
papers = papers.dropna(subset=['title', 'categories']).reset_index(drop=True)

# Remove embedded newlines and surrounding whitespace from the titles.
papers['title'] = papers['title'].str.replace("\n", "", regex=False).str.strip()
papers['text'] = papers['title']

# Tuples are hashable, so a label combination can be counted, grouped on and
# used for stratification.
papers['categories'] = papers['categories'].apply(lambda x: tuple(x.split()))

# Keep only label combinations that occur more than once; the stratified
# train/test split needs at least two examples per combination.
# Filtering the counts directly (instead of reset_index()[...]["index"]) works
# on every pandas version: since pandas 2.0 the reset column is no longer
# named "index".
category_counts = papers['categories'].value_counts()
shortlisted_categories = category_counts[category_counts > 1].index.tolist()
papers = papers[papers["categories"].isin(shortlisted_categories)].reset_index(drop=True)

# Shuffle with a fixed seed so that the per-category cap below selects the same
# rows on every run (an unseeded shuffle made every downstream result
# irreproducible).
papers = papers.sample(frac=1, random_state=42).reset_index(drop=True)

# Cap every label combination at 250 examples.
papers = papers.groupby('categories').head(250).reset_index(drop=True)

# Multi-hot encode the labels. The encoded vectors have one position per
# distinct label, i.e. len(multi_label_encoder.classes_) entries.
multi_label_encoder = MultiLabelBinarizer()
multi_label_encoder.fit(papers['categories'])
papers['categories_encoded'] = papers['categories'].apply(lambda x: multi_label_encoder.transform([x])[0])

papers = papers[["text", "categories", "categories_encoded"]]
papers.head()

In [8]:
# Hold out 30% of the examples for evaluation. Stratifying on the label
# combination keeps each combination's share equal in both parts, and the
# fixed seed makes the split repeatable.
train, test = train_test_split(
    papers,
    test_size=0.3,
    random_state=42,
    stratify=papers['categories'],
)

In [53]:
def objective(trial):
    """Train and score one BERT configuration for an Optuna trial.

    Samples a learning rate, an Adam epsilon and a number of epochs from
    ``trial``, fine-tunes ``bert-base-cased`` as a multi-label classifier on
    the module-level ``train`` frame and evaluates it on the module-level
    ``test`` frame.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The ``'LRAP'`` entry of the evaluation result. Higher is better, so
        the calling study must be created with ``direction="maximize"``.

    NOTE(review): scoring on ``test`` means the hyperparameters are tuned
    against the held-out data; consider a separate validation split.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-8, 1e-4, log=True)
    # The parameter name used to be misspelled "adam_epilson", which is the
    # key that showed up in study.best_params.
    adam_epsilon = trial.suggest_float("adam_epsilon", 1e-8, 1e-4, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 3)

    model_args = {
        'sliding_window': False,
        'learning_rate': learning_rate,
        'adam_epsilon': adam_epsilon,
        'train_batch_size': 8,
        'eval_batch_size': 4,
        'num_train_epochs': num_train_epochs,
        'do_lower_case': False,
        'max_seq_length': 512,
        'overwrite_output_dir': True
    }
    model = MultiLabelClassificationModel(
        'bert',
        'bert-base-cased',
        # The output layer must have one unit per position of the encoded
        # label vectors, i.e. one per distinct label known to the binarizer.
        # len(shortlisted_categories) counts distinct label *combinations*
        # and only matches when every project carries a single label.
        num_labels=len(multi_label_encoder.classes_),
        args=model_args,
        use_cuda=True,
    )

    model.train_model(train[['text', 'categories_encoded']])
    # eval_model also returns the raw model outputs and the wrong predictions;
    # only the metrics dict is needed here.
    result, _model_outputs, _wrong_predictions = model.eval_model(test[['text', 'categories_encoded']])
    return result['LRAP']

In [54]:
# The objective returns LRAP, a higher-is-better ranking score. Optuna
# minimizes by default, which would pick the *worst* configuration as "best",
# so the direction has to be set explicitly.
study = optuna.create_study(study_name="bertmodel", direction="maximize")
study.optimize(objective, n_trials=10)

In [56]:
# Report the best trial's hyperparameters and its objective value, one per line.
for template, value in (
    ("Best Params : {}", study.best_params),
    ("Best LRAP : {}", study.best_value),
):
    print(template.format(value))

In [59]:
# Display the study's best trial as the cell output.
study.best_trial