In [None]:
import pandas as pd
# Load the processed gold-standard clinical-trial table (tab-separated, UTF-8)
# and replace every missing value with an empty string so the text columns can
# be concatenated later without special-casing NaN.
studies = pd.read_csv(
    "../resources/processed-goldstandard-CT.tsv",
    sep='\t',
    encoding='utf8',
).fillna("")
studies.head()

In [None]:
# Columns of `studies` that are joined, in this order, into the single
# free-text field used as classifier input.
feature_names = [
    "official_title",
    "brief_summary",
    "detailed_description",
    "study_design_info",
    "outcomes",
    "conditions",
    "arm_groups",
    "drug_interventions",
    "other_interventions",
    "inclusion_criteria",
    "mesh_terms_conditions",
    "mesh_terms_interventions",
]

In [None]:
# Replace ';' with a space in the multi-valued columns so that each entry
# becomes ordinary whitespace-delimited text (presumably ';' is the list
# separator in these columns -- confirm against the preprocessing step).
# regex=False states explicitly that ';' is a literal, so the result does not
# depend on the pandas-version-specific default of `Series.str.replace`.
for column in (
    "conditions",
    "mesh_terms_conditions",
    "mesh_terms_interventions",
    "drug_interventions",
    "other_interventions",
):
    studies[column] = studies[column].str.replace(";", " ", regex=False)

In [None]:
# Print every text field of a sample record (row label 1). Widen the range to
# inspect more records.
for row_label in range(1, 2):
    for field in feature_names:
        value = str(studies.loc[row_label, field])
        print(field + ": " + value, end="\n\n")

In [None]:
def _concat_text_fields(record):
    """Join the record's text fields, in ``feature_names`` order, with single spaces."""
    return ' '.join(record[name] for name in feature_names)

# One combined text document per study; this is the classifier input.
studies['X'] = studies.apply(_concat_text_fields, axis=1)

In [None]:
# Display the combined text of the record with row label 1.
studies.loc[1, "X"]

In [None]:
import numpy as np
# Show the distinct relevance labels present in the data. (The value has to be
# printed: it is not the last expression of the cell, so it would otherwise be
# discarded.)
print(np.unique(studies['pm_rel_desc']))

# Binary target: 1 when the record is labelled "Human PM" or "Animal PM",
# 0 for every other label.
# Built with one vectorised assignment instead of chained indexing
# (`studies["pm"][mask] = 1`), which triggers SettingWithCopyWarning and does
# not modify `studies` at all when pandas Copy-on-Write is active.
studies["pm"] = studies["pm_rel_desc"].isin(["Human PM", "Animal PM"]).astype("int64")
studies["pm"].head(10)

In [None]:
# Class balance: number of records with target 0, then with target 1.
for label in (0, 1):
    print(int((studies["pm"] == label).sum()))

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the records for evaluation; the fixed random_state makes the
# split repeatable.
features = studies['X']
target = studies['pm']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=33,
)

# Report the size of each split.
for caption, part in (
    ("X_train shape: ", X_train),
    ("y_train shape: ", y_train),
    ("X_test shape: ", X_test),
    ("y_test shape: ", y_test),
):
    print(caption, part.shape)

In [None]:
# Show one training document together with its label.
# The split Series keep the original (shuffled) row labels, so `X_train[1]`
# is a lookup of *label* 1 and raises KeyError whenever that row was placed in
# the test split. `.iloc` selects by position and therefore always refers to a
# row that is actually in the training split.
print("x_train instance: ", X_train.iloc[1])
print("y_train instance: ", y_train.iloc[1])

In [None]:
import sklearn
from sklearn import preprocessing

# Encode the class labels as consecutive integers. The mapping is learned from
# the training labels only and then applied unchanged to the test labels.
enc = preprocessing.LabelEncoder()
enc.fit(y_train)
y_train = enc.transform(y_train)
y_test = enc.transform(y_test)

In [None]:
import string
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

def stemming_tokenizer(text):
    """Tokenize ``text`` with NLTK and reduce every token to its Porter stem.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the stem of each token, in the order the tokens occur.
    """
    porter = PorterStemmer()
    stems = []
    for token in word_tokenize(text):
        stems.append(porter.stem(token))
    return stems

In [None]:
# Maximum vocabulary size kept by the vectorizer.
n_words = 10000

# Base stop list: English stop words plus the single punctuation characters.
raw_stop_words = stopwords.words('english') + list(string.punctuation)

# The vectorizer removes stop words *after* tokenization, i.e. it compares the
# stop list against the stemmed tokens returned by `stemming_tokenizer`.
# An unstemmed list therefore misses every stop word whose stem differs from
# its surface form (e.g. "because" -> "becaus"), and scikit-learn warns that
# the stop words are inconsistent with the preprocessing. Passing the stop
# list through the same tokenizer keeps both sides in the same form; the raw
# entries are kept as well so nothing that was filtered before is lost.
stemmed_stop_words = {
    stem for word in raw_stop_words for stem in stemming_tokenizer(word)
}
stop_words = sorted(set(raw_stop_words) | stemmed_stop_words)

vectorizer = TfidfVectorizer(tokenizer=stemming_tokenizer,
                             stop_words=stop_words,
                             lowercase=True,
                             max_features=n_words)

# Learn vocabulary and IDF weights from the training documents only.
# `fit` returns the vectorizer itself, so `tfidf` and `vectorizer` refer to the
# same fitted object.
tfidf = vectorizer.fit(X_train.values.astype('U'))

In [None]:
# Replace the raw-text splits with their TF-IDF representation.
# Note: this rebinds X_train / X_test, so the cell cannot be re-run without
# re-running the train/test split first.
train_text = X_train.values.astype('U')
test_text = X_test.values.astype('U')
X_train = tfidf.transform(train_text)
X_test = tfidf.transform(test_text)

for caption, matrix in (
    ("X_train shape: ", X_train),
    ("X_test shape: ", X_test),
):
    print(caption, matrix.shape)

In [None]:
# Print the TF-IDF row of the first training document.
first_document = X_train[0]
print(first_document)

In [None]:
import autosklearn.classification
# Overall search budget handed to auto-sklearn, in seconds (10800 s = 3 h).
# Other budgets noted by the author: 1200, 86400.
search_budget_seconds = 10800

automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=search_budget_seconds,
)

# Run the model search on the training split, then predict the held-out split.
automl.fit(X_train, y_train)
y_hat = automl.predict(X_test)
print("Final Models:", automl.show_models())

In [None]:
# Evaluate the predictions on the held-out split.
metrics = sklearn.metrics

# labels=[1,0] puts the positive class in the first row and column.
confusion = metrics.confusion_matrix(y_test, y_hat, labels=[1,0])
print("Confusion Matrix: \n", confusion)

for caption, scorer in (
    ("Precision: ", metrics.precision_score),
    ("Recall: ", metrics.recall_score),
    ("Accuracy: ", metrics.accuracy_score),
):
    print(caption, scorer(y_test, y_hat))

In [None]:
import pickle
VECTORIZER_NAME = "../models/tfidfmodel_studies.sav"

# Persist the fitted TF-IDF vectorizer. The `with` block closes the file
# deterministically, so the pickle is fully flushed to disk; the previous
# `pickle.dump(obj, open(...))` form never closed the handle.
# Note: pickle stores the custom tokenizer by reference, so `stemming_tokenizer`
# must be defined under the same name wherever this file is loaded.
with open(VECTORIZER_NAME, 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [None]:
MODELNAME = "trec_model_studies.sav"

# Persist the trained auto-sklearn model. `with` guarantees the file is closed
# (and therefore completely written) before it is read back below.
with open(MODELNAME, 'wb') as model_file:
    pickle.dump(automl, model_file)

# Load the model from disk and check that it reproduces the held-out accuracy.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open(MODELNAME, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
y_hat2 = loaded_model.predict(X_test)
print("Accuracy: ", sklearn.metrics.accuracy_score(y_test, y_hat2))