# MLRW 2022 : AI Driven Biomedical Hackathon

In [39]:
#Importing all the Libraries

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import inaugural, stopwords
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix, hstack
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
import scipy
import warnings

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to C:\Users\Amit
[nltk_data]     Bera\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Amit
[nltk_data]     Bera\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Amit
[nltk_data]     Bera\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [164]:
#Loading all the CSV files and removing unnecessary columns
train = pd.read_csv("labelled_train_data.csv").iloc[:, 1:]
train['target'] = train['ctrl']
train = train.drop(['ctrl','pert'], axis = 1)
unlabel = pd.read_csv("unlabelled_train_data.csv").iloc[:, 1:]
test = pd.read_csv("data_only_test.csv").iloc[:, 1:]

In [202]:
#Defining all the text preprocessing functions
text_col = ['characteristics_ch1', 'contact_city', 'contact_country',
       'contact_department', 'contact_institute', 'data_processing', 'extract_protocol_ch1', 'growth_protocol_ch1',
       'hyb_protocol', 'label_ch1', 'label_protocol_ch1',
       'molecule_ch1', 'organism_ch1', 'scan_protocol',
       'source_name_ch1', 'title', 'treatment_protocol_ch1', 'type','description']

wordnet_lemm = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def text_cleaner(text,wordnet_lemm=wordnet_lemm,stop_words=stop_words):
    text = re.sub("[^a-zA-Z0-9- ]", " ", str(text))
    text = text.lower()
    text = [wordnet_lemm.lemmatize(w) for w in text.split() if w not in stop_words and len(w)>1]
    return ' '.join(text)

def df_text(df):
    for i in tqdm(text_col):
        df[i] = df[i].fillna('unknown-text')
        temp_text = []
        for text in df[i].values:
            temp_text.append(text_cleaner(text))
        df[i] = temp_text
    return df

def combo_text(df):
    df['combo'] = df['characteristics_ch1'] + " "+ df['description'] + " " + df['source_name_ch1'] + " " + df['title']
    return df

In [203]:
train = combo_text(train)
train = df_text(train)
unlabel = combo_text(unlabel)
unlabel = df_text(unlabel)
test = combo_text(test)
test = df_text(test)

100%|██████████| 19/19 [00:00<00:00, 70.38it/s]
100%|██████████| 19/19 [00:07<00:00,  2.52it/s]
100%|██████████| 19/19 [00:02<00:00,  8.44it/s]


In [204]:
#We used cosine similarity to get those points from the unlabeled data which are similar to the train data.
#This will help us to retain the fact that our train and unlabelled data are having similar distribution.
vec = TfidfVectorizer(stop_words=stop_words)
train_vec = vec.fit_transform(train['combo'].values)
unlabel_vec = vec.transform(unlabel['combo'].values)
unlabel_index = []
for index, value in tqdm(enumerate(unlabel_vec)):
    counts = 0
    for j in train_vec:
        if cosine_similarity(value,j) != 0.0:
            counts += 1
    if counts >= 300:
        unlabel_index.append(index)
#Will take roughly ~2 hours to run this block of cell
#Run the below cell to download the pickle file of unlabel data indexs which are similar to train data

In [205]:
# !pip install gdown
# !gdown --id 1j8KP9zsHuujiN6R-TfDntudAPeIezxXQ
# import pickle 
# file = open('unlabel_index.p','rb')
# unlabel_index = pickle.load(file)
# file.close()

In [206]:
#Creating a new dataframe with all the similar text data
unlabel_data = unlabel.iloc[unlabel_index]
unlabel_data.shape

(3915, 51)

In [207]:
#Now training the model on the train data and predicting the labels for the unlabel data
col_name = ['characteristics_ch1','source_name_ch1','description','title']
#vec_dict will save the TFIDF objects of the columns in col_name
vec_dict = {}
all_enc = []
for i in col_name:
    vec = TfidfVectorizer()
    all_enc.append(vec.fit_transform(train[i].values))
    vec_dict[i] = vec
X = hstack(all_enc)
Y = train['target']
X.shape,Y.shape

((623, 1747), (623,))

In [208]:
dt = DecisionTreeClassifier(random_state=1811)
dt.fit(X, Y)
y_pred = dt.predict(X)
print(classification_report(Y,y_pred))
print(confusion_matrix(Y,y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       319
         1.0       1.00      1.00      1.00       304

    accuracy                           1.00       623
   macro avg       1.00      1.00      1.00       623
weighted avg       1.00      1.00      1.00       623

[[319   0]
 [  0 304]]


In [172]:
#Converting the unlabel data to TFIDF and predicting the labels
all_enc = []
for i in col_name:
    vec = vec_dict[i]
    all_enc.append(vec.transform(unlabel_data[i].values))
X_unlabel = hstack(all_enc)
unlabel_data['target'] = dt.predict(X_unlabel)
unlabel_data['target'].value_counts()

0.0    2414
1.0    1501
Name: target, dtype: int64

In [173]:
#We did an extensive EDA and research on the textual data for train adnd unlabel data.
#The important question to ask before using ML is, How can we solve this without using ML?
#After EDA, we found that the text data is having a lot of important keywords which can be used to predict the labels without using Machine Learning.
#For example, if control or ctrl appears in the text, it implies that the datapoint is a ctrl point.
#If case appears in the text and control / ctrl doesnt appear in the same text, it implies that the datapoint is a pert point. 
#We did some research on case-control studies in biostatistics, thus we have used the keyword 'case' for classification
#Applying the same logic to the predicted unlabel data.
tr = []
for i,rows in unlabel_data.iterrows():
    if 'control' in rows['title'] or 'control' in rows['source_name_ch1'] or 'control' in rows['characteristics_ch1']:
        tr.append(1)
    elif 'ctrl' in rows['title'] or 'ctrl' in rows['source_name_ch1'] or 'ctrl' in rows['characteristics_ch1']:
        tr.append(1)
    elif ('case' in  rows['title'] and 'control' not in rows['title']) or ('case' in  rows['source_name_ch1'] and 'control' not in rows['source_name_ch1']) or ('case' in  rows['characteristics_ch1'] and 'control' not in rows['characteristics_ch1']):
        tr.append(0)
    else:
        tr.append(rows['target'])
unlabel_data['target'] = tr
unlabel_data['target'].value_counts()

0.0    2391
1.0    1524
Name: target, dtype: int64

In [192]:
#We have 600 labelled points. Using these points we got 3.9k datapoints from unlabelled data,
#which are most similar to our train data. Out of this 3.9k datapoints, we took 1200 datapoints for our use i.e 600 of class label 0 and 600 of class label 1.
#So our main dataset also have a balanced distribution of labels.
b1 = unlabel_data[unlabel_data['target'] == 0].sample(600)
b2 = unlabel_data[unlabel_data['target'] ==1].sample(600)
unlabel_balance = pd.concat([b1,b2])
unlabel_balance.shape

(1200, 52)

In [193]:
main_data = pd.concat([train, unlabel_balance])
main_data.shape

(1823, 52)

In [194]:
#TFIDF vectorizer for the main data, but using the TFIDF object of train data
all_enc = []
for i in col_name:
    vec = vec_dict[i]
    all_enc.append(vec.transform(main_data[i].values))
X = hstack(all_enc)
Y = main_data['target'].values
X.shape, Y.shape

((1823, 1749), (1823,))

In [195]:
#Training the model on the main data
dt = DecisionTreeClassifier(random_state=650)
dt.fit(X, Y)

DecisionTreeClassifier(random_state=650)

### Test Data 

In [209]:
#Reading the test data and preproceessing it
test = pd.read_csv("data_only_test.csv").iloc[:, 1:]
test_ = test.copy()
test = df_text(test)

100%|██████████| 19/19 [00:02<00:00,  7.54it/s]


In [210]:
#TFIDF vectorizer for the test data
all_enc = []
for i in col_name:
    vec = vec_dict[i]
    all_enc.append(vec.transform(test[i].values))
X_test = hstack(all_enc)
X_test.shape

(6070, 1747)

In [211]:
#Predicting the labels for the test data and applying the same logic to the predicted labels
sub = pd.DataFrame()
sub['geo_accession'] = test['geo_accession']
target = dt.predict(X_test)
test['target'] = target
tr = []
for i,rows in test.iterrows():
    if 'control' in rows['title'] or 'control' in rows['source_name_ch1'] or 'control' in rows['characteristics_ch1']:
        tr.append(1)
    elif 'ctrl' in rows['title'] or 'ctrl' in rows['source_name_ch1'] or 'ctrl' in rows['characteristics_ch1']:
        tr.append(1)
    elif ('case' in  rows['title'] and 'control' not in rows['title']) or ('case' in  rows['source_name_ch1'] and 'control' not in rows['source_name_ch1']) or ('case' in  rows['characteristics_ch1'] and 'control' not in rows['characteristics_ch1']):
        tr.append(0)
    else:
        tr.append(rows['target'])
sub['ctrl'] = tr

In [212]:
#Counting the number of model predicted labels 
x = np.array(target)
unique, counts = np.unique(x, return_counts=True)
unique, counts

(array([0., 1.]), array([3883, 2187], dtype=int64))

In [217]:
#Counting the number of model predicted labels + control-case logic
sub['ctrl'].value_counts()

0.0    3814
1.0    2256
Name: ctrl, dtype: int64

In [216]:
sub.to_csv("submission.csv",index=False)