<a href="https://colab.research.google.com/github/ashavish/Hackathon/blob/master/AV_NLP_HACK/SentenceTransformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load Files

In [32]:
from google.colab import files

# Colab-only: opens a browser file picker and uploads the chosen files to the
# runtime. The call returns a dict of {filename: file bytes}; bind it to a name
# so the raw contents of the uploaded archives are not echoed as cell output.
uploaded = files.upload()

In [3]:
# One-time extraction of the uploaded archives. Left commented out --
# presumably because the CSVs were already extracted in an earlier run
# (TODO confirm); uncomment on a fresh runtime.
# !unzip -q Test_H6bikL1.zip
# !unzip -q Train_aO7sTW8.zip
# List the working directory to check which data files are present.
!ls

__MACOSX     Tags.csv  Test_H6bikL1.zip   Train.csv
sample_data  Test.csv  Train_aO7sTW8.zip


## Load Data

In [4]:
import pandas as pd
import numpy as np
import string

# Read the three input tables from the working directory.
train_data = pd.read_csv("Train.csv")
test_data = pd.read_csv("Test.csv")
tags = pd.read_csv("Tags.csv")

# Column groups used further down: the topic columns that are later appended
# to the embeddings as extra features, and the tag names (from Tags.csv) that
# serve as the classification targets.
TOPIC_COLS = ['Computer Science','Mathematics','Physics','Statistics']
TAGS = list(tags['Tags'])

# Clean the abstracts the same way in both frames: lower-case first, then
# delete every character listed in string.punctuation.
_punct_table = str.maketrans('', '', string.punctuation)
for _frame in (train_data, test_data):
    _frame['ABSTRACT'] = _frame['ABSTRACT'].str.lower().str.translate(_punct_table)

## Train Validation Split

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split the same on every run.
train, val = train_test_split(
    train_data,
    test_size=0.2,
    random_state=2,
)

## Sentence Transformers

In [8]:
# Dependency of the next cell; uncomment to install it on a fresh runtime.
#!pip install sentence_transformers

In [13]:
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model by name.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Encode the train and validation abstracts in a single pass, train rows
# first, so the result can be split back by position using len(train).
sentences = [*train['ABSTRACT'], *val['ABSTRACT']]
sentence_embeddings = sbert_model.encode(sentences)

In [14]:
# Sanity check: (number of encoded abstracts, embedding width).
sentence_embeddings.shape

(14004, 768)

In [20]:
from sklearn.preprocessing import normalize
# Rescale every embedding row to unit L2 norm ('l2' is normalize's default,
# spelled out here for the reader).
sentence_embeddings = normalize(sentence_embeddings, norm='l2', axis=1)

In [21]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
# The embedding matrix holds the train rows first and the validation rows
# after them, so split it back by position.
n_train = len(train)
trn_emb, val_emb = sentence_embeddings[:n_train], sentence_embeddings[n_train:]

# One logistic-regression model per tag, wrapped in a one-vs-rest classifier.
base_lr = LogisticRegression(C=10, n_jobs=1, max_iter=1000)
clf = OneVsRestClassifier(base_lr)

clf.fit(trn_emb, train[TAGS])

OneVsRestClassifier(estimator=LogisticRegression(C=10, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto', n_jobs=1,
                                                 penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

## Best Threshold

In [22]:
from sklearn.metrics import f1_score

# Get best threshold for each label
def get_cut_offthreshold(y_pred_prob,validation_set,TAGS):
    '''
    Pick, for every tag, the probability cut-off that scores best on the
    validation set.

    Parameters
    ----------
    y_pred_prob : 2-D array of predicted probabilities; column ``i`` belongs
        to ``TAGS[i]``.
    validation_set : table indexable by tag name (e.g. a DataFrame) that
        returns the true labels for that tag.
    TAGS : sequence of tag names.

    Returns
    -------
    list with one threshold per tag, each taken from the grid
    0.00, 0.01, ..., 0.99. Ties go to the lowest threshold (np.argmax).
    '''
    thresholds = np.arange(100) / 100.0
    best_thresholds = []
    # Walk over the tags actually supplied instead of a hard-coded count of
    # 25, so the function works for any number of labels.
    for idx, tag in enumerate(TAGS):
        y_true = validation_set[tag]
        tag_prob = y_pred_prob[:, idx]
        # NOTE(review): for a single binary column, average='micro' pools both
        # classes, which makes this score equal to plain accuracy rather than
        # the F1 of the positive class. Kept as-is to preserve the thresholds
        # that are selected; consider average='binary' if per-tag F1 is wanted.
        scores = [f1_score(y_true, tag_prob > thresh, average='micro') for thresh in thresholds]
        best_thresholds.append(thresholds[np.argmax(scores)])
    return best_thresholds

def get_predictions(pred_prob,best_thresholds,TAGS):
    '''
    Turn predicted probabilities into 0/1 predictions using one threshold per tag.

    Parameters
    ----------
    pred_prob : 2-D array of predicted probabilities; column ``i`` belongs
        to ``TAGS[i]``.
    best_thresholds : sequence with one cut-off per tag, in the same order.
    TAGS : sequence of tag names; its length sets the number of output columns.

    Returns
    -------
    Float array of shape (pred_prob.shape[0], len(TAGS)) holding 1.0 where the
    probability is strictly greater than the tag's threshold, else 0.0.
    '''
    n_labels = len(TAGS)
    predictions = np.zeros((pred_prob.shape[0], n_labels))
    # Loop over the real number of tags instead of a hard-coded 25: a fixed
    # count raises IndexError with fewer tags and leaves columns at zero with more.
    for idx in range(n_labels):
        predictions[:, idx] = pred_prob[:, idx] > best_thresholds[idx]
    return predictions

In [23]:
# Score the validation embeddings, tune one cut-off per tag on those same
# rows, then binarise the probabilities with the tuned cut-offs.
y_pred_prob = clf.predict_proba(val_emb)
best_thresholds = get_cut_offthreshold(y_pred_prob, val, TAGS)
y_pred = get_predictions(y_pred_prob, best_thresholds, TAGS)

# The thresholds were selected on this very split, so the reported score is
# an optimistic estimate.
val_micro_f1 = f1_score(val[TAGS], y_pred, average='micro')
print("F1 Score on Validation Set", val_micro_f1)

F1 Score on Validation Set 0.6225692849487062


In [27]:
# Predicting on Test
from sklearn.preprocessing import normalize

# The classifier was fitted on row-wise L2-normalised embeddings, so the test
# embeddings must go through the same normalisation before being scored;
# feeding raw vectors would put them on a different scale than the training data.
test_emb = normalize(sbert_model.encode(list(test_data['ABSTRACT'])), axis=1)

y_pred_test_prob = clf.predict_proba(test_emb)

# Reuse the per-tag cut-offs tuned on the validation split.
predictions = get_predictions(y_pred_test_prob,best_thresholds,TAGS)

# One column per tag plus the row id, written to CSV.
result = pd.DataFrame(predictions, columns=TAGS)
result['id'] = test_data['id']
result.to_csv("sent_transf_logistic.csv",index=False)

## Including Topic Columns

In [28]:
from sklearn.preprocessing import normalize

# Append the topic columns to the embeddings as extra features.
trn_abs = np.hstack((trn_emb,train[TOPIC_COLS]))
val_abs = np.hstack((val_emb,val[TOPIC_COLS]))
# trn_emb / val_emb are slices of the L2-normalised embedding matrix, so the
# test embeddings need the same row-wise normalisation to be on the same scale.
# Normalising rows that are already unit-length leaves them unchanged, so this
# is safe whether or not test_emb was normalised earlier.
test_abs = np.hstack((normalize(test_emb, axis=1),test_data[TOPIC_COLS]))

In [29]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
# Fresh one-vs-rest logistic model with the same hyper-parameters as the
# embedding-only model, now trained on embeddings plus topic columns.
# This rebinds `clf`, replacing the earlier model.
topic_lr = LogisticRegression(C=10, n_jobs=1, max_iter=1000)
clf = OneVsRestClassifier(topic_lr)

clf.fit(trn_abs, train[TAGS])

OneVsRestClassifier(estimator=LogisticRegression(C=10, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto', n_jobs=1,
                                                 penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [30]:
# Score the validation features (embeddings + topic columns), re-tune the
# per-tag cut-offs on those rows, then binarise with the tuned cut-offs.
y_pred_prob = clf.predict_proba(val_abs)
best_thresholds = get_cut_offthreshold(y_pred_prob, val, TAGS)
y_pred = get_predictions(y_pred_prob, best_thresholds, TAGS)

# Thresholds are tuned and evaluated on the same split, so this score is an
# optimistic estimate.
val_micro_f1 = f1_score(val[TAGS], y_pred, average='micro')
print("F1 Score on Validation Set", val_micro_f1)

F1 Score on Validation Set 0.7093610698365527


In [31]:
# Predicting on Test

# Score the test features (embeddings + topic columns) with the current
# model and apply the cut-offs tuned on the validation split.
y_pred_test_prob = clf.predict_proba(test_abs)
predictions = get_predictions(y_pred_test_prob, best_thresholds, TAGS)

# One column per tag plus the row id, written to CSV.
result = pd.DataFrame(predictions, columns=TAGS)
result['id'] = test_data['id']
result.to_csv("sent_transf_topic_cols_logistic.csv", index=False)