### Import data

In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("data/Train.csv")
tags = pd.read_csv("data/Tags.csv")
test_data = pd.read_csv("data/Test.csv")

In [2]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14004 entries, 0 to 14003
Data columns (total 31 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 0   id                                            14004 non-null  int64 
 1   ABSTRACT                                      14004 non-null  object
 2   Computer Science                              14004 non-null  int64 
 3   Mathematics                                   14004 non-null  int64 
 4   Physics                                       14004 non-null  int64 
 5   Statistics                                    14004 non-null  int64 
 6   Analysis of PDEs                              14004 non-null  int64 
 7   Applications                                  14004 non-null  int64 
 8   Artificial Intelligence                       14004 non-null  int64 
 9   Astrophysics of Galaxies                      14004 non-null  int64 
 10

In [3]:
TOPIC_COLS = ['Computer Science','Mathematics','Physics','Statistics']
TAGS = list(tags['Tags'])

### Preprocessing

In [4]:
train_data['ABSTRACT'] = train_data['ABSTRACT'].str.lower()
test_data['ABSTRACT'] = test_data['ABSTRACT'].str.lower()

In [5]:
import string
train_data['ABSTRACT'] = train_data['ABSTRACT'].str.translate(str.maketrans('', '', string.punctuation))
test_data['ABSTRACT'] = test_data['ABSTRACT'].str.translate(str.maketrans('', '', string.punctuation))

### Test Train Split

In [6]:
from sklearn.model_selection import train_test_split
train, val = train_test_split(train_data,test_size=0.2,random_state=2)

### Common Functions

In [7]:
# Get best threshold for each label
def get_cut_offthreshold(y_pred_prob,validation_set,TAGS):
    thresholds = np.array(list(range(0,100)))/100.0
    best_thresholds = []
    for idx in range(0,25):
        scores = [f1_score(validation_set[TAGS[idx]], y_pred_prob[:,idx] > thresh, average='micro') for thresh in thresholds]
        best_thresh = thresholds[np.argmax(scores)]
        best_thresholds.append(best_thresh)
    return best_thresholds

# Get predictions based on probabilities and class specific thresholds
def get_predictions(pred_prob,best_thresholds,TAGS):
    predictions = np.zeros((pred_prob.shape[0],len(TAGS)))
    for idx in range(0,25):
        predictions[:,idx] = pred_prob[:,idx] > best_thresholds[idx]    
    return predictions

### Experiment: Averaged Glove Word Embeddings with Logistic Regression

In [8]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

glove_input_file = 'glove/glove.6B/glove.6B.100d.txt'
word2vec_output_file = 'glove/glove.6B/glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)


(400000, 100)

In [9]:
# load the Stanford GloVe model
filename = 'glove/glove.6B/glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

#### Averaged Sentence Embeddings

In [10]:
zero_vec = np.zeros(shape=(100,))

def get_sent_emb(words,w2v_vocabulary):
    word_emb = np.array([])
    for each in words.split():
        if each in w2v_vocabulary:
            emb = model[each]
#         else:
#             emb = zero_vec
            if word_emb.shape[0] == 0:
                word_emb = np.expand_dims(emb,axis=0)
            else:
                word_emb = np.concatenate((word_emb,np.expand_dims(emb,axis=0)),axis=0)
    word_emb = np.array(word_emb)
    return np.expand_dims(np.mean(word_emb,axis=0),axis=0)

w2v_vocabulary = model.vocab
sentence_emb = np.array([])
for each in list(train['ABSTRACT']) + list(val['ABSTRACT']) + list(test_data['ABSTRACT']):
    if sentence_emb.shape[0] == 0:
        sentence_emb = get_sent_emb(each,w2v_vocabulary)
    else:
        sentence_emb = np.concatenate((sentence_emb,get_sent_emb(each,w2v_vocabulary)),axis=0)

sentence_emb = np.array(sentence_emb)

# sentence_emb = normalize(sentence_emb,axis=1)
train_emb = sentence_emb[:len(train)]
val_emb = sentence_emb[len(train):len(train)+len(val)]
test_emb = sentence_emb[len(train)+len(val):]

In [12]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize



clf = OneVsRestClassifier(LogisticRegression(C=10,n_jobs=1,max_iter=1000))
clf.fit(train_emb,train[TAGS])


OneVsRestClassifier(estimator=LogisticRegression(C=10, max_iter=1000, n_jobs=1))

In [13]:
y_pred_prob = clf.predict_proba(val_emb)
best_thresholds = get_cut_offthreshold(y_pred_prob,val,TAGS)
y_pred = get_predictions(y_pred_prob,best_thresholds,TAGS)

print("F1 Score on Validation Set", f1_score(val[TAGS], y_pred, average='micro'))

F1 Score on Validation Set 0.5534709193245778


In [14]:
# Predicting on Test

y_pred_test_prob = clf.predict_proba(test_emb)

predictions = get_predictions(y_pred_test_prob,best_thresholds,TAGS)

result = pd.DataFrame(predictions)
result.columns = TAGS
result['id'] = test_data['id']
result.to_csv("glove_word_logistic.csv",index=False)