### Import data

In [1]:
import pandas as pd
import numpy as np

# Read the labelled training frame, the tag list and the test frame from data/.
train_data, tags, test_data = (
    pd.read_csv("data/Train.csv"),
    pd.read_csv("data/Tags.csv"),
    pd.read_csv("data/Test.csv"),
)

In [2]:
# Summarise the training frame: entry count, column names, non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14004 entries, 0 to 14003
Data columns (total 31 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 0   id                                            14004 non-null  int64 
 1   ABSTRACT                                      14004 non-null  object
 2   Computer Science                              14004 non-null  int64 
 3   Mathematics                                   14004 non-null  int64 
 4   Physics                                       14004 non-null  int64 
 5   Statistics                                    14004 non-null  int64 
 6   Analysis of PDEs                              14004 non-null  int64 
 7   Applications                                  14004 non-null  int64 
 8   Artificial Intelligence                       14004 non-null  int64 
 9   Astrophysics of Galaxies                      14004 non-null  int64 
 10

In [3]:
# Topic columns present in the training frame.
TOPIC_COLS = [
    'Computer Science',
    'Mathematics',
    'Physics',
    'Statistics',
]
# Tag names, read from the 'Tags' column of the tags file.
TAGS = tags['Tags'].tolist()

In [4]:
# Show the first two abstracts as a plain list.
train_data['ABSTRACT'].iloc[:2].tolist()

['a ever-growing datasets inside observational astronomy have challenged scientists inside many aspects, including an efficient and interactive data exploration and visualization. many tools have been developed to confront this challenge. however, they usually focus on displaying a actual images or focus on visualizing patterns within catalogs inside the predefined way. inside this paper we introduce vizic, the python visualization library that builds a connection between images and catalogs through an interactive map of a sky region. vizic visualizes catalog data over the custom background canvas with the help of a shape, size and orientation of each object inside a catalog. a displayed objects inside a map are highly interactive and customizable comparing to those inside a images. these objects should be filtered by or colored by their properties, such as redshift and magnitude. they also should be sub-selected with the help of the lasso-like tool considering further analysis with th

### Preprocessing

In [5]:
# Lower-case the abstract text in both the training and the test frame.
for frame in (train_data, test_data):
    frame['ABSTRACT'] = frame['ABSTRACT'].str.lower()

In [6]:
import string
# Translation table that deletes every character listed in string.punctuation.
punctuation_remover = str.maketrans('', '', string.punctuation)
for frame in (train_data, test_data):
    frame['ABSTRACT'] = frame['ABSTRACT'].str.translate(punctuation_remover)

### Train / Validation Split

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows as a validation set; the fixed seed keeps the split repeatable.
train, val = train_test_split(
    train_data,
    test_size=0.2,
    random_state=2,
)

### Common Functions

In [27]:
from sklearn.metrics import f1_score

# Get best threshold for each label
def get_cut_offthreshold(y_pred_prob,validation_set,TAGS,average='micro'):
    '''
    Get the best cut-off threshold for each label.

    For every label in TAGS, each threshold in 0.00, 0.01, ..., 0.99 is tried
    and the one with the highest f1_score on the validation set is kept
    (the first one wins on ties).

    Parameters
    ----------
    y_pred_prob : 2-D array, one row per validation sample and one column
        per label, columns in the same order as TAGS.
    validation_set : DataFrame holding one 0/1 column per label in TAGS.
    TAGS : sequence of label (column) names; its length sets the number of
        labels processed.
    average : averaging mode passed to f1_score. The default 'micro' keeps
        the original behaviour. NOTE: on a single binary column the micro
        average pools both classes, so the score equals plain accuracy;
        pass 'binary' to optimise the F1 of the positive class instead.

    Returns
    -------
    list with one threshold per label, in TAGS order.
    '''
    thresholds = np.arange(100) / 100.0
    best_thresholds = []
    # Iterate over the labels actually supplied rather than a fixed count of 25.
    for idx, tag in enumerate(TAGS):
        y_true = validation_set[tag]
        label_probs = y_pred_prob[:, idx]
        scores = [f1_score(y_true, label_probs > thresh, average=average) for thresh in thresholds]
        best_thresholds.append(thresholds[np.argmax(scores)])
    return best_thresholds

def get_predictions(pred_prob,best_thresholds,TAGS):
    '''
    Get predictions based on probabilities and class specific thresholds.

    Parameters
    ----------
    pred_prob : 2-D array, one row per sample and one column per label,
        columns in the same order as TAGS.
    best_thresholds : sequence with one threshold per label, in TAGS order.
    TAGS : sequence of label names; its length sets the number of labels.

    Returns
    -------
    Float array of shape (n_samples, len(TAGS)) holding 1.0 where the
    probability is strictly greater than the label's threshold, else 0.0.
    '''
    n_labels = len(TAGS)
    predictions = np.zeros((pred_prob.shape[0], n_labels))
    # Use the real label count; a fixed 25 breaks for any other number of tags.
    for idx in range(n_labels):
        predictions[:, idx] = pred_prob[:, idx] > best_thresholds[idx]
    return predictions

### Creating Files for fastText Training

In [22]:
TAGS = np.array(TAGS)


def write_fasttext_file(frame, path, tags):
    """Write `frame` to `path` in fastText's supervised-training text format.

    Each output line is the row's labels followed by its abstract:
    ``__label__<tag> __label__<tag> ... <abstract>``, where ``<tag>`` is the
    lower-cased tag name with whitespace replaced by underscores.

    Parameters
    ----------
    frame : DataFrame with an 'ABSTRACT' column and one 0/1 column per tag.
    path : destination file path; an existing file is overwritten.
    tags : array-like of tag (column) names.

    Rows with no tag set are skipped. Labels and text are taken from the
    same row, so a skipped row cannot shift the labels of later rows onto
    the wrong abstract.
    """
    tags = np.asarray(tags)
    has_tag = np.array(frame[tags]) == 1
    abstracts = list(frame['ABSTRACT'])
    # fastText expects UTF-8 input, so the encoding is fixed rather than platform-dependent.
    with open(path, "w", encoding="utf-8") as out_file:
        for abstract, row_mask in zip(abstracts, has_tag):
            row_tags = tags[row_mask]
            if len(row_tags) == 0:
                continue
            label = "".join(
                "__label" + "__" + "_".join(tag.lower().split()) + " "
                for tag in row_tags
            )
            out_file.write(label + abstract + "\n")


write_fasttext_file(train, "avhack_tags_train.txt", TAGS)
write_fasttext_file(val, "avhack_tags_val.txt", TAGS)

In [23]:
import fasttext
# Train a supervised fastText classifier on the training file written above.
model = fasttext.train_supervised(
    input="avhack_tags_train.txt",
    lr=0.1,
    epoch=200,
    wordNgrams=2,
    bucket=200000,
    dim=50,
    loss='ova',
)
# Evaluate on the validation file with every label returned (k=-1); the result is shown as the cell output.
model.test("avhack_tags_val.txt", k=-1)

(2801, 0.05356658336308461, 1.0)

In [24]:
# Request every label (k=-1) with no probability cut-off (threshold=0) for each validation abstract.
all_abstracts = val['ABSTRACT'].tolist()
predictions = model.predict(all_abstracts, k=-1, threshold=0)

In [25]:
def get_pred_prob(predictions, tags=None):
    """Turn fastText predictions into a (n_samples, n_tags) probability matrix.

    Parameters
    ----------
    predictions : pair ``(labels, probs)`` where ``labels[i]`` is the sequence
        of label strings predicted for sample i and ``probs[i]`` holds the
        matching probabilities, in the same order.
    tags : optional sequence of tag names that fixes the column order.
        Defaults to the module-level ``TAGS``.

    Returns
    -------
    Float array with one row per sample and one column per tag, in `tags`
    order. A tag whose label is absent from a sample's prediction gets
    probability 0.0. Empty input yields an array of shape (0, n_tags).
    """
    if tags is None:
        tags = TAGS
    # Same label spelling as used when the training file was written.
    fasttext_labels = ["__label" + "__" + "_".join(each.lower().split()) for each in tags]

    # Look each label up by name per sample instead of relying on sort order,
    # and collect rows in a list so the matrix is built once.
    rows = []
    for pred_record, pred_probs in zip(predictions[0], predictions[1]):
        prob_of = dict(zip(pred_record, pred_probs))
        rows.append([prob_of.get(label, 0.0) for label in fasttext_labels])

    y_pred_test = np.array(rows, dtype=float).reshape(len(rows), len(fasttext_labels))
    return y_pred_test

# Probability matrix for the validation predictions, columns in TAGS order.
y_pred_prob = get_pred_prob(predictions)

In [28]:
# Pick one threshold per tag on the validation split, then binarise the probabilities with them.
# NOTE: the score below is measured on the same split the thresholds were tuned on.
best_thresholds = get_cut_offthreshold(y_pred_prob, val, TAGS)
y_pred = get_predictions(y_pred_prob, best_thresholds, TAGS)

val_micro_f1 = f1_score(val[TAGS], y_pred, average='micro')
print("F1 Score on Validation Set", val_micro_f1)

F1 Score on Validation Set 0.6504358280733392


In [29]:
# Score the test abstracts, binarise with the validation-tuned thresholds,
# and write the per-tag predictions plus the test ids to a CSV file.
test_predictions = model.predict(test_data['ABSTRACT'].tolist(), k=-1, threshold=0)
y_pred_test_prob = get_pred_prob(test_predictions)
predictions = get_predictions(y_pred_test_prob, best_thresholds, TAGS)

result = pd.DataFrame(predictions, columns=TAGS)
result['id'] = test_data['id']
result.to_csv("fasttext_classifier.csv", index=False)