### Import data

In [1]:
import numpy as np
import pandas as pd
from scipy import sparse

# Read the training, tag-list and test CSVs (in that order) from the data/ folder.
train_data, tags, test_data = map(
    pd.read_csv,
    ("data/Train.csv", "data/Tags.csv", "data/Test.csv"),
)

In [2]:
# Print column names, dtypes and non-null counts of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14004 entries, 0 to 14003
Data columns (total 31 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 0   id                                            14004 non-null  int64 
 1   ABSTRACT                                      14004 non-null  object
 2   Computer Science                              14004 non-null  int64 
 3   Mathematics                                   14004 non-null  int64 
 4   Physics                                       14004 non-null  int64 
 5   Statistics                                    14004 non-null  int64 
 6   Analysis of PDEs                              14004 non-null  int64 
 7   Applications                                  14004 non-null  int64 
 8   Artificial Intelligence                       14004 non-null  int64 
 9   Astrophysics of Galaxies                      14004 non-null  int64 
 10

In [3]:
# Coarse topic indicator columns; later experiments append them as extra model inputs.
TOPIC_COLS = [
    'Computer Science',
    'Mathematics',
    'Physics',
    'Statistics',
]
# Names of the label columns to predict, read from the "Tags" column of the tags frame.
TAGS = tags['Tags'].tolist()

In [4]:
# Show the first two abstracts as a plain Python list.
train_data['ABSTRACT'].head(2).tolist()

['a ever-growing datasets inside observational astronomy have challenged scientists inside many aspects, including an efficient and interactive data exploration and visualization. many tools have been developed to confront this challenge. however, they usually focus on displaying a actual images or focus on visualizing patterns within catalogs inside the predefined way. inside this paper we introduce vizic, the python visualization library that builds a connection between images and catalogs through an interactive map of a sky region. vizic visualizes catalog data over the custom background canvas with the help of a shape, size and orientation of each object inside a catalog. a displayed objects inside a map are highly interactive and customizable comparing to those inside a images. these objects should be filtered by or colored by their properties, such as redshift and magnitude. they also should be sub-selected with the help of the lasso-like tool considering further analysis with th

### Preprocessing

In [5]:
# Lower-case every abstract in both frames (the columns are overwritten in place).
train_data['ABSTRACT'], test_data['ABSTRACT'] = (
    frame['ABSTRACT'].str.lower() for frame in (train_data, test_data)
)

In [6]:
import string
# Translation table that deletes every character in string.punctuation.
# Characters are removed, not replaced by spaces, so hyphenated words are joined.
punctuation_table = str.maketrans('', '', string.punctuation)
train_data['ABSTRACT'], test_data['ABSTRACT'] = (
    frame['ABSTRACT'].str.translate(punctuation_table)
    for frame in (train_data, test_data)
)

### Test Train Split

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows of train_data as a validation set; the fixed
# random_state makes the split repeatable between runs.
train, val = train_test_split(train_data,test_size=0.2,random_state=2)

### Common Functions

In [8]:
from sklearn.metrics import f1_score

# Get best threshold for each label
def get_cut_offthreshold(y_pred_prob,validation_set,TAGS,scorer=None):
    '''
    Pick, for every label, the probability cut-off that scores best on the validation set.

    Parameters
    ----------
    y_pred_prob : 2-D array whose column ``i`` holds the predicted
        probabilities for ``TAGS[i]``.
    validation_set : DataFrame (or mapping) such that ``validation_set[tag]``
        gives the true 0/1 labels for ``tag``.
    TAGS : sequence of label names; one threshold is produced per entry, so the
        function works for any number of labels (it was previously fixed at 25).
    scorer : optional callable ``scorer(y_true, y_pred) -> float``; higher is
        better. Defaults to ``f1_score(..., average='micro')``.
        NOTE(review): on a single binary column, micro averaging appears to
        count both classes and so behaves like accuracy; pass a binary-F1
        scorer if positive-class F1 is what should be maximised - confirm.

    Returns
    -------
    list with one threshold per label, taken from the grid 0.00, 0.01, ..., 0.99.
    Ties are resolved in favour of the lowest threshold.
    '''
    if scorer is None:
        # Resolved lazily so the default keeps using the notebook's f1_score import.
        def scorer(y_true, y_pred):
            return f1_score(y_true, y_pred, average='micro')

    thresholds = np.arange(100) / 100.0
    best_thresholds = []
    for idx, tag in enumerate(TAGS):
        y_true = validation_set[tag]
        label_prob = y_pred_prob[:, idx]
        scores = [scorer(y_true, label_prob > thresh) for thresh in thresholds]
        # np.argmax returns the first maximum, i.e. the smallest best threshold.
        best_thresholds.append(thresholds[np.argmax(scores)])
    return best_thresholds

def get_predictions(pred_prob,best_thresholds,TAGS):
    '''
    Turn per-label probabilities into 0/1 predictions using label-specific thresholds.

    Parameters
    ----------
    pred_prob : 2-D array of shape (n_samples, >= len(TAGS)); column ``i``
        holds the probabilities for ``TAGS[i]``.
    best_thresholds : sequence with at least ``len(TAGS)`` cut-offs, in the
        same order as ``TAGS``.
    TAGS : sequence of label names; only its length is used. The number of
        labels is no longer fixed at 25.

    Returns
    -------
    float array of shape (n_samples, len(TAGS)) containing 1.0 where the
    probability is strictly greater than the label's threshold, else 0.0.

    Raises
    ------
    ValueError if there are fewer probability columns or thresholds than labels.
    '''
    n_labels = len(TAGS)
    cutoffs = np.asarray(best_thresholds, dtype=float)
    if pred_prob.shape[1] < n_labels or cutoffs.shape[0] < n_labels:
        raise ValueError(
            "pred_prob and best_thresholds must provide one entry per label in TAGS"
        )
    # Broadcasting compares every column with its own cut-off; the cast keeps
    # the 0.0/1.0 float output the original zero-initialised array produced.
    return (pred_prob[:, :n_labels] > cutoffs[:n_labels]).astype(float)

### Experiment 1: Using Count Vectorizer + OneVsRestClassifier + Logistic Regression

In [9]:
# Using Count Vectorizer + OneVsRestClassifier + Logistic Regression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


# Term-count features with the vocabulary capped at 10,000 entries; the
# vocabulary is learnt from the training split only.
vec = CountVectorizer(max_features=10000)
trn_abs = vec.fit_transform(train['ABSTRACT'])
val_abs, test_abs = (
    vec.transform(frame['ABSTRACT']) for frame in (val, test_data)
)

# One logistic-regression model per tag column.
clf = OneVsRestClassifier(LogisticRegression(C=10, n_jobs=1, max_iter=1000))
clf.fit(trn_abs, train[TAGS])

OneVsRestClassifier(estimator=LogisticRegression(C=10, max_iter=1000, n_jobs=1))

In [10]:
# Validation probabilities -> per-tag cut-offs -> hard 0/1 predictions.
y_pred_prob = clf.predict_proba(val_abs)
best_thresholds = get_cut_offthreshold(y_pred_prob,val,TAGS)
y_pred = get_predictions(y_pred_prob,best_thresholds,TAGS)

# NOTE: the thresholds are tuned and scored on the same validation rows, so
# this figure is an optimistic estimate.
print("F1 Score on Validation Set", f1_score(val[TAGS], y_pred, average='micro'))

F1 Score on Validation Set 0.6238731737643767


In [11]:
# Predicting on Test

# Score the test set and binarise with the thresholds tuned on validation.
y_pred_test_prob = clf.predict_proba(test_abs)
predictions = get_predictions(y_pred_test_prob, best_thresholds, TAGS)

# One column per tag followed by the test ids; the pandas index is not written.
result = pd.DataFrame(predictions, columns=TAGS).assign(id=test_data['id'])
result.to_csv("cv_logistic.csv", index=False)

### Experiment 2: Using TFIDF Vectorizer + OneVsRestClassifier + Logistic Regression

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


# TF-IDF features with the vocabulary capped at 10,000 entries, learnt from
# the training split only.
vec = TfidfVectorizer(max_features=10000)
trn_abs = vec.fit_transform(train['ABSTRACT'])
val_abs, test_abs = (
    vec.transform(frame['ABSTRACT']) for frame in (val, test_data)
)

# Label column names, re-read from the tags frame.
TAGS = list(tags['Tags'])

# One logistic-regression model per tag column.
clf = OneVsRestClassifier(LogisticRegression(C=10, n_jobs=1, max_iter=1000))
clf.fit(trn_abs, train[TAGS])


OneVsRestClassifier(estimator=LogisticRegression(C=10, max_iter=1000, n_jobs=1))

In [13]:
# Predict validation probabilities, tune one cut-off per tag, then binarise.
y_pred_prob = clf.predict_proba(val_abs)
best_thresholds = get_cut_offthreshold(y_pred_prob,val,TAGS)
y_pred = get_predictions(y_pred_prob,best_thresholds,TAGS)

# NOTE: cut-offs are selected on the rows they are evaluated on here, so the
# reported score is optimistic.
print("F1 Score on Validation Set", f1_score(val[TAGS], y_pred, average='micro'))

F1 Score on Validation Set 0.6869462492616657


In [14]:
# Predicting on Test
# Test-set probabilities, binarised with the validation-tuned thresholds.
y_pred_test_prob = clf.predict_proba(test_abs)
predictions = get_predictions(y_pred_test_prob, best_thresholds, TAGS)

# One column per tag followed by the test ids; the pandas index is not written.
result = pd.DataFrame(predictions, columns=TAGS).assign(id=test_data['id'])
result.to_csv("tfidf_logistic.csv", index=False)

### Experiment 3: Using TFIDF Vectorizer With Topic Columns + OneVsRestClassifier + Logistic Regression

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


# TF-IDF features with the vocabulary capped at 10,000 entries, learnt from
# the training split only.
vec = TfidfVectorizer(max_features=10000)
trn_abs = vec.fit_transform(train['ABSTRACT'])
val_abs, test_abs = (
    vec.transform(frame['ABSTRACT']) for frame in (val, test_data)
)

# Label column names, re-read from the tags frame.
TAGS = list(tags['Tags'])


In [16]:
# Number of term columns produced by the vectoriser fitted for this experiment.
n_terms = len(vec.vocabulary_)

def append_topic_cols(term_matrix, frame):
    '''
    Return the first `n_terms` columns of `term_matrix` with frame[TOPIC_COLS]
    appended on the right, as a CSR sparse matrix.

    Stacking sparsely avoids materialising the dense documents x terms matrix
    that `.toarray()` would create. Slicing to `n_terms` first means re-running
    this cell replaces previously appended topic columns instead of adding a
    second copy.
    '''
    topic_block = sparse.csr_matrix(frame[TOPIC_COLS].to_numpy(dtype=float))
    term_block = sparse.csr_matrix(term_matrix)[:, :n_terms]
    return sparse.hstack([term_block, topic_block], format='csr')

trn_abs = append_topic_cols(trn_abs, train)
val_abs = append_topic_cols(val_abs, val)
test_abs = append_topic_cols(test_abs, test_data)

In [17]:
# One logistic-regression model per tag, trained on the TF-IDF features with
# the topic indicator columns appended.
clf = OneVsRestClassifier(LogisticRegression(C=10,n_jobs=1,max_iter=1000))
clf.fit(trn_abs,train[TAGS])


OneVsRestClassifier(estimator=LogisticRegression(C=10, max_iter=1000, n_jobs=1))

In [18]:
# Validation probabilities, per-tag threshold search, then hard predictions.
y_pred_prob = clf.predict_proba(val_abs)
best_thresholds = get_cut_offthreshold(y_pred_prob,val,TAGS)
y_pred = get_predictions(y_pred_prob,best_thresholds,TAGS)

# NOTE: thresholds are fitted on the same validation rows that are scored
# here, so the number is optimistic.
print("F1 Score on Validation Set", f1_score(val[TAGS], y_pred, average='micro'))

F1 Score on Validation Set 0.7622750179985601


In [19]:
# Predicting on Test
# Test-set probabilities, binarised with the validation-tuned thresholds.
y_pred_test_prob = clf.predict_proba(test_abs)
predictions = get_predictions(y_pred_test_prob, best_thresholds, TAGS)

# One column per tag followed by the test ids; the pandas index is not written.
result = pd.DataFrame(predictions, columns=TAGS).assign(id=test_data['id'])
result.to_csv("tfidf_logistic_topic_cols.csv", index=False)

### Experiment 4: NB-SVM using TFIDF Features

In [20]:
# Probabilities
def get_prob(x_feats,y_i, y):
    '''
    Add-one smoothed per-feature total for the rows whose label equals `y_i`.

    Sums each feature column of `x_feats` over the rows where ``y == y_i``,
    adds 1, and divides by (number of such rows + 1).
    '''
    class_rows = (y == y_i)
    feature_totals = x_feats[class_rows].sum(0)
    smoothed_count = class_rows.sum() + 1
    return (feature_totals + 1) / smoothed_count

# NB Log count ratios
def get_nb_feats(x_feats,y):
    '''
    Scale every feature by its log ratio between the positive and negative class.

    Returns the tuple (scaled features, log ratio); the ratio is returned so
    that the same scaling can be applied to other feature matrices later.
    '''
    positive = get_prob(x_feats, 1, y)
    negative = get_prob(x_feats, 0, y)
    log_rat = np.log(positive / negative)
    return x_feats.multiply(log_rat), log_rat

def get_model(x_feats,y):
    '''
    Fit a logistic regression for one label on log-ratio-scaled features.

    `y` must expose `.values` (e.g. a pandas Series). Returns (fitted model,
    log ratio); the ratio is needed to scale features at prediction time.
    '''
    labels = y.values
    scaled_feats, log_rat = get_nb_feats(x_feats, labels)
    model = LogisticRegression(C=10, max_iter=1000).fit(scaled_feats, labels)
    return model, log_rat

def get_model_topic(x_feats,topic_cols,y):
    '''
    Fit a logistic regression for one label on log-ratio-scaled features plus topic columns.

    Parameters
    ----------
    x_feats : sparse document-term matrix for the training rows.
    topic_cols : 2-D array-like (e.g. a DataFrame slice) with one row per
        training row; appended unscaled to the right of the term features.
    y : label vector exposing `.values` (e.g. a pandas Series).

    Returns
    -------
    (fitted model, log ratio). The ratio is needed to scale the term features
    in the same way before calling the model's predict methods.

    The design matrix is stacked sparsely instead of via `.toarray()`, so the
    dense documents x terms matrix is not allocated on every per-label call.
    The column layout (terms first, topic columns last) is unchanged.
    '''
    labels = y.values
    x_feats_nb, log_rat = get_nb_feats(x_feats, labels)
    topic_block = sparse.csr_matrix(np.asarray(topic_cols, dtype=float))
    design = sparse.hstack(
        [sparse.csr_matrix(x_feats_nb), topic_block], format='csr'
    )
    model = LogisticRegression(C=10, max_iter=1000)
    model.fit(design, labels)
    return model, log_rat

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Uni- and bi-gram TF-IDF with sub-linear term frequency. The boolean options
# are passed as real booleans: recent scikit-learn releases validate
# constructor parameters and reject ints such as 1 for them.
vec = TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, use_idf=True,
                      smooth_idf=True, sublinear_tf=True, max_features=10000)

# Vocabulary and IDF weights are learnt from the training split only.
trn_term_doc = vec.fit_transform(train['ABSTRACT'])
val_term_doc = vec.transform(val['ABSTRACT'])
test_term_doc = vec.transform(test_data['ABSTRACT'])


# Column i receives the validation probabilities for TAGS[i].
y_pred_prob = np.zeros((val_term_doc.shape[0], len(TAGS)))

# Per-tag models and their log ratios, kept for the test-set cell below.
models = []
log_rats = []

for i, tag in enumerate(TAGS):
    print('Fitting ..', tag)
    model,log_rat = get_model(trn_term_doc,train[tag])
    models.append(model)
    log_rats.append(log_rat)
    # Validation features must be scaled with the same log ratio as in training.
    y_pred_prob[:,i] = model.predict_proba(val_term_doc.multiply(log_rat))[:,1]


# NOTE: thresholds are tuned and scored on the same validation rows, so the
# printed score is optimistic.
best_thresholds = get_cut_offthreshold(y_pred_prob,val,TAGS)
y_pred = get_predictions(y_pred_prob,best_thresholds,TAGS)

print("F1 Score on Validation Set", f1_score(val[TAGS], y_pred, average='micro'))



Fitting .. Analysis of PDEs
Fitting .. Applications
Fitting .. Artificial Intelligence
Fitting .. Astrophysics of Galaxies
Fitting .. Computation and Language
Fitting .. Computer Vision and Pattern Recognition
Fitting .. Cosmology and Nongalactic Astrophysics
Fitting .. Data Structures and Algorithms
Fitting .. Differential Geometry
Fitting .. Earth and Planetary Astrophysics
Fitting .. Fluid Dynamics
Fitting .. Information Theory
Fitting .. Instrumentation and Methods for Astrophysics
Fitting .. Machine Learning
Fitting .. Materials Science
Fitting .. Methodology
Fitting .. Number Theory
Fitting .. Optimization and Control
Fitting .. Representation Theory
Fitting .. Robotics
Fitting .. Social and Information Networks
Fitting .. Statistics Theory
Fitting .. Strongly Correlated Electrons
Fitting .. Superconductivity
Fitting .. Systems and Control
F1 Score on Validation Set 0.679727180380206


In [22]:
# Test-set probability of the positive class for every tag: each model sees
# the test matrix scaled by the log ratio it was trained with.
y_pred_prob = np.column_stack([
    tag_model.predict_proba(test_term_doc.multiply(tag_log_rat))[:, 1]
    for tag_model, tag_log_rat in zip(models, log_rats)
])

predictions = get_predictions(y_pred_prob, best_thresholds, TAGS)

# One column per tag followed by the test ids; the pandas index is not written.
result = pd.DataFrame(predictions, columns=TAGS).assign(id=test_data['id'])
result.to_csv("nbsvm.csv", index=False)


### Experiment 5: NB-SVM using TFIDF Features with Topic Columns

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


def get_val_feats(x_feats,log_rat,topic_cols):
    '''
    Build prediction-time features: term features scaled by `log_rat`, then topic columns.

    Returns a dense array whose leading columns are ``x_feats`` multiplied
    element-wise by `log_rat` and whose trailing columns are `topic_cols`.
    '''
    scaled_dense = x_feats.multiply(log_rat).toarray()
    return np.hstack((scaled_dense, topic_cols))

# Uni- and bi-gram TF-IDF with sub-linear term frequency. The boolean options
# are passed as real booleans: recent scikit-learn releases validate
# constructor parameters and reject ints such as 1 for them.
vec = TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, use_idf=True,
                      smooth_idf=True, sublinear_tf=True, max_features=10000)

# Vocabulary and IDF weights are learnt from the training split only.
trn_term_doc = vec.fit_transform(train['ABSTRACT'])
val_term_doc = vec.transform(val['ABSTRACT'])
test_term_doc = vec.transform(test_data['ABSTRACT'])


# Column i receives the validation probabilities for TAGS[i].
y_pred_prob = np.zeros((val_term_doc.shape[0], len(TAGS)))

# Per-tag models and their log ratios, kept for the test-set cell below.
models = []
log_rats = []

for i, tag in enumerate(TAGS):
    print('Fitting ..', tag)
    model,log_rat = get_model_topic(trn_term_doc,train[TOPIC_COLS],train[tag])
    models.append(model)
    log_rats.append(log_rat)
    # Validation features: same log-ratio scaling as training, plus the topic columns.
    y_pred_prob[:,i] = model.predict_proba(get_val_feats(val_term_doc,log_rat,val[TOPIC_COLS]))[:,1]


# NOTE: thresholds are tuned and scored on the same validation rows, so the
# printed score is optimistic.
best_thresholds = get_cut_offthreshold(y_pred_prob,val,TAGS)
y_pred = get_predictions(y_pred_prob,best_thresholds,TAGS)

print("F1 Score on Validation Set", f1_score(val[TAGS], y_pred, average='micro'))



Fitting .. Analysis of PDEs
Fitting .. Applications
Fitting .. Artificial Intelligence
Fitting .. Astrophysics of Galaxies
Fitting .. Computation and Language
Fitting .. Computer Vision and Pattern Recognition
Fitting .. Cosmology and Nongalactic Astrophysics
Fitting .. Data Structures and Algorithms
Fitting .. Differential Geometry
Fitting .. Earth and Planetary Astrophysics
Fitting .. Fluid Dynamics
Fitting .. Information Theory
Fitting .. Instrumentation and Methods for Astrophysics
Fitting .. Machine Learning
Fitting .. Materials Science
Fitting .. Methodology
Fitting .. Number Theory
Fitting .. Optimization and Control
Fitting .. Representation Theory
Fitting .. Robotics
Fitting .. Social and Information Networks
Fitting .. Statistics Theory
Fitting .. Strongly Correlated Electrons
Fitting .. Superconductivity
Fitting .. Systems and Control
F1 Score on Validation Set 0.7521122726621797


In [24]:
# Test-set probability of the positive class for every tag: each model sees
# the test terms scaled by its own log ratio, with the topic columns appended.
y_pred_prob = np.column_stack([
    tag_model.predict_proba(
        get_val_feats(test_term_doc, tag_log_rat, test_data[TOPIC_COLS])
    )[:, 1]
    for tag_model, tag_log_rat in zip(models, log_rats)
])

predictions = get_predictions(y_pred_prob, best_thresholds, TAGS)

# One column per tag followed by the test ids; the pandas index is not written.
result = pd.DataFrame(predictions, columns=TAGS).assign(id=test_data['id'])
result.to_csv("nbsvm_topic_cols.csv", index=False)