In [23]:
import scipy
import sklearn
import json
import pandas as pd
import numpy as np
from collections import Counter
from numpy import random
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter

# Download files, set up folder, put files into folder

In [24]:
# Input CSV files, resolved relative to the notebook's working directory.
# The 2013 file is loaded below as the training data (it has a CITED label column);
# the 2020 file is loaded as the test data (no CITED column in its header).
training_data_path = './reference_metadata_2013.csv'
test_data_path = './reference_metadata_2020.csv'

In [25]:
# Data type for each column (passed as `dtype` to pandas read_csv).
# Identifier and text columns are read as strings; CITED is the integer label.
# The key must match the CSV header exactly: the authors column is named
# 'AUTHORS' (plural), so a key of 'AUTHOR' would never be applied.
dtype_dict = {
    'REFERENCE_ID': str,
    'TITLE': str,
    'AUTHORS': str,
    'YEAR': str,
    'ABSTRACT': str,
    'CITED': int,
}

In [26]:
# Load the 2013 reference metadata. keep_default_na=False stops pandas from
# turning blank fields into NaN, so missing text stays as an empty string.
dataframe = pd.read_csv(
    training_data_path,
    dtype=dtype_dict,
    keep_default_na=False,
)
dataframe

Unnamed: 0,REFERENCE_ID,TITLE,AUTHORS,YEAR,ABSTRACT,CITED
0,38553,On some secondary physiological effects produc...,"Schonbein, CF",1851,,0
1,43125,On the influence of carbonic acid in the air u...,"Arrhenius, S",1896,,0
2,15073,On the influence of ozone inhalation on lung f...,"Bohr, C; Maar, V",1904,,0
3,15027,The physiological influence of ozone,"Hill, L; Flack, M",1912,,0
4,15040,"Ozone: its bactericidal, physiologic and deodo...","Jordan, EO; Carlson, AJ",1913,,0
...,...,...,...,...,...,...
15767,399340,A proposal to use ozone data to validate a sys...,"Devlin, R",,,0
15768,79865,MTBE effect on vehicle emissions at cold tempe...,"Cook, R",,,0
15769,39888,Car population getting older: New registration...,,,,0
15770,80661,An evaluation of the cytotoxicity and mutageni...,"Garrett, NE; Chescheir GM III; Custer, NA; She...",,,0


In [27]:
# Hold out part of the labelled 2013 data for validation.
train_ratio = 0.7  # fraction sampled for training; the remaining 30% is validation
random_seed = 100

# Sample the training rows reproducibly, then use every row that was not
# sampled (matched by index label) as the validation set.
train_dataframe = dataframe.sample(frac=train_ratio, random_state=random_seed)
valid_dataframe = dataframe.drop(index=train_dataframe.index)
print(f'training set size: {len(train_dataframe)}')
print(f'validation set size: {len(valid_dataframe)}')

training set size: 11040
validation set size: 4732


In [29]:
# Load the 2020 reference metadata with the same parsing options as the
# training file (blank fields are kept as empty strings instead of NaN).
test_dataframe = pd.read_csv(
    test_data_path,
    dtype=dtype_dict,
    keep_default_na=False,
)
test_dataframe

Unnamed: 0,REFERENCE_ID,TITLE,AUTHORS,YEAR,ABSTRACT
0,43125,On the influence of carbonic acid in the air u...,"Arrhenius, S",1896,
1,15018,Sulfur dioxide and fume problems and their sol...,"O'Gara, PJ",1922,
2,15101,Effects of air pollution in animals,"Stokinger, HE",1962,
3,15024,Ozone injury to the foliage of Pinus ponderosa,"Miller, PR; Parmeter, JR, Jr; Taylor, OC; Card...",1963,
4,15567,The role of ozone in radiation avoidance in th...,"Peterson, DC; Andrews, HL",1963,
...,...,...,...,...,...
171371,1853643,Comparing and Combining CE-ESI-MS and nano-LC-...,"Sarg, B; Faserl, K; Kremser, L; Halfinger, B; ...",,We present the first comprehensive capillary-e...
171372,2230284,Inhibition of NOX/VPO1 pathway and inflammator...,"Liu, B; Luo, XJ; Yang, ZB; Zhang, JJ; Li, TB; ...",,Recent studies show that resveratrol exerts be...
171373,2367014,Role of calmodulin in thermotolerance,"Jia, L; Chu, H; Wu, D; Feng, M; Zhao, L",,Nitric oxide (NO) and hydrogen peroxide (H 2O ...
171374,2369490,Predominance of Biotic over Abiotic Formation ...,"Ruecker, A; Weigold, P; Behrens, S; Jochmann, ...",,Volatile halogenated organic compounds (VOX) c...


# Data exploration for training & test data ... YOUR TURN!

In [None]:
# What does TF-IDF measure?
# TF = term frequency: the number of times a word appears in a document.
# IDF = inverse document frequency: how rare a word is across the entire corpus
# (the fewer documents contain the word, the higher its IDF).

# The idea behind TF-IDF is that a word is informative for a document when it
# occurs often in that document (high TF) but appears in only a few documents
# of the collection as a whole (high IDF).

# Words that occur in almost every document (e.g. "the", "of") get a low weight even when they are frequent within a document; words concentrated in a few documents get a high weight.


In [34]:
def print_topk_tfidf_words(df, column_name, k):
    """Print the k most frequent words found in one text column of `df`.

    Despite the name, no TF-IDF weighting is applied: each cell is stripped,
    lower-cased and split on whitespace, and raw occurrences are counted
    across all rows.

    params:
        df: dataframe holding the text column
        column_name: name of a column whose values are strings
        k: number of (word, count) pairs to print
    return:
        None (one "word count" line is printed per word, most frequent first)
    """
    counter = Counter()
    # Count over the dataframe that was passed in, not the notebook-level
    # `dataframe`, so the function also works for the test data.
    for text in df[column_name]:
        counter.update(text.strip().lower().split())

    # sorted() is stable, so words with equal counts keep first-seen order.
    ranked = sorted(counter.items(), key=lambda item: item[1], reverse=True)
    for word, count in ranked[:k]:
        print (word, count)

In [35]:
# Show the 100 most frequent words in the titles of the 2013 data.
print_topk_tfidf_words(dataframe, 'TITLE', 100)
# Other column / dataset combinations to explore (uncomment to run):
# print_topk_tfidf_words(dataframe, 'ABSTRACT', 100)
# print_topk_tfidf_words(test_dataframe, 'TITLE', 100)
# print_topk_tfidf_words(test_dataframe, 'ABSTRACT', 100)  # NOTE(review): this line duplicated the TITLE call above; presumably ABSTRACT was intended



# Evaluation metric: work saved over sampling @ 95% recall

In [9]:
def WSS_95(y_true, y_pred):
    """Work saved over sampling at 95% recall (WSS@95%).

    This is a metric used for evaluating high-recall ranking results.
    Given a ranked list of items that are labeled as relevant or non-relevant,
    let X be the percentage of items needed to be sifted to find at least 95%
    of all relevant items, starting from the top of the list.

    If we randomly order the list of items, then X = 95%.
    If a ranked list outperforms random ordering, then we should see X < 95%.
    WSS@95% computes 95% - X, that is, how much work will be saved over
    random ordering (or random sampling) if the said ranked list is used.

    params:
        y_true: ground truth label array (1s and 0s)
        y_pred: predicted score array
        y_true and y_pred are assumed to have been aligned on REFERENCE_ID
        (element i of one belongs to element i of the other)
    return:
        WSS@95% as a float. If 95% recall is never reached (for example when
        there are no relevant items), the whole list counts as sifted and the
        result is 0.95 - 1.
    raises:
        ValueError: if the inputs are empty or differ in length
    side effects:
        prints the number of relevant items and the rank at which 95% recall
        is first reached
    """
    # Work on plain positional arrays. Pairing pandas objects directly would
    # align them on their index labels rather than on position, which silently
    # scrambles the pairs when a Series carries a non-default index.
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_pred).ravel()
    if len(labels) != len(scores):
        raise ValueError('y_true and y_pred must have the same length, got {} and {}'.format(len(labels), len(scores)))
    if len(labels) == 0:
        raise ValueError('WSS@95% is undefined for an empty list of items')

    # sort by predicted score, highest first
    res = pd.DataFrame({'y_pred': scores, 'y_true': labels})
    ranked = res.sort_values('y_pred', ascending=False)

    # total number of relevant items; the tiny constant keeps the division
    # below defined when there are no relevant items at all
    total_num_relevant = float(ranked['y_true'].sum()) + 1e-100

    # recall after each rank, from the top down
    recall_at_rank = ranked['y_true'].to_numpy(dtype=float).cumsum() / total_num_relevant
    reached = recall_at_rank >= 0.95
    if reached.any():
        # argmax returns the first True; ranks are 1-based
        curr_position = int(np.argmax(reached)) + 1
    else:
        curr_position = len(ranked)

    print ('total_num_relevant', total_num_relevant)
    print ('curr_position', curr_position)

    return 0.95 - curr_position / len(ranked)

# Try the trivial baseline: assign random scores to references in the validation set (no learning is needed)

In [10]:
# Trivial baseline: score every validation reference with an independent draw
# from the uniform distribution on [0, 1) and rank by that score.
# A random ranking is expected to give a WSS@95% of about 0, so this is the
# reference point that any learned model should beat on the validation set.
# The generator is seeded with random_seed so the baseline number is reproducible.
# Note: we are using "work saved over sampling at 95% recall" as the performance metric
random_pred = np.random.default_rng(random_seed).random(len(valid_dataframe))
wss95 = WSS_95(valid_dataframe['CITED'].to_numpy(), random_pred)
print ('WSS@95% of random scoring on validation set:', wss95)

total_num_relevant 598.0
curr_position 4488
WSS@95% of random scoring on validation set: 0.001563820794589943


In [11]:
def write_test_prediction(df, pred, filepath):
    """Write ranking scores to a two-column CSV file (REFERENCE_ID, Score).

    params:
        df: dataframe, where each row is a test example, with column 'REFERENCE_ID' as data id
        pred: a list or 1-d array of scores, one per row of df, in row order
        filepath: the output file path
    return:
        None (a summary line is printed after the file is written)
    raises:
        ValueError: if pred does not have exactly one score per row of df
    """
    if len(pred) != len(df):
        raise ValueError('expected {} scores (one per row of df), got {}'.format(len(df), len(pred)))

    with open(filepath, 'w') as outfile:
        outfile.write('{},{}\n'.format('REFERENCE_ID', 'Score'))
        # Pair ids and scores by position. Looking scores up by the dataframe's
        # index label only works when the index happens to be 0..n-1, and fails
        # for a filtered or sampled dataframe.
        for reference_id, score in zip(df['REFERENCE_ID'], pred):
            outfile.write('{},{}\n'.format(reference_id, score))
    print (len(df), 'predictions are written to', filepath)

In [12]:
# Random-score submission for the test data: one uniform draw in [0, 1) per
# test reference. The generator is seeded with random_seed so the written file
# is the same on every run.
random_pred_test = np.random.default_rng(random_seed).random(len(test_dataframe))
write_test_prediction(test_dataframe, random_pred_test, './random_score.csv')

171376 predictions are written to ./random_score.csv


# Build feature extractor

## Use n-grams (n=1,2) from the 'TITLE' and 'ABSTRACT' fields of training data as features

In [13]:
# Fit the TF-IDF vocabulary and IDF weights on the training split only.
# Each document is the title and the abstract joined by a single space.
# Unigrams and bigrams are extracted; terms that occur in more than 95% of the
# documents or in fewer than 2 documents are discarded.
train_text = train_dataframe['TITLE'] + " " + train_dataframe['ABSTRACT']
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.95, min_df=2)
vectorizer.fit(train_text)

TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(1, 2))

# Extract feature vectors for training, validation, and test data 

In [14]:
def _title_plus_abstract(frame):
    """Return one text per row: the title and the abstract joined by a space."""
    return frame['TITLE'] + " " + frame['ABSTRACT']


# Project every split onto the vocabulary learned by the fitted vectorizer.
train_X = vectorizer.transform(_title_plus_abstract(train_dataframe))
valid_X = vectorizer.transform(_title_plus_abstract(valid_dataframe))
test_X = vectorizer.transform(_title_plus_abstract(test_dataframe))

# All three matrices share the same number of columns (one per feature).
for feature_matrix in (train_X, valid_X, test_X):
    print (feature_matrix.shape)

(11040, 155588)
(4732, 155588)
(171376, 155588)


# Train model on training set

In [15]:
# The ranking task is cast as binary classification: a classifier's confidence
# that a reference belongs to the CITED class serves as its ranking score.
# The baseline classifier is a logistic regression model.
train_Y = train_dataframe['CITED']
logreg_params = {'C': 1, 'solver': 'liblinear'}
model = LogisticRegression(**logreg_params)
model.fit(train_X, train_Y)

LogisticRegression(C=1, solver='liblinear')

# Evaluate model on training set

In [16]:
# WSS@95% needs a score per reference, so ask the model for predicted
# probabilities (.predict_proba) rather than hard class labels (.predict).
train_Y_hat = model.predict_proba(train_X)
train_Y = train_dataframe['CITED'].to_numpy()

# According to the documentation:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.predict_proba
# the predicted probability for label '1' (CITED) is the second column
# (column index = 1) of the array returned by predict_proba.
train_cited_proba = train_Y_hat[:, 1]
wss95 = WSS_95(train_Y, train_cited_proba)
print(f'Logistic regression, WSS@95% on training set: {wss95}')

total_num_relevant 1465.0
curr_position 3709
Logistic regression, WSS@95% on training set: 0.6140398550724637


# Evaluate model on validation set

In [17]:
# Score the held-out validation split with the model trained on the training
# split; column index 1 of predict_proba is the probability of label 1 (CITED).
valid_Y_hat = model.predict_proba(valid_X)
valid_Y = valid_dataframe['CITED'].to_numpy()
valid_cited_proba = valid_Y_hat[:, 1]
wss95 = WSS_95(valid_Y, valid_cited_proba)
print(f'Logistic regression, WSS@95% on validation set: {wss95}')

total_num_relevant 598.0
curr_position 3224
Logistic regression, WSS@95% on validation set: 0.2686813186813186


In [18]:
# Here, by using logistic regression, we see a higher WSS@95%
# on the validation set (~0.27) than using random scores (close to 0). It is 
# a sanity check confirming that the logistic regression model can learn
# some useful ranking signals (performing better than random).

# Note that in this task, the performance on test data can be lower than 
# that on the validation set, because the test data and the validation set do NOT
# come from the same underlying distribution.
# The validation set is a random subsample of candidate reference pool in 2013.
# The test data is the whole candidate reference pool in 2020, which has a
# different data distribution from 2013 as a result of topic shift in ozone research. 

# After experimentation on the validation set: retrain the final model on all training data, and predict scores for test data

In [19]:
# Refit the same model object on every labelled 2013 row (training and
# validation splits together), reusing the already-fitted vectorizer, then
# score the 2020 test data and write the scores out.
all_train_text = dataframe['TITLE'] + ' ' + dataframe['ABSTRACT']
all_train_X = vectorizer.transform(all_train_text)
all_train_Y = dataframe['CITED']
model.fit(all_train_X, all_train_Y)

# Column index 1 of predict_proba is the probability of label 1 (CITED).
test_Y_hat = model.predict_proba(test_X)
test_cited_proba = test_Y_hat[:, 1]
write_test_prediction(test_dataframe, test_cited_proba, './logistic_regression-tfidf-trimmed-bigram.csv')

171376 predictions are written to ./logistic_regression-tfidf-trimmed-bigram.csv


# Investigate what the model has learned and where it failed (A.K.A. error analysis) ... YOUR TURN!