In [1]:
%matplotlib notebook

import pickle
import time

import matplotlib
import matplotlib.pyplot as plt
import memory_profiler
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import sklearn
import sklearn.mixture
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
%load_ext memory_profiler

We tried different representations and compared their kNN accuracy scores.

Representations:
- Word counts
- Log-transformed counts
- Term frequencies (TF)
- scRNA-seq approach
- Schmidt (2018)
- TF-IDF without log-scaling
- TF-IDF with log-scaling

## Import the data

In [2]:
# Abstract table; its 'AbstractText' column is the corpus used below.
# NOTE(review): read_pickle and allow_pickle=True unpickle the file contents,
# which can execute arbitrary code -- only load caches from a trusted source.
clean_df = pd.read_pickle("variables/clean_df")

# Colour label per document; 'lightgrey' entries are excluded from kNN scoring.
automatic_colors = np.load("variables/automatic_colors.npy", allow_pickle=True)

# Representations

In [11]:
# Plain Python list of abstract texts, as expected by the sklearn vectorizers.
corpus = clean_df['AbstractText'].to_list()

### Word Counts (WC)

In [22]:
%%time
%%memit

# Bag-of-words counts: fit the vocabulary on the corpus and vectorise it in one pass.
vectorizer = CountVectorizer()
word_counts = vectorizer.fit_transform(raw_documents=corpus)

peak memory: 151375.08 MiB, increment: 84294.96 MiB
CPU times: user 29min 24s, sys: 1min 38s, total: 31min 2s
Wall time: 31min 5s


In [24]:
# Cache the count matrix on disk (save_npz appends the ".npz" suffix).
sparse.save_npz("variables/word_counts", word_counts)

In [3]:
# Restore the cached count matrix instead of re-running CountVectorizer.
word_counts = sparse.load_npz("variables/word_counts.npz")

In [22]:
# Load from the project-relative cache written by the save cell above rather
# than from a machine-specific absolute path, so the notebook runs elsewhere.
# NOTE(review): the previous path pointed at a separate "new_variables"
# directory -- confirm that it held the same matrix as "variables/".
word_counts = sp.sparse.load_npz("variables/word_counts.npz")

### Log-transformed counts
$\log(1 + \mathrm{WC})$

In [25]:
# Element-wise log(1 + x) on the stored counts; implicit zeros stay zero,
# so the result remains sparse.
log_1_WC = word_counts.log1p()

In [26]:
%%time
# Cache the log-transformed counts (save_npz appends the ".npz" suffix).
sparse.save_npz("variables/log_1_WC", log_1_WC)

CPU times: user 11min, sys: 19.1 s, total: 11min 19s
Wall time: 11min 22s


In [3]:
# Restore the cached log-transformed counts.
log_1_WC = sparse.load_npz("variables/log_1_WC.npz")

### Term frequencies (TF)

In [30]:
# L1-normalise every row so that the counts of each abstract sum to 1
# (all-zero rows stay zero). `normalize` is imported with the other
# dependencies at the top of the notebook.
term_frequencies = normalize(word_counts, norm='l1', axis=1)

In [31]:
%%time
# Cache the term-frequency matrix (save_npz appends the ".npz" suffix).
sparse.save_npz("variables/term_frequencies", term_frequencies)

CPU times: user 9min 52s, sys: 23 s, total: 10min 15s
Wall time: 10min 17s


In [19]:
# Restore the cached term-frequency matrix.
term_frequencies = sparse.load_npz("variables/term_frequencies.npz")

### scRNA-seq approach
$\log(1 + \mathrm{TF} \cdot 100)$, where 100 approximates the average document length (`doc_len`).

In [20]:
%%time
# Consecutive differences of the row pointer = number of stored entries per row.
# NOTE(review): assuming term_frequencies is CSR without explicit zeros, this is
# the number of *distinct* terms per abstract, not the total token count.
row_ptr = term_frequencies.indptr
doc_len = row_ptr[1:] - row_ptr[:-1]

CPU times: user 17 ms, sys: 43.1 ms, total: 60.1 ms
Wall time: 57.8 ms


In [43]:
# Mean of the per-abstract lengths computed above.
print('Average number of words per abstract: ', doc_len.mean())

Average number of words per abstract:  105.91494290058827


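We round the average number of words per abstract to 100 and use that value as the scaling factor below. Note that `doc_len` is the number of non-zero entries in each row of the term-frequency matrix, i.e. the number of distinct terms in each abstract.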

In [19]:
%%time
# scRNA-seq-style normalisation: rescale the term frequencies by the
# approximate document length (100), then apply log(1 + x) to the stored values.
TF_doc_len_term = term_frequencies.multiply(100)
log_1_TF_doc_len = TF_doc_len_term.log1p()

CPU times: user 34.2 s, sys: 9.88 s, total: 44.1 s
Wall time: 44.2 s


In [20]:
# Cache the scRNA-seq-style features (save_npz appends the ".npz" suffix).
sparse.save_npz("variables/log_1_TF_doc_len", log_1_TF_doc_len)

In [2]:
# Restore the cached scRNA-seq-style features.
log_1_TF_doc_len = sparse.load_npz("variables/log_1_TF_doc_len.npz")

### Schmidt (2018) (modified) 
$\max\left(0, \log(\mathrm{TF} \cdot 100)\right)$

In [25]:
# Modified Schmidt (2018) feature: max(0, log(TF * 100)).
TF_100 = term_frequencies.multiply(100)  # scalar multiplication keeps the CSR format

# Take the log of the stored values only and reuse the sparsity structure,
# so log(0) is never evaluated for the implicit zeros.
log_term = csr_matrix(
    (np.log(TF_100.data), TF_100.indices, TF_100.indptr),
    shape=TF_100.shape,
)

# Clip the negative logs (entries with TF * 100 < 1) to zero.
schmidts_feature = log_term.maximum(0)

In [26]:
# Cache the Schmidt feature matrix (save_npz appends the ".npz" suffix).
sparse.save_npz("variables/schmidts_feature", schmidts_feature)

In [22]:
# Restore the cached Schmidt feature matrix.
schmidts_feature = sparse.load_npz("variables/schmidts_feature.npz")

### TF-IDF without log-scaling

In [None]:
%%time
%%memit

# TF-IDF on raw term counts (no sublinear/log scaling of the term frequency).
vectorizer = TfidfVectorizer(sublinear_tf=False)
tfidf_sublinear_tf_False = vectorizer.fit_transform(raw_documents=corpus)

In [None]:
# Cache the TF-IDF matrix without log scaling (save_npz appends the ".npz" suffix).
sparse.save_npz("variables/tfidf_sublinear_tf_False", tfidf_sublinear_tf_False)

In [3]:
# Restore the cached TF-IDF matrix without log scaling.
tfidf_sublinear_tf_False = sparse.load_npz("variables/tfidf_sublinear_tf_False.npz")

### TF-IDF with log-scaling

In [None]:
%%time
%%memit

# TF-IDF with sublinear (logarithmic) scaling of the term frequency.
vectorizer = TfidfVectorizer(sublinear_tf=True)
tfidf_features = vectorizer.fit_transform(raw_documents=corpus)

In [None]:
# Cache the log-scaled TF-IDF matrix (save_npz appends the ".npz" suffix).
sparse.save_npz("variables/tfidf_features", tfidf_features)

In [3]:
# Restore the cached log-scaled TF-IDF matrix.
tfidf_features = sparse.load_npz("variables/tfidf_features.npz")

# kNN accuracy

In [21]:
def get_knn_score_split(Zs, colors, k=10, subset_size=500, rs=42):
    """
    Gets the kNN accuracy of the points with a color different than grey.

    The rows whose color is not 'lightgrey' are split once into a random
    held-out test subset and a training set. For every representation in
    `Zs`, a brute-force kNN classifier is fit on the training rows and the
    fraction of correctly predicted test colors is recorded.

    Parameters
    ----------
    Zs : iterable of array-like or sparse matrix
        Representations to score. Each must have one row per entry of
        `colors` and support indexing by an integer array.
    colors : array-like of str
        Class label of every row; rows labelled 'lightgrey' are ignored.
    k : int, default=10
        Number of neighbours used by the classifier.
    subset_size : int, default=500
        Number of non-grey rows held out as the test set.
    rs : int, default=42
        Seed of the random train/test split.

    Returns
    -------
    knn_scores : list of float
        Test accuracy for each representation, in the order of `Zs`.

    Raises
    ------
    ValueError
        If `subset_size` is not strictly between 0 and the number of
        non-grey rows (both the test and the training set must be non-empty).
    """
    colors = np.asarray(colors)

    # Positions of the labelled (non-grey) rows in the full data set.
    labelled_idx = np.flatnonzero(colors != 'lightgrey')
    n = labelled_idx.size
    if not 0 < subset_size < n:
        raise ValueError(
            "subset_size must be between 1 and %d (number of non-grey points "
            "minus one), got %r" % (n - 1, subset_size)
        )

    # The split depends only on `colors`, `subset_size` and `rs`, so it is drawn
    # once and shared by all representations. A private RandomState yields the
    # same draw as np.random.seed(rs) + np.random.choice, without resetting the
    # global NumPy random state as a side effect.
    rng = np.random.RandomState(rs)
    test = rng.choice(n, size=subset_size, replace=False)
    train = np.setdiff1d(np.arange(n), test)

    # Map split positions back to row numbers of the full matrices, so each
    # representation is indexed directly instead of first copying all
    # labelled rows.
    train_idx = labelled_idx[train]
    test_idx = labelled_idx[test]
    y_train = colors[train_idx]
    y_test = colors[test_idx]

    knn_scores = []
    for Xrp in Zs:
        neigh = KNeighborsClassifier(n_neighbors=k, algorithm='brute')
        neigh.fit(Xrp[train_idx], y_train)
        acc = np.mean(neigh.predict(Xrp[test_idx]) == y_test)
        knn_scores.append(acc)

    return knn_scores

### Word Counts (WC)

In [11]:
%%time
%%memit
# kNN accuracy (default k=10) of the raw word counts on 500 held-out non-grey documents.
knn_accuracy_WC = get_knn_score_split(
    Zs=[word_counts], colors=automatic_colors, subset_size=500
)

peak memory: 191280.11 MiB, increment: 21137.88 MiB
CPU times: user 13min 2s, sys: 1min 31s, total: 14min 33s
Wall time: 14min 35s


In [12]:
# Cache the score so the expensive kNN run need not be repeated (np.save appends ".npy").
np.save("variables/knn_accuracy_WC", knn_accuracy_WC)

In [3]:
# Restore the cached word-count score.
knn_accuracy_WC = np.load("variables/knn_accuracy_WC.npy")

### log(1+WC)

In [6]:
%%time
%%memit
# kNN accuracy (default k=10) of the log-transformed counts on 500 held-out non-grey documents.
knn_accuracy_log_1_WC = get_knn_score_split(
    Zs=[log_1_WC], colors=automatic_colors, subset_size=500
)

peak memory: 63677.96 MiB, increment: 16930.64 MiB
CPU times: user 12min 6s, sys: 55.6 s, total: 13min 2s
Wall time: 13min 3s


In [7]:
# Cache the score so the expensive kNN run need not be repeated (np.save appends ".npy").
np.save("variables/knn_accuracy_log_1_WC", knn_accuracy_log_1_WC)

In [4]:
# Restore the cached log(1 + WC) score.
knn_accuracy_log_1_WC = np.load("variables/knn_accuracy_log_1_WC.npy")

### TF

In [None]:
%%time
%%memit
# kNN accuracy (default k=10) of the term frequencies on 500 held-out non-grey documents.
knn_accuracy_TF = get_knn_score_split(
    Zs=[term_frequencies], colors=automatic_colors, subset_size=500
)

In [None]:
# Cache the score so the expensive kNN run need not be repeated (np.save appends ".npy").
np.save("variables/knn_accuracy_TF", knn_accuracy_TF)

In [5]:
# Restore the cached term-frequency score.
knn_accuracy_TF = np.load("variables/knn_accuracy_TF.npy")

### log(1+ TF*(average doc_len=100))

In [22]:
%%time
%%memit
# kNN accuracy (default k=10) of log(1 + TF*100) on 500 held-out non-grey documents.
knn_accuracy_log_1_TF_doc_len = get_knn_score_split(
    Zs=[log_1_TF_doc_len], colors=automatic_colors, subset_size=500
)

peak memory: 111066.86 MiB, increment: 17393.99 MiB
CPU times: user 12min 26s, sys: 1min 9s, total: 13min 36s
Wall time: 13min 38s


In [23]:
# Cache the score so the expensive kNN run need not be repeated (np.save appends ".npy").
np.save("variables/knn_accuracy_log_1_TF_doc_len", knn_accuracy_log_1_TF_doc_len)

In [7]:
# Restore the cached log(1 + TF*100) score.
knn_accuracy_log_1_TF_doc_len = np.load("variables/knn_accuracy_log_1_TF_doc_len.npy")

### Schmidt: max([0, log(TF*100)])

In [None]:
%%time
%%memit
# kNN accuracy (default k=10) of max(0, log(TF*100)) on 500 held-out non-grey documents.
knn_accuracy_schmidts_feature = get_knn_score_split(
    Zs=[schmidts_feature], colors=automatic_colors, subset_size=500
)

In [None]:
# Cache the score so the expensive kNN run need not be repeated (np.save appends ".npy").
np.save("variables/knn_accuracy_schmidts_feature", knn_accuracy_schmidts_feature)

In [8]:
# Restore the cached Schmidt-feature score.
knn_accuracy_schmidts_feature = np.load("variables/knn_accuracy_schmidts_feature.npy")

### TF-IDF without sublinear scaling

In [5]:
%%time
%%memit
# kNN accuracy (default k=10) of TF-IDF without log scaling on 500 held-out non-grey documents.
knn_accuracy_tfidf_sublinear_tf_False = get_knn_score_split(
    Zs=[tfidf_sublinear_tf_False], colors=automatic_colors, subset_size=500
)

peak memory: 38419.64 MiB, increment: 14842.84 MiB
CPU times: user 13min 49s, sys: 57 s, total: 14min 46s
Wall time: 14min 48s


In [6]:
# Cache the score so the expensive kNN run need not be repeated (np.save appends ".npy").
np.save(
    "variables/knn_accuracy_tfidf_sublinear_tf_False",
    knn_accuracy_tfidf_sublinear_tf_False,
)

In [9]:
# Restore the cached score of TF-IDF without log scaling.
knn_accuracy_tfidf_sublinear_tf_False = np.load(
    "variables/knn_accuracy_tfidf_sublinear_tf_False.npy"
)

### TF-IDF with sublinear scaling

In [5]:
%%time
%%memit
# kNN accuracy (default k=10) of log-scaled TF-IDF on 500 held-out non-grey documents.
knn_accuracy_tfidf = get_knn_score_split(
    Zs=[tfidf_features], colors=automatic_colors, subset_size=500
)

peak memory: 38419.64 MiB, increment: 14842.84 MiB
CPU times: user 13min 49s, sys: 57 s, total: 14min 46s
Wall time: 14min 48s


In [6]:
# Cache the score so the expensive kNN run need not be repeated (np.save appends ".npy").
np.save("variables/knn_accuracy_tfidf", knn_accuracy_tfidf)

In [9]:
# Restore the cached score of log-scaled TF-IDF.
knn_accuracy_tfidf = np.load("variables/knn_accuracy_tfidf.npy")

## Print results

In [11]:
# Report the kNN accuracy of every representation, in the order they were introduced.
for _label, _score in (
    ('Word counts: ', knn_accuracy_WC),
    ('log(1+ WC): ', knn_accuracy_log_1_WC),
    ('TF: ', knn_accuracy_TF),
    ('log(1 + TF*100): ', knn_accuracy_log_1_TF_doc_len),
    ('max([0, log(TF*100)]): ', knn_accuracy_schmidts_feature),
    ('tf-idf (without log): ', knn_accuracy_tfidf_sublinear_tf_False),
    ('tf-idf (with log): ', knn_accuracy_tfidf),
):
    print(_label, _score)

Word counts:  [0.494]
log(1+ WC):  [0.538]
TF:  [0.54]
log(1 + TF*100):  [0.616]
max([0, log(TF*100)]):  [0.428]
tf-idf (without log):  [0.628]
tf-idf (with log):  [0.712]


# Table

In [16]:
# Row labels, listed in the same order as the scores stacked below.
preprocessings = [
    'Word counts',
    'Log-transformed counts',
    'Term frequencies (TF)',
    'scRNA-seq approach',
    'Schmidt (2018)',
    'TF-IDF without log scaling',
    'TF-IDF with log scaling',
]

# Each score is a one-element sequence; hstack joins them into one 1-D array.
knn_accuracies = np.hstack((
    knn_accuracy_WC,
    knn_accuracy_log_1_WC,
    knn_accuracy_TF,
    knn_accuracy_log_1_TF_doc_len,
    knn_accuracy_schmidts_feature,
    knn_accuracy_tfidf_sublinear_tf_False,
    knn_accuracy_tfidf,
))
knn_accuracies

array([0.494, 0.538, 0.54 , 0.616, 0.428, 0.628, 0.712])

In [18]:
# One-column summary table: one row per representation.
pd.DataFrame(data=knn_accuracies, index=preprocessings, columns=['kNN accuracy'])

Unnamed: 0,kNN accuracy
Word counts,0.494
Log-transformed counts,0.538
Term frequencies (TF),0.54
scRNA-seq approach,0.616
Schmidt (2018),0.428
TF-IDF without log scaling,0.628
TF-IDF with log scaling,0.712
