In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from collections import defaultdict
import random
import gzip
import urllib
import scipy.optimize
import string
import ast
from tqdm.notebook import tqdm
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rubyren/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
cs_df = pd.read_csv('cs/cs.AI.csv')
cs_df.head()

Unnamed: 0.1,Unnamed: 0,title,abstract
0,0,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is...
1,1,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emi...
2,2,The World as Evolving Information,This paper discusses the benefits of describ...
3,3,Architecture for Pseudo Acausal Evolvable Embe...,Advances in semiconductor technology are con...
4,4,A neural network approach to ordinal regression,Ordinal regression is an important type of l...


# TF

In [5]:
punct = string.punctuation
stop_en = stopwords.words('english')

def find_comm(data):
    all_words = defaultdict(int)
    tokenized = []
    stemmer = nltk.stem.porter.PorterStemmer()
    for review in tqdm(data['abstract']):
        # remove capitalization
        review = review.lower()
        # remove punctuation
        review = [c for c in review if not (c in punct)] 
        review = ''.join(review)
        # tokenize the text
        words = review.strip().split()
        words = [stemmer.stem(w) for w in words]
        words = [w for w in words if not (w in stop_en)]
        tokenized.append(words)
        # count frequency
        for w in words:
            all_words[w] += 1
    return all_words, tokenized

In [None]:
all_words_train, tokenized_train = find_comm(cs_df)
words = all_words_train.keys()
freq = all_words_train.values()
comm = pd.DataFrame(list(zip(words, freq)), columns=['word', 'frequency'])
comm = comm.sort_values(by='frequency', ascending=False).reset_index(drop=True)


In [8]:
cs_df['tokenized'] = tokenized_train

def feature(data, dict_size):
    lst = []
    word_dict = comm['word'].to_list()[:dict_size]
    stemmer = nltk.stem.porter.PorterStemmer()
    for review in tqdm(data['abstract']):
        feat = [0] * dict_size
        # count the instances of the words in each review
        for word in review:
            word = stemmer.stem(word)
            if word in word_dict:
                feat[word_dict.index(word)] += 1
        lst.append(feat)
    return lst

In [84]:
%%time
feature_len = 10000
# build bag-of-words feature vectors
#X_tr = feature(cs_df, feature_len)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs


In [12]:
cs_df['bow_feature'] = X_tr
#valid['predictions'] = logmodel.predict(X_v)
cs_df.head()

Unnamed: 0.1,Unnamed: 0,title,abstract,tokenized,bow_feature
0,0,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is...,"[intellig, acoust, emiss, locat, describ, part...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emi...,"[part, describ, intellig, acoust, emiss, locat...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,The World as Evolving Information,This paper discusses the benefits of describ...,"[thi, paper, discuss, benefit, describ, world,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,Architecture for Pseudo Acausal Evolvable Embe...,Advances in semiconductor technology are con...,"[advanc, semiconductor, technolog, contribut, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,A neural network approach to ordinal regression,Ordinal regression is an important type of l...,"[ordin, regress, import, type, learn, ha, prop...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [117]:
def cal_tf(df):
    df['sum_words'] = df['bow_feature'].apply(lambda x: sum(x))
    tf_lst = []
    for i in tqdm(range(df.shape[0])):
        bow = df.loc[i, 'bow_feature']
        sum_words = df.loc[i, 'sum_words']
        if sum_words != 0:
            tf = [f/sum_words for f in bow]
            tf_lst.append(tf)
        else:
            tf_lst.append([0]*feature_len)

    df['tf'] = tf_lst
    
    return df

In [15]:
cs_df = cal_df(cs_df)
cs_df.head()

HBox(children=(FloatProgress(value=0.0, max=28061.0), HTML(value='')))




Unnamed: 0.1,Unnamed: 0,title,abstract,tokenized,bow_feature,sum_words,tf
0,0,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is...,"[intellig, acoust, emiss, locat, describ, part...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",87,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emi...,"[part, describ, intellig, acoust, emiss, locat...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",76,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,The World as Evolving Information,This paper discusses the benefits of describ...,"[thi, paper, discuss, benefit, describ, world,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",63,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,Architecture for Pseudo Acausal Evolvable Embe...,Advances in semiconductor technology are con...,"[advanc, semiconductor, technolog, contribut, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",55,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,A neural network approach to ordinal regression,Ordinal regression is an important type of l...,"[ordin, regress, import, type, learn, ha, prop...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",75,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# IDF

In [16]:
def cal_idf(df):
    count_words = [0]*feature_len
    for i in tqdm(range(df.shape[0])):
        bow = df.loc[i, 'bow_feature']
        have_words = [1 if x > 0 else 0 for x in bow]
        count_words = [i+j for i,j in zip(count_words, have_words)]
    N = df.shape[0]
    idf = [np.log(N/(x+1)) for x in count_words]
    
    return idf

In [17]:
def cal_tfidf(df, idf):
    tf_idf = []
    for i in tqdm(range(df.shape[0])):
        tf = df.loc[i, 'tf']
        tfidf = [i*j for i,j in zip(tf, idf)]
        tf_idf.append(tfidf)
    df['tf_idf'] = tf_idf
    
    return df

In [18]:
idf_tr = cal_idf(cs_df)
cs_df = cal_tfidf(cs_df, idf_tr)
cs_df.head()

HBox(children=(FloatProgress(value=0.0, max=28061.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=28061.0), HTML(value='')))




Unnamed: 0.1,Unnamed: 0,title,abstract,tokenized,bow_feature,sum_words,tf,tf_idf
0,0,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is...,"[intellig, acoust, emiss, locat, describ, part...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",87,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emi...,"[part, describ, intellig, acoust, emiss, locat...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",76,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,The World as Evolving Information,This paper discusses the benefits of describ...,"[thi, paper, discuss, benefit, describ, world,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",63,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,Architecture for Pseudo Acausal Evolvable Embe...,Advances in semiconductor technology are con...,"[advanc, semiconductor, technolog, contribut, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",55,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,A neural network approach to ordinal regression,Ordinal regression is an important type of l...,"[ordin, regress, import, type, learn, ha, prop...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",75,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [20]:
cs_df['mean'] = cs_df['tf_idf'].apply(np.sum)
cs_df.head()

Unnamed: 0.1,Unnamed: 0,title,abstract,tokenized,bow_feature,sum_words,tf,tf_idf,mean
0,0,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is...,"[intellig, acoust, emiss, locat, describ, part...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",87,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000692
1,1,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emi...,"[part, describ, intellig, acoust, emiss, locat...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",76,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000663
2,2,The World as Evolving Information,This paper discusses the benefits of describ...,"[thi, paper, discuss, benefit, describ, world,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",63,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000786
3,3,Architecture for Pseudo Acausal Evolvable Embe...,Advances in semiconductor technology are con...,"[advanc, semiconductor, technolog, contribut, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",55,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.001135
4,4,A neural network approach to ordinal regression,Ordinal regression is an important type of l...,"[ordin, regress, import, type, learn, ha, prop...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",75,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000551


# Read Txt

In [21]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import pdfbox
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer 

In [112]:
# turn input paper to a 1-d array

autophrase_f = open("autophrase.txt", "r")
construct_f = open("construct.txt", "r")

data = pd.DataFrame()
data['abstract'] = pd.Series([autophrase_f.read(), construct_f.read()])
data

Unnamed: 0,abstract
0,Automated Phrase Mining from Massive Text Corp...
1,"Proceedings of the BioNLP 2019 workshop, pages..."


In [29]:
comm[:5]

Unnamed: 0,word,frequency
0,thi,40828
1,model,33482
2,learn,30998
3,use,28622
4,algorithm,19255


In [113]:
all_words, tokenized = find_comm(data)
data['tokenized'] = tokenized
data

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




Unnamed: 0,abstract,tokenized
0,Automated Phrase Mining from Massive Text Corp...,"[autom, phrase, mine, massiv, text, corpora, j..."
1,"Proceedings of the BioNLP 2019 workshop, pages...","[proceed, bionlp, 2019, workshop, page, 142–15..."


In [114]:
print(feature_len)
X_v = feature(data, feature_len)
data['bow_feature'] = X_v
#valid['predictions'] = logmodel.predict(X_v)
data.head()

10000


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




Unnamed: 0,abstract,tokenized,bow_feature
0,Automated Phrase Mining from Massive Text Corp...,"[autom, phrase, mine, massiv, text, corpora, j...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"Proceedings of the BioNLP 2019 workshop, pages...","[proceed, bionlp, 2019, workshop, page, 142–15...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [118]:
data = cal_tf(data)
data.head()

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




Unnamed: 0,abstract,tokenized,bow_feature,sum_words,tf
0,Automated Phrase Mining from Massive Text Corp...,"[autom, phrase, mine, massiv, text, corpora, j...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",28010,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"Proceedings of the BioNLP 2019 workshop, pages...","[proceed, bionlp, 2019, workshop, page, 142–15...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",17900,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [119]:
data = cal_tf(data)
data['idf'] = cal_idf(data)
data.head()

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




ValueError: Length of values does not match length of index

In [103]:
data['sum'] = data['tf_idf'].apply(np.sum)
data.head()

Unnamed: 0,abstract,tokenized,bow_feature,sum_words,tf,tf_idf,sum
0,Automated Phrase Mining from Massive Text Corp...,"[autom, phrase, mine, massiv, text, corpora, j...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",28010,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-0.405465
1,"Proceedings of the BioNLP 2019 workshop, pages...","[proceed, bionlp, 2019, workshop, page, 142–15...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",17900,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-0.405465


In [58]:
data.loc[0]['bow_feature'][0]

0

In [105]:
comm_words = []
bow_1 = data.loc[0]['tf_idf']
bow_2 = data.loc[1]['tf_idf']
bow_comm = [[i, j] if i*j != 0 else None for i, j in zip(bow_1, bow_2)]


In [106]:
common_lst = []
for i in range(10000):
    if bow_comm[i] != None:
        print(bow_comm[i])
        

[-0.00560210627768153, -0.003964044353012781]
[-0.0032859899871672017, -0.003420403984599599]
[-0.0022582133832514694, -0.0017441795153256235]
[-0.05409579111032525, -0.05520214907595512]
[-0.009800067054238748, -0.01030651531783323]
[-0.003054378358115769, -0.004122606127133293]
[-0.0013172911402300237, -0.001857437925411703]
[-0.022784794007934695, -0.025551097315419526]
[-0.0013028154134143092, -0.0019027412894461347]
[-0.021568832955414674, -0.021904176510647768]
[-0.003691310338007209, -0.005527010412200677]
[-0.0026201065536443327, -0.000611595414464829]
[-0.04581567537173654, -0.04623208299713763]
[-0.0011580581452571637, -0.0011778874648952263]
[-0.0012593882329671655, -0.0017894828793600553]
[-0.01467838699113455, -0.015992087504154418]
[-0.09134183620715879, -0.0933702332749639]
[-0.000897495062574302, -0.0012684941929640898]
[-0.017790668256513178, -0.012888807067795842]
[-0.029472579796794816, -0.025233973767178505]
[-0.0033438928944300597, -0.0054137520021145975]
[-0.00091

In [115]:
np.mean([i >= 0 for i in data.loc[0]['bow_feature']])

1.0

In [81]:
word_dict = comm['word'].to_list()[:10000]

In [83]:
len(word_dict)

10000