In [1]:
import os
import numpy as np
import pandas as pd
import string
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer 
from sklearn.pipeline import Pipeline
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/jfan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
files = os.listdir()
text_lst = []
for file in files:
    if file[:2] == 'cs':
        csv = pd.read_csv(file)
        text_lst += csv['title'].to_list()
        text_lst += csv['abstract'].to_list()

In [2]:
text_lst = []
csv = pd.read_csv('cs/cs.AI.csv', index_col='Unnamed: 0')
csv.head()

Unnamed: 0,title,abstract
0,Intelligent location of simultaneously active ...,The intelligent acoustic emission locator is...
1,Intelligent location of simultaneously active ...,Part I describes an intelligent acoustic emi...
2,The World as Evolving Information,This paper discusses the benefits of describ...
3,Architecture for Pseudo Acausal Evolvable Embe...,Advances in semiconductor technology are con...
4,A neural network approach to ordinal regression,Ordinal regression is an important type of l...


In [3]:
text_lst += csv['title'].to_list()
text_lst += csv['abstract'].to_list()

In [4]:
punct = string.punctuation
stop_words = set(stopwords.words('english')) 

def find_comm(data):
    all_words = defaultdict(int)
    tokenized = []
    plain_text = []
    stemmer = nltk.stem.porter.PorterStemmer()
    for review in tqdm(data):
        # remove capitalization
        review = review.lower()
        # remove punctuation
        review = [c for c in review if not (c in punct)] 
        review = ''.join(review)
        plain_text.append(review)
        # tokenize the text
        words = review.strip().split()
        words = [stemmer.stem(w) for w in words]
        tokenized.append(words)
        # count frequency
        for w in words:
            if w not in stop_words: 
                all_words[w] += 1
    return all_words, tokenized, plain_text

def tf_idf(data, dict_size):
    lst = []
    word_dict = comm['word'].to_list()[:dict_size]
    idf = comm['idf'].to_list()[:dict_size]
    stemmer = nltk.stem.porter.PorterStemmer()
    for review in tqdm(data):
        feat = [0] * dict_size
        n_terms = 0
        # count the instances of the words in each review
        for word in review:
            n_terms += 1
            word = stemmer.stem(word)
            if word in word_dict:
                feat[word_dict.index(word)] += 1
        feat = [feat[i] / n_terms if n_terms != 0 else 0 for i in range(dict_size)]
        feat = [feat[i] * idf[i] for i in range(dict_size)]
        lst.append(feat)
    return lst

In [5]:
all_words_domain, tokenized_domain, plain_text_domain = find_comm(text_lst)
words = all_words_domain.keys()
freq = all_words_domain.values()
comm = pd.DataFrame(list(zip(words, freq)), columns=['word', 'frequency'])
comm = comm.sort_values(by='frequency', ascending=False).reset_index(drop=True)
pipe = Pipeline([('count', CountVectorizer(vocabulary=words)), ('tfid', TfidfTransformer())]).fit(plain_text_domain)
comm['idf'] = pipe['tfid'].idf_
comm

HBox(children=(FloatProgress(value=0.0, max=56122.0), HTML(value='')))




Unnamed: 0,word,frequency,idf
0,thi,40838,11.935301
1,learn,37091,11.935301
2,model,36130,11.935301
3,use,30835,11.935301
4,algorithm,20443,11.935301
...,...,...,...
61018,kp2,1,11.935301
61019,locallyconsist,1,11.935301
61020,centerperipheri,1,11.242154
61021,3277,1,11.935301


In [6]:
f = open('autophrase.txt', 'r')
input_paper = [f.readline()]
all_words_input, tokenized_input, plain_text_input = find_comm(input_paper)
tfidf_input = tf_idf(tokenized_input, 1000)[0]

f = open('construct.txt', 'r')
recommended_paper = [f.readline()]
all_words_recommended, tokenized_recommended, plain_text_recommended = find_comm(recommended_paper)
tfidf_recommended = tf_idf(tokenized_recommended, 1000)[0]

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [7]:
tfidf = pd.DataFrame()
tfidf['words'] = comm['word'].iloc[:1000]
tfidf['input'] = tfidf_input
tfidf['recommended'] = tfidf_recommended
tfidf

Unnamed: 0,words,input,recommended
0,thi,0.045887,0.074392
1,learn,0.011472,0.019959
2,model,0.041299,0.021773
3,use,0.029827,0.058062
4,algorithm,0.011472,0.001814
...,...,...,...
995,categor,0.000000,0.000000
996,technic,0.002294,0.000000
997,prototyp,0.000000,0.001173
998,qlearn,0.000000,0.000000
