In [1]:
# TODO: do some more preprocessing (so searching could get even better)
# from sklearn.feature_extraction.text import TfidfVectorizer

In [142]:
# Import the submodule explicitly: a bare `import importlib` does not guarantee
# that the `importlib.util` attribute has been loaded.
import importlib.util

# Locate numpy without importing it; the result is None when it is not installed.
importlib.util.find_spec('numpy')

ModuleSpec(name='numpy', loader=<_frozen_importlib_external.SourceFileLoader object at 0x000001C21B8B4708>, origin='e:\\Programs\\Anaconda\\envs\\net\\lib\\site-packages\\numpy\\__init__.py', submodule_search_locations=['e:\\Programs\\Anaconda\\envs\\net\\lib\\site-packages\\numpy'])

In [2]:
import os
import fitz
from bs4 import BeautifulSoup

def extract_text_from_files(filenames):
  """Extract plain text from every file in `filenames`.

  Dispatch is by file-name suffix:
    * '.html' - text of all <p> elements, joined with single spaces;
    * '.pdf'  - text of every page (read through `fitz`), joined with spaces;
    * anything else - the whole file decoded as UTF-8.

  Returns a list holding exactly one string per input path, in input order.
  A file that cannot be opened, decoded or parsed contributes an empty string
  and a one-line notice is printed, so the result always stays aligned with
  `filenames` (callers pair the i-th document with the i-th file name).
  """
  documents = []
  for f in filenames:
    try:
      if f.endswith('.html'):
        with open(f, 'r', encoding='utf8') as html:
          soup = BeautifulSoup(html.read(), 'html.parser')
        text = ' '.join(p.text for p in soup.find_all('p'))
      elif f.endswith('.pdf'):
        doc = fitz.open(f)
        try:
          text = ' '.join(page.get_text() for page in doc)
        finally:
          doc.close()  # release the file handle even if a page fails
      else:
        with open(f, 'r', encoding='utf8') as file:
          text = file.read()
    except Exception as exc:
      # Best effort, but never drop the entry: a missing element would shift
      # every later document against its file name. `Exception` (not a bare
      # except) lets KeyboardInterrupt still stop a long indexing run.
      print('skip', f, '-', exc)
      text = ''
    documents.append(text)
  return documents

def walk(base, filter=lambda x: x.endswith('pdf') or x.endswith('html')):
  """Recursively list the files below `base` whose name passes `filter`.

  `filter` is called with the bare file name (not the full path); the default
  keeps names ending in 'pdf' or 'html'. Each returned entry is the directory
  joined with the file name, in the order os.walk yields them.
  """
  return [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(base)
    for name in names
    if filter(name)
  ]


In [None]:
import re
import numpy as np 
from collections import defaultdict
import array
from scipy import sparse as sp

# source: "from nltk.corpus import stopwords; stopwords.words('english')"
# English stop words; vectorize() hands this list to remove_stop_words() so
# these tokens are dropped before counting.
# Note: tokenize() only emits runs of 2+ letters/underscores or runs of digits,
# so the entries containing an apostrophe and the single-letter entries can
# never equal a token; their fragments ('don', 'll', 've', ...) are listed
# separately below.
stop_words = [ 
  'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
 "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it',
 "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is',
 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do',
 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as',
 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't",
 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren',
 "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
 "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
 "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
 "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"
]

# A token is a run of two or more letters/underscores, or a run of digits.
_TOKEN_PATTERN = re.compile(r"(?u)\b[a-zA-Z_][a-zA-Z_]+\b|\d+")

def tokenize(text):
  """Lower-case `text` and return its tokens as a list of strings, in order."""
  lowered = text.lower()
  return _TOKEN_PATTERN.findall(lowered)

def remove_stop_words(tokens, stop_wrods):
  """Return the tokens that are not stop words, keeping order and duplicates.

  tokens: iterable of hashable tokens (strings in this notebook).
  stop_wrods: any iterable of stop words - list, set, generator, ...
    (the misspelled parameter name is kept so keyword callers keep working).
  """
  # Materialise the stop words once: membership becomes O(1) per token instead
  # of a scan over the whole list, and a one-shot iterable is not exhausted by
  # the first lookup.
  blocked = frozenset(stop_wrods)
  return [token for token in tokens if token not in blocked]

# Stop words after passing them through tokenize(), de-duplicated by np.unique.
# NOTE(review): nothing in the visible cells reads `_stopwords`; vectorize()
# passes the raw `stop_words` list instead - confirm whether this is still needed.
_stopwords = list(np.unique(tokenize(' '.join(stop_words))))

def vectorize(documents, vocabulary=None):
  """Build a document-term count matrix.

  documents: iterable of raw text strings; each one is tokenized with
    tokenize() and stripped of `stop_words`.
  vocabulary: optional mapping token -> column index to start from. It is
    copied, never modified. Its indices are assumed to be the contiguous range
    0 .. len(vocabulary)-1, because a token seen for the first time receives
    the index len(vocabulary) at that moment.

  Returns (X, vocabulary):
    X - scipy CSR matrix, dtype float32, shape (n_documents, len(vocabulary)),
        holding the per-document token counts;
    vocabulary - plain dict with the input entries plus every new token.
  """
  # Always work on a growing copy. Previously only an EMPTY vocabulary was
  # turned into a defaultdict; a non-empty one stayed a plain dict, so the
  # first unseen token raised KeyError.
  vocabulary = defaultdict(None, vocabulary or {})
  vocabulary.default_factory = vocabulary.__len__

  stop_set = frozenset(stop_words)  # O(1) membership tests per token

  j_indices = []               # column index of every stored count
  indptr = [0]                 # CSR row pointers
  values = array.array("i")    # the counts themselves

  for doc in documents:
    tokens = remove_stop_words(tokenize(doc), stop_set)

    feature_counter = defaultdict(int)
    for token in tokens:
      feature_counter[vocabulary[token]] += 1

    j_indices.extend(feature_counter.keys())
    values.extend(feature_counter.values())
    indptr.append(len(j_indices))

  vocabulary = dict(vocabulary)  # disable the auto-growing behaviour

  indices_dtype = np.int32
  j_indices = np.asarray(j_indices, dtype=indices_dtype)
  indptr = np.asarray(indptr, dtype=indices_dtype)
  values = np.frombuffer(values, dtype=np.intc)

  X = sp.csr_matrix(
      (values, j_indices, indptr),
      shape=(len(indptr) - 1, len(vocabulary)),
      dtype=np.float32
  )

  X.sort_indices()
  return X, vocabulary

def calc_tf_idf(X):
  """Weight a count matrix by TF-IDF.

  X: array of shape (n_documents, n_terms) that supports ndarray-style
     `sum(..., keepdims=True)`.

  tf  = count / (row total + 0.1)   (the 0.1 keeps the denominator non-zero
                                     for an all-zero row of counts)
  idf = log10(N / (1 + document frequency)) + 1
  Returns tf * idf, with the same shape as X.
  """
  n_docs = X.shape[0]
  row_totals = X.sum(axis=1, keepdims=True)
  term_freq = X / (row_totals + 0.1)
  doc_freq = (X > 0).sum(axis=0, keepdims=True)
  inv_doc_freq = np.log10(n_docs / (1 + doc_freq)) + 1
  return term_freq * inv_doc_freq

In [None]:

# Path of the .npz index cache read by load_cache() and written by save_cache().
cache_filename ='index_cache.npz'

def load_cache(filename=None):
  """Load the persisted search index.

  filename: path of the .npz cache; defaults to the module-level
    `cache_filename`.

  Returns (fts, v, matrix):
    fts    - dict mapping each stored file path to its stored timestamp;
    v      - dict mapping each word to its position in the stored 'vocab' array;
    matrix - the stored 'matrix' array.
  """
  if filename is None:
    filename = cache_filename

  # SECURITY: allow_pickle=True lets numpy unpickle object arrays, which can
  # execute arbitrary code - only load cache files this notebook wrote itself.
  # The context manager closes the archive; it used to be left open.
  with np.load(filename, allow_pickle=True) as cache:
    fts = dict(zip(cache['files'], cache['timestamps']))
    v = {word: i for i, word in enumerate(cache['vocab'])}
    matrix = cache['matrix']

  return fts, v, matrix

def save_cache(files, timestamps, vocabulary, matrix, filename=None):
  """Write the search index to a compressed .npz file.

  files, timestamps: parallel sequences (same length, asserted).
  vocabulary: dict mapping word -> column index.
  matrix: the count matrix to store.
  filename: target path; defaults to the module-level `cache_filename`.
  """
  assert len(files) == len(timestamps)
  if filename is None:
    filename = cache_filename

  # load_cache() rebuilds word -> index from the position in the stored array,
  # so store the words ordered by their index rather than by dict insertion
  # order; the two only coincide by accident of how the dict was built.
  words = sorted(vocabulary, key=vocabulary.get)

  np.savez_compressed(
    filename,
    files=np.asarray(files),
    timestamps=np.asarray(timestamps),
    vocab=np.asarray(words),
    matrix=matrix
  )

# Bootstrap an empty cache on first run. The placeholder matrix must be 2-D
# (0 documents x 0 terms): a 1-D empty array becomes a 1x0 matrix once the
# index-update cell wraps it in sp.csr_matrix, which would put a phantom row
# ahead of the first real document.
if not os.path.exists(cache_filename):
  save_cache([], [], {}, np.empty((0, 0)))

In [None]:
# save_cache([], [], {}, np.array([]))

In [None]:
# Cached index: file path -> timestamp, word -> column, count matrix.
fts, v, m = load_cache()

# Every file that should currently be part of the index.
all_files = [] + \
  walk('C:\\Users\\bkand\\Downloads\\', lambda x: x.endswith('pdf')) + \
  walk('E:\\imp\\', lambda x: True) + \
  walk('E:\\archivebox\\archive\\', lambda x: x.endswith('.html') and x.find('.orig')==-1 and x.find('woff2')==-1)

# Classify on-disk files against the cache. Membership is tested on the dict
# itself rather than on the truthiness of the cached timestamp, so a cached
# timestamp of 0.0 is still compared instead of being silently skipped.
new_files = []
modified_files = []
for f in all_files:
  if f not in fts:
    print('new', f)
    new_files.append(f)
  elif fts[f] != os.path.getmtime(f):
    print('mod', f)
    modified_files.append(f)

# Cached files that no longer exist on disk. A set makes each lookup O(1)
# instead of scanning the whole `all_files` list per cached file.
current_files = set(all_files)
deleted_files = []
for f in fts.keys():
  if f not in current_files:
    print('del', f)
    deleted_files.append(f)

In [None]:
def _pad_columns(matrix, n_cols):
  """Right-pad `matrix` with zero columns until it has `n_cols` columns."""
  missing = n_cols - matrix.shape[1]
  if missing <= 0:
    return matrix
  filler = np.zeros((matrix.shape[0], missing), dtype=matrix.dtype)
  return np.hstack([matrix, filler])

def _read_documents(paths):
  """Return exactly one text per path ('' when nothing could be extracted)."""
  texts = []
  for path in paths:
    # One file per call, so a file the extractor skips cannot shift the rows
    # of the files that follow it.
    extracted = extract_text_from_files([path])
    texts.append(extracted[0] if extracted else '')
  return texts

files = list(fts.keys())

# Work on a copy of the cached counts so the loaded `m` is never modified.
count_matrix = np.array(m)
if count_matrix.ndim != 2:
  # An empty cache may hold a 1-D placeholder; start from an empty 2-D matrix.
  count_matrix = np.zeros((len(files), len(v)), dtype=np.float32)

# The three steps below all operate on the same `count_matrix` / `files` pair,
# so rows and file names stay aligned whatever combination of changes occurred.

if len(deleted_files) > 0:
  # Select the surviving rows in one go; removing entries one index at a time
  # would shift the positions of the remaining ones.
  doomed = set(deleted_files)
  keep = [i for i, name in enumerate(files) if name not in doomed]
  count_matrix = count_matrix[keep]
  files = [files[i] for i in keep]

if len(modified_files) > 0:
  mod_mat, mod_vocabulary = vectorize(_read_documents(modified_files), vocabulary=v)
  v.update(mod_vocabulary)  # existing words keep their index, new ones are added
  mod_mat = mod_mat.toarray()

  # The vocabulary may have grown: bring both matrices to the same width.
  n_cols = max(count_matrix.shape[1], mod_mat.shape[1])
  count_matrix = _pad_columns(count_matrix, n_cols)
  mod_mat = _pad_columns(mod_mat, n_cols)

  row_of = {name: i for i, name in enumerate(files)}
  for row, name in zip(mod_mat, modified_files):
    count_matrix[row_of[name]] = row

if len(new_files) > 0:
  new_mat, new_vocabulary = vectorize(_read_documents(new_files), vocabulary=v)
  v.update(new_vocabulary)
  new_mat = new_mat.toarray()

  n_cols = max(count_matrix.shape[1], new_mat.shape[1])
  count_matrix = np.vstack([
    _pad_columns(count_matrix, n_cols),
    _pad_columns(new_mat, n_cols),
  ])
  files += new_files

# Always (re)compute the weights, so `tf_idf` exists even when nothing changed.
tf_idf = calc_tf_idf(count_matrix)

In [None]:
# Rewrite the cache only when this run added, removed or changed a file.
if new_files or deleted_files or modified_files:
  cache_filename ='index_cache.npz'
  # Record each indexed file's current modification time next to its name.
  timestamps = []
  for f in files:
    timestamps.append(os.path.getmtime(f))
  save_cache(files, timestamps, v, count_matrix)

In [None]:
def get_files_from_query(X, query, vocabulary=None):
  """Print the indexed files ranked by cosine similarity to `query`.

  X: dense TF-IDF matrix, one row per entry of the global `files` list.
  query: free-text search string.
  vocabulary: dict word -> column index that X was built with.

  Prints one line per file with a non-zero similarity, best match first.
  Returns None.
  """
  vocabulary = vocabulary or {}

  # Ignore query words the index has never seen: they cannot match any
  # document, and looking them up in the vocabulary would fail or widen the
  # query vector beyond the columns of X.
  known_tokens = [token for token in tokenize(query) if token in vocabulary]
  Q, _ = vectorize([' '.join(known_tokens)], vocabulary=vocabulary)
  q_vec = calc_tf_idf(Q.toarray()).ravel()
  q_norm = np.linalg.norm(q_vec)

  # Cosine similarity per document; zero-norm pairs score nothing.
  scored = []
  for row_idx in range(X.shape[0]):
    denom = q_norm * np.linalg.norm(X[row_idx])
    if denom > 0:
      score = float(np.dot(q_vec, X[row_idx]) / denom)
      if score != 0.0 and not np.isnan(score):
        scored.append((score, row_idx))

  # Stable sort: equal scores keep their index order.
  scored.sort(key=lambda pair: pair[0], reverse=True)

  for score, row_idx in scored:
    # os.path.basename also understands the backslash-separated paths the
    # indexer stores on Windows; splitting on '/' left the full path in place.
    print(f"Nilai Similaritas: {score:.4f} - {os.path.basename(files[row_idx])}")

# Example search: prints the indexed files ranked by similarity to the query.
query = 'memory'

get_files_from_query(tf_idf, query, vocabulary=v)

In [None]:
import numpy as np

# Scratch: small 2x2 array for trying out column concatenation.
arr = np.asarray([[0, 1], [2, 3]])
arr


In [None]:
# Scratch: np.c_ appends the 1-D array as an extra column of `arr`.
np.c_[arr, np.array([0, 0])]

In [None]:
# Scratch: prototype for merging an old count matrix with a wider new one.
old_mat = np.ones((10, 100))
new_mat = np.arange((10* 102)).reshape((10, 102))

# old_mat.resize((old_mat.shape[0], new_mat.shape[1]))
# count_matrix = np.vstack([old_mat, new_mat]) # merge count matricies
# tf_idf = calc_tf_idf(count_matrix)
# for l in old_mat:
#   print(l)

# old_mat.resize((old_mat.shape[0], new_mat.shape[1]))
# count_matrix = np.vstack([old_mat, new_mat]) # merge count matricies
# for l in count_matrix:
#   print(l)
# Zero-pad old_mat on the right up to new_mat's column count, then stack the
# rows of new_mat underneath it.
c = np.c_[old_mat, np.zeros((old_mat.shape[0], new_mat.shape[1]-old_mat.shape[1]))]
result = np.r_[c, new_mat]
result[-1]

In [None]:
import time
from threading import Thread

def get_files():
  """Return the paths of every file to index, one directory tree after another.

  Order: PDFs under Downloads, then everything under E:\\imp, then the
  archived .html pages (excluding names containing '.orig' or 'woff2').
  """
  downloads_pdfs = walk('C:\\Users\\bkand\\Downloads\\', lambda x: x.endswith('pdf'))
  imp_files = walk('E:\\imp\\', lambda x: True)
  archived_html = walk(
    'E:\\archivebox\\archive\\',
    lambda x: x.endswith('.html') and x.find('.orig')==-1 and x.find('woff2')==-1
  )
  return downloads_pdfs + imp_files + archived_html
  
def load_data():
  """Refresh the module-level `fts`, `v` and `m` from the on-disk cache."""
  global fts, v, m
  cached_index = load_cache()
  fts, v, m = cached_index

# Start of the multi-threaded timing window; it is read back further down,
# after both worker threads have been joined.
start  = time.time()

# Cache file used by LoadIndexThread and by the save_cache() defined in this cell.
CACHE_FILENAME = 'index_cache.npz'

class LoadFileNamesThread(Thread):
  """Thread that walks several directory trees and collects matching files.

  roots_and_filters: iterable of (root_directory, predicate) pairs; the
    predicate receives the bare file name and returns a truthy value to keep it.

  After the thread has run, `all_files` holds the collected paths.
  """
  def __init__(self, roots_and_filters, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.roots_and_filters = roots_and_filters
    # Defined up front so the attribute exists even before run() has executed.
    self.all_files = []

  def run(self):
    found = []
    for root, accept in self.roots_and_filters:
      for folder, _subdirs, names in os.walk(root):
        # os.path.join supplies the separator that plain `p + file` left out
        # (which produced paths such as '...\\subdirname.pdf').
        found.extend(os.path.join(folder, name) for name in names if accept(name))
    self.all_files = found

class LoadIndexThread(Thread):
  """Thread that loads the cached index from an .npz file.

  filename (keyword-only, optional): cache path; defaults to the module-level
    `CACHE_FILENAME`, looked up when the thread runs.

  After the thread has run:
    m   - the stored 'matrix' array;
    fts - dict mapping each stored file path to its stored timestamp;
    v   - dict mapping each word to its position in the stored 'vocab' array.
  All three are None until then.
  """
  def __init__(self, *args, filename=None, **kwargs):
    super().__init__(*args, **kwargs)
    self.filename = filename
    self.m = None
    self.fts = None
    self.v = None

  def run(self):
    filename = CACHE_FILENAME if self.filename is None else self.filename
    # SECURITY: allow_pickle=True can execute code embedded in object arrays -
    # only load cache files this notebook wrote itself.
    # The context manager closes the archive, which used to be left open.
    with np.load(filename, allow_pickle=True) as cache:
      self.m = cache['matrix']
      self.fts = dict(zip(cache['files'], cache['timestamps']))
      self.v = {word: i for i, word in enumerate(cache['vocab'])}

def save_cache(files, timestamps, vocabulary, matrix, filename=None):
  """Write the search index to a compressed .npz file.

  files, timestamps: parallel sequences (same length, asserted).
  vocabulary: dict mapping word -> column index.
  matrix: the count matrix to store.
  filename: target path; defaults to the module-level `CACHE_FILENAME`.
  """
  assert len(files) == len(timestamps)
  if filename is None:
    filename = CACHE_FILENAME

  # The loaders rebuild word -> index from the position in the stored array,
  # so store the words ordered by their index rather than by dict insertion
  # order.
  words = sorted(vocabulary, key=vocabulary.get)

  np.savez_compressed(filename,
    files=np.asarray(files), timestamps=np.asarray(timestamps),
    vocab=np.asarray(words), matrix=matrix
  )

# NOTE(review): save_cache() has four required positional parameters but
# args=() supplies none, so the call made inside this thread raises TypeError -
# confirm what was meant to be saved here.
thread = Thread(target=save_cache, args=(), daemon=True)
thread.start()

# (root directory, file-name predicate) pairs handed to LoadFileNamesThread.
roots_and_filters = [
  ['C:\\Users\\bkand\\Downloads\\', lambda x: x.endswith('pdf')],
  ['E:\\imp\\', lambda x: True],
  ['E:\\archivebox\\archive\\', lambda x: x.endswith('.html') and x.find('.orig')==-1 and x.find('woff2')==-1]
]

# Multi-threaded variant: directory walk and cache load run concurrently.
# `start` was taken earlier in this cell, so the measured span also covers the
# statements between that point and here.
t1 = LoadFileNamesThread(roots_and_filters)
t2 = LoadIndexThread()
t1.start(); t2.start()
t2.join(); t1.join()
multi_threaded_time = time.time() - start

# Single-threaded baseline: the same two steps, one after the other.
start  = time.time()
fts, v, m = load_cache()
get_files()
single_threaded_time = time.time() - start


# Report both timings and their absolute difference, in seconds.
print(f'single - {single_threaded_time:.4f}\nmulti - {multi_threaded_time:.4f}\ndiff - {abs(single_threaded_time - multi_threaded_time):.4f}')


In [2]:
import os
import fitz
from bs4 import BeautifulSoup


def walk(base, filter=lambda x: x.endswith('pdf') or x.endswith('html')):
  """Collect the paths of all files under `base` accepted by `filter`.

  `filter` is given the bare file name; by default names ending in 'pdf' or
  'html' are kept. Paths are the containing directory joined with the name and
  appear in os.walk order.
  """
  matches = []
  for folder, _subdirs, names in os.walk(base):
    matches.extend(os.path.join(folder, name) for name in names if filter(name))
  return matches




In [22]:

# Scratch: list the files under Downloads that pass walk()'s default filter.
# NOTE(review): this rebinds the global `files`, which get_files_from_query()
# indexes into - presumably the index cells must be re-run before searching
# again; confirm.
files = walk('C:\\Users\\bkand\\Downloads')
# files = walk('E:\\archivebox\\archive')
# extract_text_from_files(files[3])

f = 'C:\\Users\\bkand\\Downloads\\Tainter_The_Collapse_of_Complex_Societies.pdf'


# print(f)
# with open(f, 'r', encoding='utf8') as html: 
#   soup = BeautifulSoup(html.read(), 'html.parser')
#   print(soup.find('title').text)
#   print([p.text for p in soup.find_all('h1')])
#   print([p.text for p in soup.find_all('h2')])


# print(f)
# doc = fitz.open(f)
# print(f'author: {doc.metadata.get("author")}\ntitle: {doc.metadata["title"]}')
# doc[5].get_text() 
# documents

In [None]:
n = 2
l = []
# Keep only the multiples of n. Calling l.remove() while iterating over l
# skips the element that follows each removal, so the survivors are rebuilt in
# one pass; the slice assignment still updates the same list object in place.
l[:] = [elem for elem in l if not elem % n]
