In [1]:
pip install wikipedia

Note: you may need to restart the kernel to use updated packages.


In [2]:
# EXTRACTS THE DATA FROM WIKIPEDIA (we can change this to telegram or )

import pandas as pd
import wikipedia


def filter(titles):
    """
    Get the titles which are related to curve wars, given a list of titles
    """
    titles = [title for title in titles if 'defi' in title.lower() or 'decentralize' in title.lower()]

    return titles


def get_wiki_page(title):
    """
    Get the wikipedia page given a title
    """
    try:
        return wikipedia.page(title)
    except wikipedia.exceptions.DisambiguationError as e:
        return wikipedia.page(e.options[0])
    except wikipedia.exceptions.PageError as e:
        return None


def recursively_find_all_pages(titles, titles_so_far=set()):
    """
    Recursively find all the pages that are linked to the Wikipedia titles in the list
    """
    all_pages = []

    titles = list(set(titles) - titles_so_far)
    titles = filter(titles)
    titles_so_far.update(titles)
    for title in titles:
        page = get_wiki_page(title)
        if page is None:
            continue
        all_pages.append(page)

        new_pages = recursively_find_all_pages(page.links, titles_so_far)
        for pg in new_pages:
            if pg.title not in [p.title for p in all_pages]:
                all_pages.append(pg)
        titles_so_far.update(page.links)
    return all_pages

pages = recursively_find_all_pages(["Decentralized Finance"])
print((pages))

[<WikipediaPage 'Decentralized finance'>, <WikipediaPage 'Decentralized computing'>, <WikipediaPage 'Decentralized autonomous organization'>, <WikipediaPage 'Decentralized network 42'>, <WikipediaPage 'Decentralization'>, <WikipediaPage 'Definition of anarchism and libertarianism'>, <WikipediaPage 'Definitions of fascism'>, <WikipediaPage 'Definition of terrorism'>, <WikipediaPage 'Decentralised system'>, <WikipediaPage '0x (decentralized exchange infrastructure)'>, <WikipediaPage 'Decentralized application'>]


In [3]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [4]:
conda install nltk

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [5]:
import nlkt
nltk.download('punkt')

ModuleNotFoundError: No module named 'nlkt'

In [6]:
# SPLITTING THE DATA INTO SECTIONS

import re
from typing import Set
from transformers import GPT2TokenizerFast
import numpy as np
from nltk.tokenize import sent_tokenize


tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def count_tokens(text: str) -> int:
    """count the number of tokens in a string"""
    return len(tokenizer.encode(text))

def reduce_long(
    long_text: str, long_text_tokens: bool = False, max_len: int = 590
) -> str:
    """
    Reduce a long text to a maximum of `max_len` tokens by potentially cutting at a sentence end
    """
    if not long_text_tokens:
        long_text_tokens = count_tokens(long_text)
    if long_text_tokens > max_len:
        sentences = sent_tokenize(long_text.replace("\n", " "))
        ntokens = 0
        for i, sentence in enumerate(sentences):
            ntokens += 1 + count_tokens(sentence)
            if ntokens > max_len:
                return ". ".join(sentences[:i][:-1]) + "."

    return long_text

discard_categories = ['See also', 'References', 'External links', 'Further reading', "Footnotes",
    "Bibliography", "Sources", "Citations", "Literature", "Footnotes", "Notes and references",
    "Photo gallery", "Works cited", "Photos", "Gallery", "Notes", "References and sources",
    "References and notes",]


def extract_sections(
    wiki_text: str,
    title: str,
    max_len: int = 1000,
    discard_categories: Set[str] = discard_categories,
) -> str:
    """
        Extract the sections of a Wikipedia page, discarding the the references and other low information sections
        """
    if len(wiki_text) == 0:
        return []

    # find all headings and the coresponding contents
    headings = re.findall("==+ .* ==+", wiki_text)
    for heading in headings:
        wiki_text = wiki_text.replace(heading, "==+ !! ==+")
    contents = wiki_text.split("==+ !! ==+")
    contents = [c.strip() for c in contents]
    assert len(headings) == len(contents) - 1

    cont = contents.pop(0).strip()
    outputs = [(title, "Summary", cont, count_tokens(cont) + 4)]

    # discard the discard categories, accounting for a tree structure
    max_level = 100
    keep_group_level = max_level
    remove_group_level = max_level
    nheadings, ncontents = [], []
    for heading, content in zip(headings, contents):
        plain_heading = " ".join(heading.split(" ")[1:-1])
        num_equals = len(heading.split(" ")[0])
        if num_equals <= keep_group_level:
            keep_group_level = max_level

        if num_equals > remove_group_level:
            if (
                    num_equals <= keep_group_level
            ):
                continue
        keep_group_level = max_level
        if plain_heading in discard_categories:
            remove_group_level = num_equals
            keep_group_level = max_level
            continue
        nheadings.append(heading.replace("=", "").strip())
        ncontents.append(content)
        remove_group_level = max_level

    # count the tokens of each section
    ncontent_ntokens = [
        count_tokens(c)
        + 3
        + count_tokens(" ".join(h.split(" ")[1:-1]))
        - (1 if len(c) == 0 else 0)
        for h, c in zip(nheadings, ncontents)
    ]

    # Create a tuple of (title, section_name, content, number of tokens)
    outputs += [(title, h, c, t) if t < max_len
                else (title, h, reduce_long(c, max_len), count_tokens(reduce_long(c, max_len)))
                for h, c, t in zip(nheadings, ncontents, ncontent_ntokens)]

    return outputs

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [7]:
res = []
for page in pages:
    #if not(extract_sections(page.content, page.title)):
    res += extract_sections(page.content, page.title)
df = pd.DataFrame(res, columns=["title", "heading", "extract_sections(page.content, page.title)content", "tokens"])
df = df[df.tokens>40]
df = df.drop_duplicates(['title','heading'])
df = df.reset_index().drop('index',axis=1) # reset index
df.head()
df.to_csv('ad_hoc_data.csv', index=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (1195 > 1024). Running this sequence through the model will result in indexing errors


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/eshanaagarwal/nltk_data'
    - '/Users/eshanaagarwal/opt/anaconda3/nltk_data'
    - '/Users/eshanaagarwal/opt/anaconda3/share/nltk_data'
    - '/Users/eshanaagarwal/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [5]:
extract_sections(pages[0].content, pages[0].title)

[('Decentralized finance',
  'Summary',
  'Decentralized finance (DeFi) offers financial instruments without relying on intermediaries such as brokerages, exchanges, or banks by using smart contracts on a blockchain. DeFi platforms allow people to lend or borrow funds from others, speculate on price movements on assets using derivatives, trade cryptocurrencies, insure against risks, and earn interest in savings-like accounts. DeFi uses a layered architecture and highly composable building blocks. Some applications promote high interest rates but are subject to high risk. As of February 2022, the value of assets used in decentralized finance amounted to $200 billion.',
  121),
 ('Decentralized finance',
  'History',
  'Decentralized exchanges (abbreviated DEXs) as alternative payment ecosystems with new protocols for financial transactions emerged within the framework of decentralized finance, which is part of blockchain technology and FinTech. Unlike centralized cryptocurrency exchange