In [1]:
import wikipediaapi
from collections import defaultdict
import json
import spacy

# Load the "en_core_web_sm" spaCy pipeline once; later cells call `nlp(...)`
# to tokenize text and read each token's lemma.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Client for the 'simple' language edition. The library logs a warning that
# this language code is longer than 5 characters (see the cell output), but
# the page lookups in the following cells still return text.
wiki_simple = wikipediaapi.Wikipedia(user_agent='Summary Comparison (taylor.ayla@protonmail.com)', language='simple')



Used language 'simple' is longer than 5. It is suspicious


In [3]:
# Look up the article in the 'simple' edition and show its summary text.
simple_page = wiki_simple.page('Python_(programming_language)')
print(simple_page.summary)

Python is an open-source programming language. It was made as a language that is both easy to work on and understand. It was made by a Dutch programmer named Guido van Rossum in 1991, who named it after the television program Monty Python's Flying Circus.
Python is an interpreted language. This means it does not need to be compiled before running. A program called an interpreter runs Python code on almost any computer. So, a programmer can change the code and quickly see what happens. But this also makes Python slower than compiled languages like C, because it is not changed into machine code before running. Instead, this happens while the program is running.


In [4]:
# Client for the 'en' language edition, used for comparison against the
# 'simple' edition client.
wiki_regular = wikipediaapi.Wikipedia(user_agent='Summary Comparison (taylor.ayla@protonmail.com)', language='en')


In [5]:
# Look up the same article title in the 'en' edition and show its summary text.
regular_page = wiki_regular.page('Python_(programming_language)')
print(regular_page.summary)

Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.
Python is dynamically type-checked and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a "batteries included" language due to its comprehensive standard library.
Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language, and he first released it in 1991 as Python 0.9.0. Python 2.0 was released in 2000. Python 3.0, released in 2008, was a major revision not completely backward-compatible with earlier versions. Python 2.7.18, released in 2020, was the last release of Python 2.
Python consistently ranks as one of the most popular programming languages, and it has gained widespread use in the machine learning community.


In [6]:
# Page whose text holds the word lists; the following cells parse
# `simple_wordlist.text` line by line into named sections.
simple_wordlist = wiki_simple.page('Wikipedia:Basic_English_combined_wordlist')
print(simple_wordlist.summary)

This is the maximum Basic English combined wordlist. It is what the advanced student will know when moving from Basic English to the standard English language. So any student who knows all of these words has gone far beyond Basic English.
It actually contains well over 2,600 words and combines five separate word lists:

850 Basic English words.
179 international words:
50 international nouns.
12 names of sciences.
12 title and organizational names.
50 general utility.
5 onomatopoeic (sounds like) words.
50 words about time and numbers.
1293 words used as an addendum, more words for a better understanding.
215 compound words (made up of Basic English words).
91 new words made from adding the allowed endings: -er, -ed, -ing, -ly, -s, and the prefix un-.
Total: 2626 words.

Basic: a • able • about • account • acid • across • act • addition • adjustment • advertisement • agreement • after • again • against • air • all • almost • among • amount • amusement • angle • animal • answer • and • 

In [7]:
# Header names (lower-cased) that introduce a word list on the page.
SECTION_NAMES = {"basic", "international", "addendum", "compound", "endings"}

# Collect the raw text that follows each "<Section>: ..." line, keyed by the
# header exactly as it is written on the page. A section may span several
# lines, so each key maps to a list of raw chunks.
word_sections = defaultdict(list)
for line in simple_wordlist.text.split("\n"):
    # partition() cuts at the first colon only: a colon inside the word list
    # is kept as part of the chunk, and a line with no colon at all is
    # skipped instead of raising IndexError on a missing second field.
    header, colon, body = line.partition(":")
    if colon and header.lower() in SECTION_NAMES:
        word_sections[header].append(body)

In [8]:
# Character the wordlist page places between entries.
BULLET = "•"

# Turn each section's raw chunks into one flat list of cleaned entries.
word_lists = {}
for section, lists in word_sections.items():
    section_list = []
    for words in lists:
        # Split on the bullet itself. Encoding to ASCII with "?" replacement
        # and splitting on "?" would also break any entry containing a
        # non-ASCII character or a literal question mark.
        entries = [word.strip() for word in words.split(BULLET)]
        # Drop blanks produced by leading, trailing or doubled separators.
        entries = [word for word in entries if word]
        # Progress line: section, first letter of this chunk, entry count.
        print(section, entries[0][0] if entries else "", len(entries))
        section_list += entries
    print(section, len(section_list))
    # NOTE(review): some entries are not single words (the "Endings" output
    # shows e.g. 'dancing (to)' and 'fisher/fisherman'); they are kept as-is.
    word_lists[section] = section_list

Basic a 44
Basic b 66
Basic c 68
Basic d 43
Basic e 26
Basic f 49
Basic g 22
Basic h 32
Basic I 18
Basic j 6
Basic k 10
Basic l 37
Basic m 43
Basic n 25
Basic o 23
Basic p 59
Basic q 5
Basic r 39
Basic s 122
Basic t 53
Basic u 5
Basic v 7
Basic w 48
Basic y 6
Basic 856
International a 11
International b 8
International c 20
International d 6
International e 7
International F 10
International g 6
International h 7
International I 4
International J 4
International k 2
International l 5
International m 17
International n 5
International O 9
International p 24
International q 4
International r 7
International s 12
International t 25
International u 1
International v 6
International W 2
International z 3
International 205
Addendum a 38
Addendum b 30
Addendum c 67
Addendum d 45
Addendum e 29
Addendum f 40
Addendum g 22
Addendum h 20
Addendum i 24
Addendum j 9
Addendum k 4
Addendum l 33
Addendum m 34
Addendum n 14
Addendum o 12
Addendum p 47
Addendum q 2
Addendum r 35
Addendum s 89
Addendum t

In [9]:
# Report how many entries each parsed section contains.
for section_name in word_lists:
    print(f"{section_name}: {len(word_lists[section_name])}")

Basic: 856
International: 205
Addendum: 660
Compound: 215
Endings: 89


In [10]:
# Inspect the parsed "Endings" entries; the list is shown as the cell output.
word_lists["Endings"]

['actor',
 'acting',
 'basing',
 'based',
 'builder',
 'burner',
 'burned',
 'burning',
 'carter',
 'clothier',
 'clothing',
 'cooker',
 'cooked',
 'cooking',
 'crying',
 'dancer',
 'dancing (to)',
 'designer',
 'dressing (up)',
 'driver',
 'dropped',
 'dropper',
 'duster',
 'farmer',
 'fisher/fisherman',
 'folder',
 'fired',
 'firing',
 'gardener',
 'hanger',
 'heater',
 'heated',
 'heating',
 'inner',
 'jeweler',
 'joiner',
 'keeper',
 'laughing (at)',
 'learner',
 'locker',
 'locking (up)',
 'marked',
 'miner',
 'nearer',
 'noted',
 'outer',
 'painter',
 'painting',
 'parting',
 'playing',
 'played',
 'pleased (with)',
 'pointer',
 'pointing (at)',
 'potter',
 'printer',
 'prisoner',
 'producer',
 'raining',
 'reader',
 'reading',
 'roller',
 'ruler',
 'rubber',
 'sailor',
 'shocking',
 'shocked',
 'snowing',
 'steamer',
 'stopper',
 'stopping',
 'stopping up',
 'stretcher',
 'talking (of)',
 'teacher',
 'touching (up)',
 'trader',
 'trainer',
 'training',
 'troubling',
 'troubled',

In [11]:
# Save the parsed lists as JSON in the current working directory.
with open("word_lists.json", "w") as json_file:
    json_file.write(json.dumps(word_lists))

In [12]:
# Segment the summary with spaCy instead of splitting on ".": a plain split
# cuts version numbers such as "0.9.0" and "2.7.18" into separate fragments
# and can leave an empty string after the final period.
summary_sents = [sent.text for sent in nlp(regular_page.summary).sents]


In [13]:
# Count summary tokens whose lemma appears in the "Basic" word list, printing
# each matching token as it is found.
# A set is built once so each membership test is a hash lookup rather than a
# scan of the whole list for every token.
basic_words = set(word_lists["Basic"])

basic_word_count = 0
for sent in summary_sents:
    # Each sentence string is parsed on its own; surrounding whitespace is
    # removed first so it does not become tokens.
    for token in nlp(sent.strip()):
        if token.lemma_ in basic_words:
            print(token.text)
            basic_word_count += 1

print(basic_word_count)

is
a
high
level
general
purpose
language
design
with
the
use
of
is
and
supports
and
is
as
a
language
to
library
working
on
in
the
late
as
a
to
the
language
and
he
first
in
as
was
in
in
was
a
not
with
earlier
in
was
the
last
of
as
of
the
most
languages
and
has
use
in
the
machine
61


In [14]:
# Parse the whole summary in one call; the next cell iterates `doc.sents`,
# so sentence boundaries come from the pipeline rather than a string split.
doc = nlp(regular_page.summary)


In [15]:
# Same count as the per-sentence loop, but over the sentences spaCy found in
# the single parsed `doc`. The set makes each lemma lookup a hash lookup
# instead of a scan of the list.
basic_words = set(word_lists["Basic"])
basic_word_count = sum(
    token.lemma_ in basic_words
    for sent in doc.sents
    for token in sent
)
print(basic_word_count)

61


In [16]:
from analysis import Analysis

In [17]:
# Run the imported Analysis class over the full text of the 'en' article.
# NOTE(review): Analysis comes from the local `analysis` module, which is not
# shown here; get_basic_rate() presumably returns the share of tokens found in
# the Basic word list — confirm against that module.
regular_results = Analysis(regular_page.text)
regular_results.get_basic_rate()

0.3599630807096708

In [19]:
# Same analysis over the full text of the 'simple' edition article, for
# comparison with the 'en' result.
# NOTE(review): the meaning of get_basic_rate() is defined in the local
# `analysis` module, which is not shown here — confirm there.
simple_results = Analysis(simple_page.text)
simple_results.get_basic_rate()

0.4836716681376876