Install packages

In [None]:
%%cmd
pip install nltk matplotlib seaborn requests

In [None]:
import nltk

# Fetch the NLTK data the notebook relies on:
#   - 'stopwords' backs stopwords.words('english')
#   - 'punkt' is the tokenizer model read by older NLTK releases
#   - 'punkt_tab' is the tokenizer data newer NLTK releases look up instead;
#     without it word_tokenize raises LookupError there
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
import requests as _r
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import seaborn
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def get_content(name="Ozone_layer", timeout=10):
    """Fetch the plain-text intro extract of a Wikipedia article as JSON.

    Sends an ``action=query`` / ``prop=extracts`` request to the English
    Wikipedia API, asking for the intro section only (``exintro``) rendered
    as plain text (``explaintext``).

    Args:
        name: Article title, sent as the ``titles`` parameter.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` may block forever on a stalled connection.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.exceptions.HTTPError: The server replied with a 4xx/5xx
            status.
        requests.exceptions.Timeout: No response arrived within ``timeout``.
    """
    params = {
        'action': 'query',
        'format': 'json',
        'titles': name,
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    }
    response = _r.get(
        'https://en.wikipedia.org/w/api.php',
        params=params,
        timeout=timeout,
    )
    # Surface HTTP failures here instead of as a confusing JSON decode error.
    response.raise_for_status()
    return response.json()


In [None]:
def merge_contents(data):
    """Return the text extract of the first page in a query response.

    Args:
        data: Decoded JSON in which ``data['query']['pages']`` maps page ids
            to page dictionaries.

    Returns:
        The ``extract`` string of the first page listed.

    Raises:
        KeyError: If ``query``/``pages`` is absent, the response lists no
            pages, or the first page carries no ``extract``.
    """
    pages = data['query']['pages']
    # A bare next() on an empty mapping would leak StopIteration to the caller.
    page = next(iter(pages.values()), None)
    if page is None:
        raise KeyError("response contains no pages")
    if 'extract' not in page:
        # NOTE(review): presumably this happens when the article does not
        # exist -- confirm against the API's response for unknown titles.
        title = page.get('title', '<unknown>')
        raise KeyError(f"page {title!r} has no 'extract' in the response")
    return page['extract']


In [None]:
def tokenize(content):
    """Split *content* into tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(content)

In [None]:
def lower_collection(collection):
    """Return a list with every item of *collection* stringified and lower-cased."""
    lowered = []
    for item in collection:
        lowered.append(str(item).lower())
    return lowered

In [None]:
# English stop words from the NLTK stopwords corpus, stored as a set so the
# membership tests in remove_stop_words are constant-time.
stop_words = set(stopwords.words('english'))

def remove_stop_words(words, stop_words, min_length=4):
    """Drop stop words and short tokens from *words*.

    Args:
        words: Iterable of string tokens.
        stop_words: Collection of tokens to exclude. Anything that is not
            already a set is converted once so each lookup is O(1).
        min_length: Minimum token length to keep. The default of 4 matches
            the previously hard-coded ``len(w) > 3`` rule.

    Returns:
        A list of the kept tokens, in their original order.
    """
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)
    return [
        word
        for word in words
        if word not in stop_words and len(word) >= min_length
    ]


In [None]:
def count_frequency(collection):
    """Tally how many times each item occurs in *collection*."""
    tally = Counter()
    tally.update(collection)
    return tally

def print_most_frequent(frequencies, n=20):
    """Return the *n* highest-count ``(item, count)`` pairs, largest first.

    Despite its name this function prints nothing; it only returns the list.
    Pairs with equal counts keep the order they have in *frequencies*.
    """
    ranked = list(frequencies.items())
    # list.sort is stable, also with reverse=True, so ties keep their order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]


In [None]:
# Run the whole pipeline for an article chosen by the user: fetch the intro
# text, tokenize, lower-case, drop stop words and short tokens, count, and
# keep the most frequent entries.
name = input("Article name: ")
data = get_content(name)
text = merge_contents(data)
tokens = tokenize(text)
lowers = lower_collection(tokens)
no_stopwords = remove_stop_words(lowers, stop_words)
counts = count_frequency(no_stopwords)
results = print_most_frequent(counts)

# Split the (word, count) pairs into parallel arrays for plotting.
x = np.array([word for word, _ in results])
y = np.array([count for _, count in results])

if results:
    fig, ax = plt.subplots(figsize=(14, 7))
    # Horizontal bars: counts on the x axis, words on the y axis.
    # hue mirrors the word axis so the palette is applied per bar; seaborn
    # deprecates passing `palette` without `hue`. Drawing on the axes made
    # above keeps the requested figure size in effect.
    seaborn.barplot(x=y, y=x, hue=x, palette="hsv", legend=False, ax=ax)
    # Article titles use underscores where the readable name has spaces.
    display_name = name.replace('_', ' ')
    ax.set_title(f"The {len(results)} most frequent words in {display_name}")
    ax.set_xlabel("Count")
    ax.set_ylabel("Words")
    plt.show()
else:
    # Nothing survived filtering, so there is nothing to plot.
    print(f"No words to plot for {name!r}.")