In [1]:
import bs4
import glob
import json
import itertools
import pandas as pd
import re
import readability
import string
import sys
import typing

from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


# Maps a step id to the first step object of each JSON export found in ./data.
topics = {}

# glob.glob returns paths in arbitrary order; sorting keeps the insertion order
# of `topics` (and of everything derived from it) the same on every run.
for file in sorted(glob.glob('./data/*.json')):
    # JSON is UTF-8 by specification, so do not rely on the platform default.
    with open(file, 'r', encoding='utf-8') as rfile:
        content = json.load(rfile)
    # Only the first entry of "steps" is used from each file.
    step = content['steps'][0]
    topics[step['id']] = step

In [2]:
def make_soup() -> "typing.Dict[int, bs4.BeautifulSoup]":
    """Parse the HTML text of every entry in ``topics``.

    A stray closing ``</p>`` and then a stray closing ``</ul>`` at the very
    start of the text are dropped before the text is handed to the ``lxml``
    parser.

    :return: A dict with the same keys as ``topics`` whose values are the
        parsed ``BeautifulSoup`` documents.
    """
    soups = {}
    for _id, block in topics.items():
        text = block['block']['text']
        # Checked in this order so "</p></ul>..." loses both prefixes.
        for stray_close in ('</p>', '</ul>'):
            if text.startswith(stray_close):
                text = text[len(stray_close):]
        soups[_id] = bs4.BeautifulSoup(text, features='lxml')
    return soups


soups = make_soup()

In [3]:
# Patterns for markup that is not prose: <code> elements, and "math-tex" spans
# together with any <em>/<strong> tags directly wrapped around them. Each
# pattern also consumes the whitespace on both sides of the element.
SPECIAL_SECTIONS = [
    re.compile(pattern, flags=re.MULTILINE)
    for pattern in (
        r'\s*<code[^>]*>[\w\W]*?</code>\s*',
        r'\s*(<em>|<strong>)*<span class="math-tex">[\w\W]*?</span>(</em>|</strong>)*\s*',
    )
]

def get_words(soup: bs4.BeautifulSoup) -> typing.List[str]:
    """Return the lowercase-initial words found in the body of ``soup``.

    The body markup is reduced to plain text by ``remove_special_sections``.
    Every punctuation character except the apostrophe and the hyphen is then
    turned into a space and the text is split on whitespace. A token is kept
    when it starts with a lowercase letter, is longer than two characters and
    contains no zero-width space; a kept token is cut at its first apostrophe.

    :param soup: Parsed document; it must contain a ``<body>`` element.
    :return: The kept words, in document order.
    """
    punct = string.punctuation.replace("'", "").replace("-", "")
    # A translation table is used instead of interpolating `punct` into a regex
    # character class: unescaped, the backslash in string.punctuation escaped
    # the following "]" and so backslashes themselves were never replaced.
    to_space = str.maketrans(punct, ' ' * len(punct))
    text = remove_special_sections(soup.find('body').decode_contents())

    words: typing.List[str] = []
    for word in text.translate(to_space).split():
        if word[:1].islower() and len(word) > 2 and '\u200b' not in word:
            # "don't" -> "don"; words without an apostrophe are unchanged.
            words.append(word.split("'", 1)[0])
    return words


def get_sentences(soup: bs4.BeautifulSoup) -> typing.List[str]:
    """Split the plain text of the body of ``soup`` into sentence-like pieces.

    The text is cut at every ``!``, ``?``, ``.`` and ``:``. The pieces are
    returned as-is, so they may be empty or consist of whitespace only.

    :param soup: Parsed document; it must contain a ``<body>`` element.
    :return: The pieces between the delimiters, in document order.
    """
    body_markup = soup.find('body').decode_contents()
    plain_text = remove_special_sections(body_markup)
    pieces = re.split('[!?.:]', plain_text)
    return pieces


def remove_special_sections(content: str) -> str:
    """Strip the ``SPECIAL_SECTIONS`` markup from an HTML fragment and return its text.

    Every match of each pattern in ``SPECIAL_SECTIONS`` is replaced by a single
    space. The result is wrapped in a ``<div>``, parsed with the ``lxml``
    parser, and the text of the first ``<div>`` found is returned.

    :param content: HTML markup as a string.
    :return: The text content of the cleaned fragment, without tags.
    """
    # NOTE(review): as rendered here both arguments look like two plain spaces,
    # which would make this call a no-op -- presumably one of them originally
    # contained non-breaking spaces; confirm against the raw notebook.
    content = content.replace('  ', '  ')
    for special in SPECIAL_SECTIONS:
        content = special.sub(' ', content)

    # Re-parse so that the remaining tags are dropped and only text is kept.
    soup = bs4.BeautifulSoup(f'<div>{content}</div>', features='lxml')
    return soup.find('div').text


def shorten(text: str) -> str:
    """Truncate ``text`` to its first 80 characters followed by ``...``.

    Strings of 80 characters or fewer are returned unchanged.
    """
    limit = 80
    return text if len(text) <= limit else text[:limit] + '...'

In [4]:
def length_of_raw_text(_id: int) -> int:
    """Return the number of characters in the full text of the parsed step ``_id``."""
    raw_text = soups[_id].text
    return len(raw_text)

def number_of_sections(_id: int) -> int:
    """Return the number of entries in the table of contents of step ``_id``."""
    table_of_contents = topics[_id]['block']['table_of_contents']
    return len(table_of_contents)

def average_like(_id: int) -> float:
    """Return the count-weighted mean like value of step ``_id``.

    Only ``likes_statistics`` entries with a falsy ``subject`` take part. Each
    entry contributes its ``value`` weighted by its ``total_count``. When the
    counted total is zero, the sentinel ``-10`` is returned instead.
    """
    entries = [
        entry
        for entry in topics[_id]['likes_statistics']
        if not entry['subject']
    ]
    total = sum(entry['total_count'] for entry in entries)
    if total == 0:
        return -10
    weighted = sum(entry['total_count'] * entry['value'] for entry in entries)
    return weighted / total

def likes_count(_id: int) -> int:
    """Return the total number of likes recorded for step ``_id``.

    Sums ``total_count`` over the ``likes_statistics`` entries whose
    ``subject`` is falsy; the result is ``0`` when there are none.
    """
    return sum(
        likes['total_count']
        for likes in topics[_id]['likes_statistics']
        if not likes['subject']
    )

def number_of_alerts(_id: int) -> int:
    """Return how many ``<div>`` elements with class ``alert`` step ``_id`` contains."""
    alerts = soups[_id].find_all('div', attrs={'class': 'alert'})
    return len(alerts)

def number_of_bold_elements(_id: int) -> int:
    """Return the combined number of ``<strong>`` and ``<b>`` elements in step ``_id``."""
    document = soups[_id]
    return sum(len(document.find_all(tag)) for tag in ('strong', 'b'))

def number_of_italics(_id: int) -> int:
    """Return the combined number of ``<em>`` and ``<i>`` elements in step ``_id``."""
    document = soups[_id]
    return sum(len(document.find_all(tag)) for tag in ('em', 'i'))

def number_of_images(_id: int) -> int:
    """Return the number of ``<img>`` elements in step ``_id``."""
    images = soups[_id].find_all('img')
    return len(images)

def number_of_snippets(_id: int) -> int:
    """Return the number of ``<code>`` elements nested inside a ``<pre>`` in step ``_id``."""
    snippets = soups[_id].select('pre code')
    return len(snippets)

def number_of_code_elements(_id: int) -> int:
    """Return the number of ``<code>`` elements in step ``_id``, inline or in a ``<pre>``."""
    code_elements = soups[_id].select('code')
    return len(code_elements)

def number_of_external_links(_id: int) -> int:
    """Return the number of ``<a>`` elements in step ``_id``.

    NOTE(review): despite the name, every anchor is counted; nothing here
    checks whether a link points outside the site -- confirm that is intended.
    """
    anchors = soups[_id].select('a')
    return len(anchors)

def average_paragraph_length(_id: int) -> float:
    """Return the mean text length, in characters, of the ``<p>`` elements of step ``_id``.

    :return: The mean length, or ``0.0`` when the step has no ``<p>`` element
        (previously this case raised ``ZeroDivisionError``).
    """
    lengths = [len(p.text) for p in soups[_id].find_all('p')]
    if not lengths:
        return 0.0
    return sum(lengths) / len(lengths)

def max_paragraph_length(_id: int) -> int:
    """Return the text length, in characters, of the longest ``<p>`` element of step ``_id``.

    :return: The largest length, or ``0`` when the step has no ``<p>`` element
        (previously this case raised ``ValueError`` from ``max()``).
    """
    return max((len(p.text) for p in soups[_id].find_all('p')), default=0)

def number_of_tables(_id: int) -> int:
    """Return the number of ``<table>`` elements in step ``_id``."""
    tables = soups[_id].select('table')
    return len(tables)

def length_of_text(_id: int) -> int:
    """Return the character count of step ``_id`` after ``remove_special_sections``.

    The body markup is passed through ``remove_special_sections`` and the
    length of the text it returns is reported.
    """
    body_markup = soups[_id].find('body').decode_contents()
    prose = remove_special_sections(body_markup)
    return len(prose)

def average_word_length(_id: int) -> float:
    """Return the mean length of the words ``get_words`` extracts from step ``_id``.

    :return: The mean word length, or ``0.0`` when no word was extracted
        (previously this case raised ``ZeroDivisionError``).
    """
    words = get_words(soups[_id])
    if not words:
        return 0.0
    return sum(map(len, words)) / len(words)

def number_of_long_words(_id: int) -> int:
    """Return how many words of step ``_id`` are longer than 12 characters.

    The words are the ones produced by ``get_words``.
    """
    return sum(1 for word in get_words(soups[_id]) if len(word) > 12)

def dale_chall_score(_id: int) -> float:
    """Return the Dale-Chall readability score of the plain text of step ``_id``.

    The body markup is cleaned with ``remove_special_sections`` first. The
    return annotation is ``float`` because the scores are fractional.

    NOTE(review): the readability package presumably rejects very short texts
    with an exception, which would propagate from here -- confirm.
    """
    body_markup = soups[_id].find('body').decode_contents()
    text = remove_special_sections(body_markup)
    return readability.Readability(text).dale_chall().score
    
def flesch_score(_id: int) -> float:
    """Return the Flesch readability score of the plain text of step ``_id``.

    The body markup is cleaned with ``remove_special_sections`` first. The
    return annotation is ``float`` because the scores are fractional.

    NOTE(review): the readability package presumably rejects very short texts
    with an exception, which would propagate from here -- confirm.
    """
    body_markup = soups[_id].find('body').decode_contents()
    text = remove_special_sections(body_markup)
    return readability.Readability(text).flesch().score

def average_sentence_length(_id: int) -> float:
    """Return the mean length, in characters, of the pieces from ``get_sentences``.

    Empty and whitespace-only pieces are included in the mean, exactly as
    ``get_sentences`` returns them. ``re.split`` always yields at least one
    piece, so the division cannot be by zero.
    """
    sentences = get_sentences(soups[_id])
    return sum(len(sentence) for sentence in sentences) / len(sentences)

def number_of_long_sentences(_id: int) -> int:
    """Return how many pieces from ``get_sentences`` are longer than 12 characters.

    NOTE(review): 12 is the same threshold ``number_of_long_words`` uses; for
    sentences it looks very low and may be a copy-paste leftover -- confirm.
    """
    return sum(1 for sentence in get_sentences(soups[_id]) if len(sentence) > 12)

def average_section_length(_id: int) -> float:
    """Return the mean length, in characters of markup, of the ``<h2`` sections of step ``_id``.

    A section runs from one ``<h2`` occurrence to the next; the last one runs
    to the end of the document markup. The last section is left out of the
    mean when ``Conclusion`` or ``Summary`` occurs anywhere after its opening
    ``<h2``. When no section is counted, the fallback value ``1000`` is
    returned.
    """
    text = soups[_id].decode_contents()
    # Offset 0 is a legitimate heading position; the earlier `while end > 0`
    # scan treated it like "not found" and fell through to the fallback.
    starts = [match.start() for match in re.finditer('<h2', text)]
    if not starts:
        return 1000

    bounds = starts + [len(text)]
    lengths = [end - start for start, end in zip(bounds, bounds[1:])]

    # Drop a closing "Conclusion"/"Summary" section from the mean.
    tail_from = starts[-1] + 1
    if text.find('Conclusion', tail_from) != -1 or text.find('Summary', tail_from) != -1:
        lengths.pop()

    if not lengths:
        return 1000
    return sum(lengths) / len(lengths)

def max_section_length(_id: int) -> int:
    """Return the length, in characters of markup, of the longest ``<h2`` section of step ``_id``.

    A section runs from one ``<h2`` occurrence to the next; the last one runs
    to the end of the document markup. When the markup contains no ``<h2``,
    the fallback value ``1000`` is returned.

    NOTE(review): unlike ``average_section_length``, a trailing
    "Conclusion"/"Summary" section is not excluded here -- confirm intended.
    """
    text = soups[_id].decode_contents()
    # Offset 0 is a legitimate heading position; the earlier `while end > 0`
    # scan treated it like "not found" and fell through to the fallback.
    starts = [match.start() for match in re.finditer('<h2', text)]
    if not starts:
        return 1000

    bounds = starts + [len(text)]
    return max(end - start for start, end in zip(bounds, bounds[1:]))

# Created on first use and shared by every call: the analyzer does not depend
# on the step being scored, so rebuilding it per call is wasted work.
_SENTIMENT_ANALYZER = None


def sentiment_score(_id: int) -> float:
    """Return the VADER ``compound`` sentiment score of the plain text of step ``_id``.

    The body markup is cleaned with ``remove_special_sections`` before it is
    scored with ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()

    text = remove_special_sections(soups[_id].find('body').decode_contents())
    return _SENTIMENT_ANALYZER.polarity_scores(text)['compound']

In [None]:
# One feature record per step, keyed by step id.
prepared = {}

# Wrapping the iterable lets tqdm advance after each step is processed.
for _id in tqdm(soups, total=len(soups), file=sys.stdout):
    prepared[_id] = {
        'id': _id,
        'length_of_raw_text': length_of_raw_text(_id),
        'number_of_sections': number_of_sections(_id),
        'average_like': average_like(_id),
        'number_of_alerts': number_of_alerts(_id),
        'number_of_bold_elements': number_of_bold_elements(_id),
        'number_of_italics': number_of_italics(_id),
        'number_of_images': number_of_images(_id),
        'number_of_snippets': number_of_snippets(_id),
        'number_of_code_elements': number_of_code_elements(_id),
        'number_of_links': number_of_external_links(_id),
        'average_paragraph_length': average_paragraph_length(_id),
        'max_paragraph_length': max_paragraph_length(_id),
        'number_of_tables': number_of_tables(_id),
        'length_of_text': length_of_text(_id),
        'average_word_length': average_word_length(_id),
        'number_of_long_words': number_of_long_words(_id),
        'dale_chall_score': dale_chall_score(_id),
        'flesch_score': flesch_score(_id),
        'average_sentence_length': average_sentence_length(_id),
        'number_of_long_sentences': number_of_long_sentences(_id),
        'average_section_length': average_section_length(_id),
        'max_section_length': max_section_length(_id),
        'sentiment_score': sentiment_score(_id),
        # The dataset cell filters on `df.likes_count`; without this column a
        # fresh run fails there. Kept last to match the displayed column order.
        'likes_count': likes_count(_id),
    }

 91%|█████████▏| 971/1064 [01:32<00:10,  8.82it/s]

In [9]:
# Steps whose average like is at least MARGIN form the positive class.
MARGIN = 1.2

# Use the records extracted in this session when there are any; otherwise fall
# back to the export written by the final `df.to_json('prepared.json')` cell.
# An explicit check replaces the former bare `except:`, which also hid genuine
# errors raised while building the frame.
if 'prepared' in globals() and prepared:
    df = pd.DataFrame(list(prepared.values()))
else:
    with open('prepared.json') as rfile:
        df = pd.DataFrame.from_dict(json.load(rfile))

# -10 is the sentinel average_like() returns when nothing was counted; steps
# with 10 likes or fewer are dropped as well.
df = df[(df.average_like != -10) & (df.likes_count > 10)]

# Feature matrix: everything except the identifier, the target source and
# three excluded length features.
# NOTE(review): `likes_count` stays in X as a feature -- confirm intended.
X = df.drop(
    columns=[
        'id',
        'average_like',
        'length_of_raw_text',
        'number_of_long_sentences',
        'max_section_length',
    ],
)

# Binary target: True when the step's average like reaches MARGIN.
y = df.average_like >= MARGIN

In [10]:
# Summary statistics of columns 1-12 for the steps rated below MARGIN.
df.loc[df['average_like'].lt(MARGIN)].describe().iloc[:, 1:13]

Unnamed: 0,length_of_raw_text,number_of_sections,average_like,number_of_alerts,number_of_bold_elements,number_of_italics,number_of_images,number_of_snippets,number_of_code_elements,number_of_links,average_paragraph_length,max_paragraph_length
count,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0
mean,6744.666667,5.027778,0.857124,0.722222,6.861111,2.75,2.916667,4.472222,19.833333,0.277778,177.330206,562.861111
std,1965.652665,1.963759,0.324745,1.185896,7.691007,5.688962,4.016928,5.944919,23.805762,0.566246,53.951245,223.070924
min,3693.0,0.0,-0.096774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.181818,271.0
25%,5482.75,4.0,0.788194,0.0,1.0,0.0,0.0,0.0,0.0,0.0,144.684444,423.0
50%,6397.5,5.0,0.93394,0.0,4.0,0.0,1.5,1.0,8.5,0.0,178.520588,494.5
75%,7964.25,5.0,1.077279,1.0,8.25,3.0,3.25,8.0,35.25,0.0,206.917293,651.25
max,11664.0,12.0,1.181818,4.0,34.0,30.0,15.0,21.0,74.0,2.0,297.444444,1508.0


In [11]:
# Summary statistics of columns 1-12 for the steps rated at MARGIN or above.
df.loc[df['average_like'].ge(MARGIN)].describe().iloc[:, 1:13]

Unnamed: 0,length_of_raw_text,number_of_sections,average_like,number_of_alerts,number_of_bold_elements,number_of_italics,number_of_images,number_of_snippets,number_of_code_elements,number_of_links,average_paragraph_length,max_paragraph_length
count,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0
mean,5058.473743,4.365363,1.742635,0.841341,7.251397,2.807821,1.36648,6.022346,23.788827,0.730726,172.709955,475.153073
std,2101.70448,1.639667,0.147162,1.168887,7.115729,5.925615,2.499171,4.811381,20.417923,1.506179,54.572664,168.020799
min,1034.0,0.0,1.209459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52.115385,190.0
25%,3573.5,3.0,1.696529,0.0,2.0,0.0,0.0,2.0,7.0,0.0,135.086081,368.0
50%,4733.0,4.0,1.780488,0.0,5.0,0.0,0.0,6.0,20.0,0.0,165.642857,447.0
75%,6265.5,5.0,1.837838,1.0,10.0,3.0,2.0,9.0,36.0,1.0,201.786199,555.5
max,15352.0,18.0,2.0,9.0,52.0,50.0,16.0,30.0,120.0,16.0,514.866667,2877.0


In [12]:
# Summary statistics of the remaining columns (13 onward) for the steps rated below MARGIN.
df.loc[df['average_like'].lt(MARGIN)].describe().iloc[:, 13:]

Unnamed: 0,number_of_tables,length_of_text,average_word_length,number_of_long_words,dale_chall_score,flesch_score,average_sentence_length,number_of_long_sentences,average_section_length,max_section_length,sentiment_score,likes_count
count,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0
mean,0.305556,4697.805556,5.412105,5.388889,9.088161,56.788203,80.149293,55.055556,2050.42044,3301.638889,0.836503,80.416667
std,0.950772,1755.718474,0.307296,5.066855,0.761368,7.921152,10.73302,19.530115,635.692379,1098.146078,0.394893,72.549638
min,0.0,2393.0,4.904882,0.0,7.795456,44.234117,59.140625,28.0,1000.0,1000.0,-0.9889,11.0
25%,0.0,3675.5,5.203039,2.0,8.546546,50.314458,73.542057,44.0,1546.375,2687.25,0.91685,31.75
50%,0.0,4128.0,5.363191,3.0,8.933634,57.050337,77.617788,50.5,2064.791667,3303.0,0.985,52.0
75%,0.0,4986.75,5.631784,8.0,9.684564,62.916469,84.518431,58.5,2473.3125,4034.0,0.992725,99.0
max,5.0,10534.0,6.131765,25.0,10.7472,73.022377,109.043478,116.0,3750.0,6368.0,0.999,308.0


In [13]:
# Summary statistics of the remaining columns (13 onward) for the steps rated at MARGIN or above.
df.loc[df['average_like'].ge(MARGIN)].describe().iloc[:, 13:]

Unnamed: 0,number_of_tables,length_of_text,average_word_length,number_of_long_words,dale_chall_score,flesch_score,average_sentence_length,number_of_long_sentences,average_section_length,max_section_length,sentiment_score,likes_count
count,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0
mean,0.175419,3906.910615,5.400512,4.249162,8.837966,57.844118,78.137718,46.53743,1743.404011,2551.953073,0.798569,411.959777
std,0.938524,1567.667853,0.305238,5.052901,0.749137,8.314127,12.046821,18.402062,703.5061,1122.184937,0.484833,1107.112165
min,0.0,850.0,4.548951,0.0,6.58173,14.837535,29.837209,7.0,506.5,748.0,-0.9977,11.0
25%,0.0,2744.5,5.190424,1.0,8.334032,52.529273,70.665306,33.0,1265.041667,1782.0,0.9217,59.0
50%,0.0,3730.0,5.385343,3.0,8.809471,58.434094,77.578947,45.0,1597.75,2358.0,0.9805,142.0
75%,0.0,4862.5,5.591288,6.0,9.296189,63.795097,84.871711,57.0,2088.0,3132.5,0.9936,391.0
max,15.0,13339.0,6.37561,39.0,11.70185,82.155818,128.764706,142.0,6047.0,10286.0,0.9995,21231.0


In [14]:
# Pairwise correlations between the features and average_like; the identifier
# and three length features are left out.
excluded_columns = [
    'id',
    'length_of_raw_text',
    'number_of_long_sentences',
    'max_section_length',
]
df.drop(columns=excluded_columns).corr()

Unnamed: 0,number_of_sections,average_like,number_of_alerts,number_of_bold_elements,number_of_italics,number_of_images,number_of_snippets,number_of_code_elements,number_of_links,average_paragraph_length,...,number_of_tables,length_of_text,average_word_length,number_of_long_words,dale_chall_score,flesch_score,average_sentence_length,average_section_length,sentiment_score,likes_count
number_of_sections,1.0,-0.029367,0.198382,0.11241,0.183878,0.074647,0.347914,0.325652,0.113905,0.015561,...,0.077631,0.499703,-0.046607,0.227606,-0.039332,0.006956,-0.036228,-0.175286,0.086611,-0.0554
average_like,-0.029367,1.0,0.016021,-0.029252,-0.05363,-0.13909,0.099526,0.108714,0.037864,-0.027726,...,-0.040976,-0.139048,-0.01782,-0.052755,-0.12076,0.041029,-0.051539,-0.14335,-0.026911,-0.005638
number_of_alerts,0.198382,0.016021,1.0,0.088548,0.049993,-0.080718,0.171882,0.347896,0.042414,-0.171434,...,-0.014275,0.300296,0.062007,0.143824,-0.020401,-0.0012,0.014988,0.181037,0.021541,0.00111
number_of_bold_elements,0.11241,-0.029252,0.088548,1.0,0.128207,0.063529,-0.139286,-0.132052,0.051824,0.070365,...,0.164912,0.351362,0.104187,0.168117,0.166479,-0.029721,0.015619,0.127522,0.079841,0.092506
number_of_italics,0.183878,-0.05363,0.049993,0.128207,1.0,0.109801,-0.016436,-0.035748,0.111064,0.102513,...,0.245322,0.371028,-0.094155,0.202061,0.083442,0.012943,0.054428,0.144375,0.080212,-0.073368
number_of_images,0.074647,-0.13909,-0.080718,0.063529,0.109801,1.0,-0.210199,-0.134675,0.067726,-0.20927,...,-0.032582,0.167626,-0.191255,0.038748,-0.046833,0.11114,-0.011666,0.002539,0.068268,-0.07314
number_of_snippets,0.347914,0.099526,0.171882,-0.139286,-0.016436,-0.210199,1.0,0.701644,0.035601,-0.291074,...,-0.012402,0.120491,-0.114915,-0.024687,-0.188983,0.040584,-0.229884,0.1637,-0.009608,-0.037993
number_of_code_elements,0.325652,0.108714,0.347896,-0.132052,-0.035748,-0.134675,0.701644,1.0,0.012156,-0.132434,...,-0.068347,0.254416,-0.073155,0.02922,-0.222618,0.085576,-0.148889,0.294912,0.026147,-0.084771
number_of_links,0.113905,0.037864,0.042414,0.051824,0.111064,0.067726,0.035601,0.012156,1.0,0.060159,...,0.014809,0.243541,-0.004701,0.246947,0.064215,-0.002227,0.008034,0.058815,0.072151,0.074453
average_paragraph_length,0.015561,-0.027726,-0.171434,0.070365,0.102513,-0.20927,-0.291074,-0.132434,0.060159,1.0,...,-0.038086,0.336836,0.01718,0.201532,-0.002741,0.040734,0.293562,0.11243,0.091713,-0.069415


In [15]:
# Rows of the steps whose average like is 1 or lower.
df.loc[df['average_like'].le(1)]

Unnamed: 0,id,length_of_raw_text,number_of_sections,average_like,number_of_alerts,number_of_bold_elements,number_of_italics,number_of_images,number_of_snippets,number_of_code_elements,...,average_word_length,number_of_long_words,dale_chall_score,flesch_score,average_sentence_length,number_of_long_sentences,average_section_length,max_section_length,sentiment_score,likes_count
5,6012,6253,4,0.866667,0,4,3,1,2,20,...,5.289528,2,9.681925,52.064007,97.307692,38,2173.666667,4785,0.9658,15
8,6413,6360,5,0.25,0,3,1,7,0,0,...,5.383821,1,8.388631,68.868918,67.243243,69,2151.4,3386,0.9357,96
61,9875,6875,5,0.777778,3,1,0,2,0,0,...,5.379775,5,9.812419,46.672419,78.933333,44,2480.25,3718,0.9889,36
67,10251,5740,4,0.076923,0,1,0,4,0,0,...,5.346565,1,8.730937,52.696794,109.043478,44,2289.333333,2468,0.9872,13
210,9536,8333,12,0.931373,1,2,0,0,19,64,...,5.141049,3,8.933992,57.620961,73.677966,57,1004.272727,2912,0.9934,102
242,11670,8553,6,0.596639,0,4,0,15,21,62,...,5.037109,4,8.694335,56.732222,82.204082,44,2299.2,3050,0.9981,238
446,5745,3693,5,0.995283,1,15,2,3,0,14,...,5.879397,3,10.103042,50.04857,83.372093,41,1092.2,1587,-0.9889,212
479,4983,4637,5,0.936508,1,34,0,3,0,0,...,5.444867,5,8.532187,59.148341,87.957447,46,1424.2,4178,0.9181,63
534,10087,5071,4,0.791667,0,0,3,0,7,33,...,6.131765,16,10.7472,44.234117,77.645833,46,1978.666667,4017,0.7298,48
540,6472,6400,4,0.729282,2,2,12,0,4,29,...,5.047697,3,8.551332,69.61683,76.31746,56,2187.0,2305,0.3029,181


In [None]:
# Persist the frame so the dataset cell can reload it without re-running the
# extraction. At this point `df` is the frame left after the likes filter.
df.to_json(path_or_buf='prepared.json')