# Text Feature Extraction Tool

This is a toy project for extracting linguistic features from the random text. This is a simplistic model, but it gives a lot of insights into the sample text.

In [1]:
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from text_classifier import TextClassifier
from urllib.request import Request, urlopen

import pandas as pd

In [2]:
def extract_features_from_text(sentences):
    """
    Extracts features from text from a given list

    :param
        sentences: a list of text to analyze
    :return:
        A data frame of features
    """
    features = {
        "adj_and_adv_frequency": [],
        "has_subordinate_words": [],
        "modal_frequency": [],
        "peculiar_words": [],
        "plural_usage": [],
        "text_reading_ease": [],
        'article_density': [],
        'preposition_density': [],
        'type_token_ratio': [],
    }

    for sentence in sentences:
        cls = TextClassifier(sentence)
        features["adj_and_adv_frequency"].append(cls.calculate_lexical_density_by_tags({"JJ", "RB"}))
        features["has_subordinate_words"].append(cls.has_peculiar_expression("\b(But|So|Because)"))
        features["modal_frequency"].append(cls.calculate_lexical_density_by_tags({'MD'}))
        features["peculiar_words"].append(cls.calculate_words_frequency({"good"}))
        features["plural_usage"].append(cls.calculate_lexical_density_by_tags({"NNS"}))
        features["text_reading_ease"].append(cls.calculate_sentence_reading_ease())
        features['article_density'].append(cls.calculate_words_frequency({"a", "an", "the"}))
        features['preposition_density'].append(cls.calculate_lexical_density_by_tags({"IN"}))
        features['type_token_ratio'].append(cls.calculate_type_token_ratio())

    return pd.DataFrame(features)

In [3]:
def extract_text_from_webpage(sites):
    """
    Scraps text from Nyasatimes sites

    :param
        sites: a list of webpages to scrap
    """
    news = []

    for site in sites:
        html = BeautifulSoup(urlopen(Request(site, headers={'User-Agent': 'Mozilla/5.0'})))
        paragraphs = []
        for p_tag in html.find_all("p"):
            if p_tag.parent["class"][0] == "nyasa-content":
                for element in p_tag.contents:
                    if isinstance(element, NavigableString):
                        paragraphs.append(element)
        news.append(" ".join(paragraphs))

    return list(filter(None, news))

In [4]:
sites = [
    "https://www.nyasatimes.com/escom-angers-parley-for-resisting-single-buyer-model/",
    "https://www.nyasatimes.com/govt-auctions-cashgaters-property-to-recover-k1-4bn-from-the-sale/",
    "https://www.nyasatimes.com/major-cabinet-shake-up-as-mlusu-msungama-rashy-gaffar-are-dropped-mlusu-replaced-by-gwengwe-as-finance-minister/",
    "https://www.nyasatimes.com/chakwera-earns-praise-over-his-gender-sensitive-cabinet/",
    "https://www.nyasatimes.com/undule-urges-chakwera-to-strip-chilima-of-economic-planning-portfolio/",
    "https://www.nyasatimes.com/mera-set-to-adjust-pump-fuel-prices-as-cama-demands-review-review-of-many-levies-on-fuel-prices/",
    "https://www.nyasatimes.com/flames-charm-the-world-after-afcon-sensation/",
    "https://www.nyasatimes.com/fdh-group-attracts-a-heap-of-praise-for-supporting-the-incredible-flames/",
    "https://www.nyasatimes.com/against-morocco-flames-have-nothing-to-lose-and-everything-to-win/",
    "https://www.nyasatimes.com/fam-attributes-flames-historic-afcon-performance-to-fdh-banks-invaluable-investment-as-its-official-sponsor/",
]

extract_features_from_text(extract_text_from_webpage(sites))

Unnamed: 0,adj_and_adv_frequency,has_subordinate_words,modal_frequency,peculiar_words,plural_usage,text_reading_ease,article_density,preposition_density,type_token_ratio
0,8.309,False,1.335,0.0,4.154,960.736065,77.151,11.424,0.397626
1,6.436,False,0.0,0.0,3.96,469.980548,59.406,10.891,0.584158
2,7.323,False,0.12,0.0,2.401,504.849678,34.814,11.525,0.411765
3,7.71,False,1.247,1.134,3.628,578.929285,78.231,12.132,0.39229
4,8.617,False,1.474,2.268,4.762,695.269596,86.168,12.698,0.38322
5,8.706,False,0.995,0.0,6.799,495.088064,71.31,12.023,0.33665
6,9.839,False,0.968,0.0,2.419,1858.296737,83.871,10.484,0.464516
7,8.186,False,0.279,0.93,3.628,913.331571,88.372,12.651,0.355349
8,9.332,False,1.06,1.06,3.924,2410.527155,62.566,12.301,0.373277
9,10.909,False,0.364,0.0,3.636,1168.025019,80.0,13.818,0.412727
