# Text Feature Extraction Tool

This is a toy project for extracting linguistic features from arbitrary text. The model is simplistic, but it gives a lot of insight into the sample text.

In [1]:
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from text_classifier import TextClassifier
from urllib.request import Request, urlopen

import pandas as pd

In [2]:
def extract_features_from_text(sentences):
    """
    Extracts linguistic features from every text in a given list

    Each text is wrapped in a TextClassifier and one value per feature is
    collected, so the resulting frame has one row per input text (in input
    order) and one column per feature.

    :param
        sentences: a list of text to analyze
    :return:
        A data frame of features; it has all feature columns but no rows
        when sentences is empty
    """
    # Raw string on purpose: in a normal string literal "\b" is the backspace
    # character (0x08), not the regex word-boundary escape, so the pattern
    # would practically never match real text.
    # NOTE(review): assumes has_peculiar_expression treats its argument as a
    # regular expression (the alternation group suggests so) - confirm.
    subordinate_pattern = r"\b(But|So|Because)"

    features = {
        "adj_and_adv_frequency": [],
        "has_subordinate_words": [],
        "modal_frequency": [],
        "peculiar_words": [],
        "plural_usage": [],
        "text_reading_ease": [],
        "article_density": [],
        "preposition_density": [],
        "type_token_ratio": [],
    }

    for sentence in sentences:
        cls = TextClassifier(sentence)
        # The tag sets below are passed through to the classifier unchanged
        # (presumably Penn Treebank part-of-speech tags - verify against
        # TextClassifier).
        features["adj_and_adv_frequency"].append(cls.calculate_lexical_density_by_tags({"JJ", "RB"}))
        features["has_subordinate_words"].append(cls.has_peculiar_expression(subordinate_pattern))
        features["modal_frequency"].append(cls.calculate_lexical_density_by_tags({"MD"}))
        features["peculiar_words"].append(cls.calculate_words_frequency({"good"}))
        features["plural_usage"].append(cls.calculate_lexical_density_by_tags({"NNS"}))
        features["text_reading_ease"].append(cls.calculate_sentence_reading_ease())
        features["article_density"].append(cls.calculate_words_frequency({"a", "an", "the"}))
        features["preposition_density"].append(cls.calculate_lexical_density_by_tags({"IN"}))
        features["type_token_ratio"].append(cls.calculate_type_token_ratio())

    return pd.DataFrame(features)

In [3]:
def extract_text_from_webpage(sites):
    """
    Scrapes text from Nyasatimes sites

    For every page, the bare text nodes found directly inside <p> tags whose
    parent element has "nyasa-content" as its first CSS class are joined
    with single spaces into one string.

    :param
        sites: a list of webpages (URLs) to scrape
    :return:
        A list of page texts in the order of sites; pages that yield no
        text at all are left out
    """
    news = []

    for site in sites:
        # A browser-like User-Agent is sent with the request (presumably the
        # site rejects urllib's default one - confirm).
        request = Request(site, headers={'User-Agent': 'Mozilla/5.0'})
        # The context manager closes the HTTP response once the markup has
        # been read by the parser, instead of leaking the connection.
        with urlopen(request) as response:
            # Name the parser explicitly so the parse tree does not depend on
            # which optional parsers happen to be installed.
            html = BeautifulSoup(response, "html.parser")

        paragraphs = []
        for p_tag in html.find_all("p"):
            # .get() instead of ["class"]: most parents carry no class
            # attribute at all, which used to raise KeyError and abort the
            # whole scrape.
            parent_classes = p_tag.parent.get("class")
            if not parent_classes or parent_classes[0] != "nyasa-content":
                continue
            # Keep only direct text nodes; nested tags (links, bold, ...) are
            # skipped together with their text.
            paragraphs.extend(
                element for element in p_tag.contents
                if isinstance(element, NavigableString)
            )
        news.append(" ".join(paragraphs))

    # Drop pages for which nothing was extracted (empty strings).
    return list(filter(None, news))