In [1]:
import pandas as pd

credibility_scores = pd.read_excel("Data/webcredibility/web_credibility_1000_url_ratings.xls")

In [2]:
credibility_scores

Unnamed: 0,Topic,Query,Result Rank,URL,Likert Rating
0,Celebrities,adam lambert,1,http://en.wikipedia.org/wiki/Adam_Lambert,5
1,Celebrities,adam lambert,2,http://www.adamofficial.com/us/intro,4
2,Celebrities,adam lambert,3,http://www.adamofficial.com/us/home,4
3,Celebrities,adam lambert,4,http://www.thehollywoodgossip.com/2010/06/new-...,3
4,Celebrities,adam lambert,5,http://www.americanidol.com/contestants/season...,4
...,...,...,...,...,...
995,Politics,Tea Party,36,http://stlouisteaparty.com/,3
996,Politics,Tea Party,37,http://abcnews.go.com/Politics/tea-party-prote...,4
997,Politics,Tea Party,38,http://topics.politico.com/index.cfm/topic/Tea...,3
998,Politics,Tea Party,39,http://www.nationwidechicagoteaparty.com/,3


# Preprocessing

Get the content from the url and get the following statistical features

## Content Features
- #exclamations Number of exclamation marks ”!” in the text
- #commas Number of commas ”,” in the text
- #dots Number of dots ”.” in the text
- #questions Number of question marks ”?” in the text
- #token count Text length as the number of words
- ?polarity 0 if the page is negative, 1 if the page is positive
- #positive Number of positive sentences
- #negative Number of negative sentences
- #subjective Number of subjective sentences
- #objective Number of objective sentences
- #spelling errors Number of spelling errors
- @text complexity Text entropy
- @informativeness Uniqueness of the page’s content relative to other pages
- @smog Statistical measure of text readability
- category Web page category, e.g., Entertainment, Business, etc.
- #NN Number of nouns in the text
- #VB Number of verbs in the text
- #JJ Number of adjectives
- #RB Number of adverbs
- #DT Number of determiners
 
## Appearance Features
- #ad count Number of ads on the webpage
- #ad max size The area in pixels of the biggest ad
- #ad body ratio Ratio of the area of all ads to the area of the page
- #css definitions Number of webpage CSS style definitions

## Meta information
- domain_type eg .com, .org etc.

## Social popularity
- #fb share Number of Facebook shares for a webpage URL
- #fb like Number of Facebook likes for a webpage URL
- #fb comment Number of Facebook comments for a webpage URL
- #fb click Number of Facebook clicks for a webpage URL
- #fb total Total Facebook shares, likes, comments and clicks
- #tweets Number Tweets mentioning a webpage URL
- #bitly clicks Number of Bitly short URL clicks for a webpage
- #bitly referrers Number of web sites having Bitly short URL for a webpage
- #delicious bookmarks Number of Delicious bookmarks for a webpage URL
- @alexa_rank
- #alexa_linksin Number of web site linkings estimated by Alexa
- @page_rank

In [74]:
# Helper functions to extract features
import math
from newspaper import Article
from collections import Counter
from spellchecker import SpellChecker
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from textblob import TextBlob
import spacy

def get_article_content(url):
    article = Article(url)
    article.download()
    article.parse()
    return {
        "authors": article.authors,
        "content": article.text,
    }

def get_punctuations(content):
    counts = Counter(content)
    finders_list = "!,.?"
    required_counts = {k:v for k, v in counts.items() if k in finders_list}
    # get the required punctuations
    punctuation_features = {
        "exclamations": required_counts.get("!", 0),
        "commas": required_counts.get(",", 0),
        "dots": required_counts.get(".", 0),
        "questions": required_counts.get("?", 0)
    }
    return punctuation_features

def get_sentences(content):
    return list(
        filter(
            lambda s: s != "",
            list(
                map(lambda s: s.strip(), content.split("\n"))
            )
        )
    )

def get_word_related_stats(sentences):
    nlp = spacy.load("en_core_web_sm")
    content = ". ".join(sentences)
    doc = nlp(content)
    num_words = len(doc)
    c = Counter(([token.pos_ for token in doc]))
    num_verbs = c.get('VERB')
    num_nouns = c.get('NOUN')
    num_adverbs = c.get('ADP')
    num_determiners = c.get('DET')
    all_words = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    spelling_check_words = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop and not token.pos_ == "PROPN"]
    all_words_counter = Counter(all_words)
    entropy = 0
    n = len(all_words)
    for word_count in all_words_counter:
        fi = all_words_counter.get(word_count)
        entropy += fi * (math.log10(n) - math.log10(fi))
    return {
        'words': list(set(all_words)),
        'spell_check': list(set(spelling_check_words)),
        'num_words': num_words,
        'num_verbs': num_verbs,
        'num_adverbs': num_adverbs,
        'num_determiners': num_determiners,
        'text_entropy': entropy
    }

def get_spelling_errors(words):
    checker = SpellChecker()
    return checker.unknown(words)

def get_sentiments_and_subjectivity(sentences):
    dataset_name = 'imdb'
    saved_model_path = './{}_bert'.format(dataset_name.replace('/', '_'))
    reloaded_model = tf.saved_model.load(saved_model_path)
    sentiments = tf.sigmoid(reloaded_model(tf.constant(sentences)))
    sentiments = list(map(lambda sentiment: round(sentiment), sentiments.numpy().flatten().tolist()))
    subjectivity = [round(TextBlob(i).sentiment.subjectivity) for i in sentences]
    return [sentiments, subjectivity]

In [75]:
article = get_article_content(credibility_scores.iloc[0]["URL"])
content = article.get('content')
punctuations = get_punctuations(content)
sentences = get_sentences(content)
sentiments, subjectivity = get_sentiments_and_subjectivity(sentences)
words_metrics = get_word_related_stats(sentences)
spelling_errors = get_spelling_errors(words_metrics.get('words'))



In [80]:
words_metrics.get("text_entropy")

15330.82322081936