In [1]:
import pandas as pd

credibility_scores = pd.read_excel("Data/webcredibility/web_credibility_1000_url_ratings.xls")

In [2]:
credibility_scores

Unnamed: 0,Topic,Query,Result Rank,URL,Likert Rating
0,Celebrities,adam lambert,1,http://en.wikipedia.org/wiki/Adam_Lambert,5
1,Celebrities,adam lambert,2,http://www.adamofficial.com/us/intro,4
2,Celebrities,adam lambert,3,http://www.adamofficial.com/us/home,4
3,Celebrities,adam lambert,4,http://www.thehollywoodgossip.com/2010/06/new-...,3
4,Celebrities,adam lambert,5,http://www.americanidol.com/contestants/season...,4
...,...,...,...,...,...
995,Politics,Tea Party,36,http://stlouisteaparty.com/,3
996,Politics,Tea Party,37,http://abcnews.go.com/Politics/tea-party-prote...,4
997,Politics,Tea Party,38,http://topics.politico.com/index.cfm/topic/Tea...,3
998,Politics,Tea Party,39,http://www.nationwidechicagoteaparty.com/,3


# Preprocessing

Get the content from the url and get the following statistical features

## Content Features
- #exclamations Number of exclamation marks ”!” in the text
- #commas Number of commas ”,” in the text
- #dots Number of dots ”.” in the text
- #questions Number of question marks ”?” in the text
- #token count Text length as the number of words
- ?polarity 0 if the page is negative, 1 if the page is positive
- #positive Number of positive sentences
- #negative Number of negative sentences
- #subjective Number of subjective sentences
- #objective Number of objective sentences
- #spelling errors Number of spelling errors
- @text complexity Text entropy
- @informativeness Uniqueness of the page’s content relative to other pages
- @smog Statistical measure of text readability
- category Web page category, e.g., Entertainment, Business, etc.
- #NN Number of nouns in the text
- #VB Number of verbs in the text
- #JJ Number of adjectives
- #RB Number of adverbs
- #DT Number of determiners
 
## Appearance Features
- #ad count Number of ads on the webpage
- #ad max size The area in pixels of the biggest ad
- #ad body ratio Ratio of the area of all ads to the area of the page
- #css definitions Number of webpage CSS style definitions

## Meta information
- domain_type eg .com, .org etc.

## Social popularity
- #fb share Number of Facebook shares for a webpage URL
- #fb like Number of Facebook likes for a webpage URL
- #fb comment Number of Facebook comments for a webpage URL
- #fb click Number of Facebook clicks for a webpage URL
- #fb total Total Facebook shares, likes, comments and clicks
- #tweets Number Tweets mentioning a webpage URL
- #bitly clicks Number of Bitly short URL clicks for a webpage
- #bitly referrers Number of web sites having Bitly short URL for a webpage
- #delicious bookmarks Number of Delicious bookmarks for a webpage URL
- @alexa_rank
- #alexa_linksin Number of web site linkings estimated by Alexa
- @page_rank

In [9]:
# Helper functions to extract features
import math
import urllib, sys
from newspaper import Article, ArticleException
from collections import Counter
from spellchecker import SpellChecker
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from textblob import TextBlob
import spacy
from readability import Readability
from readability.exceptions import ReadabilityException
from bs4 import BeautifulSoup as soup
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
import tldextract
from dotenv import load_dotenv

load_dotenv()

# Get the content features
def get_article_content(url):
    article = Article(url)
    article.download()
    article.parse()
    return {
        "authors": article.authors,
        "content": article.text,
    }

def get_punctuations(content):
    counts = Counter(content)
    finders_list = "!,.?"
    required_counts = {k:v for k, v in counts.items() if k in finders_list}
    # get the required punctuations
    punctuation_features = {
        "exclamations": required_counts.get("!", 0),
        "commas": required_counts.get(",", 0),
        "dots": required_counts.get(".", 0),
        "questions": required_counts.get("?", 0)
    }
    return punctuation_features

def get_sentences(content):
    return list(
        filter(
            lambda s: s != "",
            list(
                map(lambda s: s.strip(), content.split("\n"))
            )
        )
    )

def get_readability(content):
    r = Readability(content)
    try:
        return r.smog().score
    except ReadabilityException:
        return None

def get_word_related_stats(sentences):
    nlp = spacy.load("en_core_web_sm")
    content = ". ".join(sentences)
    doc = nlp(content)
    num_words = len(doc)
    c = Counter(([token.pos_ for token in doc]))
    num_verbs = c.get('VERB')
    num_nouns = c.get('NOUN')
    num_adverbs = c.get('ADP')
    num_determiners = c.get('DET')
    all_words = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    spelling_check_words = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop and not token.pos_ == "PROPN"]
    all_words_counter = Counter(all_words)
    entropy = 0
    n = len(all_words)
    for word_count in all_words_counter:
        fi = all_words_counter.get(word_count)
        entropy += fi * (math.log10(n) - math.log10(fi))
    return {
        'words': list(set(all_words)),
        'spell_check': list(set(spelling_check_words)),
        'num_words': num_words,
        'num_nouns': num_nouns,
        'num_verbs': num_verbs,
        'num_adverbs': num_adverbs,
        'num_determiners': num_determiners,
        'text_entropy': entropy
    }

def get_spelling_errors(words):
    checker = SpellChecker()
    return checker.unknown(words)

def get_sentiments_and_subjectivity(sentences):
    dataset_name = 'imdb'
    saved_model_path = './{}_bert'.format(dataset_name.replace('/', '_'))
    reloaded_model = tf.saved_model.load(saved_model_path)
    sentiments = tf.sigmoid(reloaded_model(tf.constant(sentences)))
    sentiments = list(map(lambda sentiment: round(sentiment), sentiments.numpy().flatten().tolist()))
    subjectivity = [round(TextBlob(i).sentiment.subjectivity) for i in sentences]
    
    sentiment_counter = Counter(sentiments)
    num_positive_sentences = sentiment_counter.get(1)
    num_negative_sentences = sentiment_counter.get(0)
    
    subjectivity_counter = Counter(subjectivity)
    num_subjective_sentences = sentiment_counter.get(1)
    num_objective_sentences = sentiment_counter.get(0)
    return {
        "polarity": sentiment_counter.most_common(1)[0][0],
        "num_positive": num_positive_sentences,
        "num_negative": num_negative_sentences,
        "num_subjective": num_subjective_sentences,
        "num_objective": num_objective_sentences
    }

def get_content_features(url):
    article = get_article_content(url)
    content = article.get('content')
    punctuations = get_punctuations(content)
    sentences = get_sentences(content)
    sentiment_dic = get_sentiments_and_subjectivity(sentences)
    words_metrics = get_word_related_stats(sentences)
    spelling_errors = get_spelling_errors(words_metrics.get('words'))
    smog = get_readability(content)
    
    return {
        "content": content,
        "punctuations": punctuations,
        "sentences": sentences,
        "sentiment_dic": sentiment_dic,
        "words_metrics": words_metrics,
        "spelling_errors": spelling_errors,
        "smog": smog
    }


# Gettint the appearance features
def get_ads_blacklist(file_path):
    with open(file_path) as file:
        return file.read().splitlines()

def get_ads(url):
    add_urls = get_ads_blacklist("add_urls.txt")
    
    # Find all the iframes that have add_urls as source
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.headless = True
    browser = webdriver.Chrome("./chromedriver", options=chrome_options)
    browser.set_page_load_timeout(30)
    try:
        browser.get(url)
    except TimeoutException:
        return {
            "ads": [],
            "num_ads": 0
        }
    parsed_html = soup(browser.page_source,"html.parser")
    
    num_ads = 0
    ads = []
    for iframe in parsed_html.findAll("iframe"):
        for url in add_urls:
            if (iframe.get("src") != None and url in iframe.get("src")):
                num_ads += 1
                ads.append(iframe)
        if (iframe.get("name") != None and "google_ads" in iframe.get("name")):
            num_ads += 1
            ads.append(iframe)
    return {
        "ads": ads,
        "num_ads": num_ads
    }

# Meta info
def get_domain_type(url):
    return tldextract.extract(url).suffix

# Get social popularity
def get_social_info(url):
    response = requests.get(f"http://count-server.sharethis.com/v2.0/get_counts?url={url}")
    info = response.json()
    return info.get("shares")

# General popularity
def get_alexa_rank(url):
    return soup(requests.get(f"http://data.alexa.com/data?cli=10&dat=s&url={url}").content, "xml").find("REACH")['RANK']

# A master function to get all required features for a url
def get_features(url):
    try:
        print(f"Getting content features for {url}")
        content_features = get_content_features(url)
        print(f"Getting ads for {url}")
        ads = get_ads(url)
        print(f"Getting domain type for {url}")
        domain_type = get_domain_type(url)
        print(f"Getting social information for {url}")
        social_information = get_social_info(url)
        print(f"Getting alexa rank for {url}")
        alexa_rank = get_alexa_rank(url)
        print(f"Getting smog score for {url}")
        smog_score = get_readability(content_features.get("content"))

        return {
            "exclamations": content_features.get("punctuations").get("exclamations"),
            "commas": content_features.get("punctuations").get("commas"),
            "dots": content_features.get("punctuations").get("dots"),
            "questions": content_features.get("punctuations").get("questions"),
            "token_count": content_features.get("words_metrics").get("num_words"),
            "polarity": content_features.get("sentiment_dic").get("polarity"),
            "positive": content_features.get("sentiment_dic").get("num_positive"),
            "negative": content_features.get("sentiment_dic").get("num_negative"),
            "subjective": content_features.get("sentiment_dic").get("num_subjective"),
            "objective": content_features.get("sentiment_dic").get("num_objective"),
            "spelling_errors": len(content_features.get("spelling_errors")),
            "text_complexity": content_features.get("words_metrics").get("text_entropy"),
            "smog": smog_score,
            "nouns": content_features.get("words_metrics").get("num_nouns"),
            "verbs": content_features.get("words_metrics").get("num_verbs"),
            "adverbs": content_features.get("words_metrics").get("num_adverbs"),
            "determiners": content_features.get("words_metrics").get("num_determiners"),
            "ad_count": ads.get("num_ads"),
            "domain_type": domain_type,
            "fb_shares": social_information.get("facebook"),
            "fb_like": social_information.get("fb_like"),
            "linkedin_shares": social_information.get("linkedin"),
            "alexa_rank": alexa_rank
        }
    except ArticleException:
        return None

In [10]:
url = credibility_scores.iloc[0]["URL"]
features = get_features(url)

Getting content features for http://en.wikipedia.org/wiki/Adam_Lambert
Getting ads for http://en.wikipedia.org/wiki/Adam_Lambert
Getting domain type for http://en.wikipedia.org/wiki/Adam_Lambert
Getting social information for http://en.wikipedia.org/wiki/Adam_Lambert
Getting alexa rank for http://en.wikipedia.org/wiki/Adam_Lambert
Getting smog score for http://en.wikipedia.org/wiki/Adam_Lambert


In [11]:
features

{'exclamations': 7,
 'commas': 675,
 'dots': 439,
 'questions': 0,
 'token_count': 12093,
 'polarity': 1,
 'positive': 137,
 'negative': 2,
 'subjective': 137,
 'objective': 2,
 'spelling_errors': 107,
 'text_complexity': 15330.82322081936,
 'smog': 14.265292616868656,
 'nouns': 1866,
 'verbs': 1050,
 'adverbs': 1501,
 'determiners': 961,
 'ad_count': 0,
 'domain_type': 'org',
 'fb_shares': None,
 'fb_like': None,
 'linkedin_shares': None,
 'alexa_rank': '12'}

# Create a new dataframe considering all the above features along with the given score for the url


In [None]:
# Run only when generating the data frame for the first time, cuz this takes a lot of time 

import warnings

warnings.filterwarnings("ignore")

data = {}

from itertools import chain

for row in range(len(credibility_scores)):
    try:
        url = credibility_scores.iloc[row]["URL"]
        score = credibility_scores.iloc[row]["Likert Rating"]
        score_as_dic = {"rating": score}
        features = get_features(url)
        if features == None:
            continue
        data[row] = dict(chain.from_iterable(d.items() for d in (features, score_as_dic)))
    except:
        continue

In [None]:
features_dframe = pd.DataFrame(data)
features_dframe = features_dframe.transpose()

In [None]:
features_dframe

In [None]:
# Save the generated dataframe, would proably need to employ a new method for getting facebook likes, shareas and linkedin shares
features_dframe.to_csv("Data/extracted_features.csv")

In [5]:
dframe = pd.read_csv("Data/extracted_features.csv")

In [20]:
dframe.rename(columns = {'Unnamed: 0': 'original_index'}, inplace=True)

In [22]:
dframe

Unnamed: 0,original_index,exclamations,commas,dots,questions,token_count,polarity,positive,negative,subjective,...,verbs,adverbs,determiners,ad_count,domain_type,fb_shares,fb_like,linkedin_shares,alexa_rank,rating
0,0,7,675,439,0,12093,1,137.0,2.0,137.0,...,1050.0,1501.0,961.0,0,org,,,,12,5
1,3,1,11,6,1,178,1,5.0,2.0,5.0,...,20.0,18.0,8.0,0,com,,,,70219,3
2,6,0,0,3,1,30,0,,2.0,,...,7.0,2.0,1.0,0,com,,,,1668,4
3,7,2,3,1,0,57,1,2.0,,2.0,...,4.0,8.0,3.0,0,com,,,,70219,3
4,8,0,0,3,1,30,0,,2.0,,...,7.0,2.0,1.0,0,com,,,,1668,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479,983,0,4,4,0,53,0,,1.0,,...,7.0,5.0,2.0,5,com,,,,160808,3
480,987,0,57,58,0,1290,1,4.0,2.0,4.0,...,149.0,151.0,183.0,14,com,,,,326984,4
481,989,0,13,10,0,249,1,7.0,2.0,7.0,...,22.0,28.0,23.0,3,com,,,,10093,4
482,991,0,16,31,1,596,0,6.0,9.0,6.0,...,87.0,58.0,49.0,3,com,,,,22506,3


In [25]:
indices = dframe.original_index.values.tolist()

In [27]:
dframe.columns

Index(['original_index', 'exclamations', 'commas', 'dots', 'questions',
       'token_count', 'polarity', 'positive', 'negative', 'subjective',
       'objective', 'spelling_errors', 'text_complexity', 'smog', 'nouns',
       'verbs', 'adverbs', 'determiners', 'ad_count', 'domain_type',
       'fb_shares', 'fb_like', 'linkedin_shares', 'alexa_rank', 'rating'],
      dtype='object')

In [31]:
# Only there cuz of an error i made during original calculation, skip if you are building for first time
for indice in indices:
    url = credibility_scores.iloc[indice]["URL"]
    try:
        dframe.loc[dframe['original_index'] == indice, 'smog'] = get_readability(get_article_content(url).get('content'))
    except:
        continue



In [36]:
dframe.fillna(0)

Unnamed: 0,original_index,exclamations,commas,dots,questions,token_count,polarity,positive,negative,subjective,...,verbs,adverbs,determiners,ad_count,domain_type,fb_shares,fb_like,linkedin_shares,alexa_rank,rating
0,0,7,675,439,0,12093,1,137.0,2.0,137.0,...,1050.0,1501.0,961.0,0,org,0.0,0.0,0.0,12,5
1,3,1,11,6,1,178,1,5.0,2.0,5.0,...,20.0,18.0,8.0,0,com,0.0,0.0,0.0,70219,3
2,6,0,0,3,1,30,0,0.0,2.0,0.0,...,7.0,2.0,1.0,0,com,0.0,0.0,0.0,1668,4
3,7,2,3,1,0,57,1,2.0,0.0,2.0,...,4.0,8.0,3.0,0,com,0.0,0.0,0.0,70219,3
4,8,0,0,3,1,30,0,0.0,2.0,0.0,...,7.0,2.0,1.0,0,com,0.0,0.0,0.0,1668,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479,983,0,4,4,0,53,0,0.0,1.0,0.0,...,7.0,5.0,2.0,5,com,0.0,0.0,0.0,160808,3
480,987,0,57,58,0,1290,1,4.0,2.0,4.0,...,149.0,151.0,183.0,14,com,0.0,0.0,0.0,326984,4
481,989,0,13,10,0,249,1,7.0,2.0,7.0,...,22.0,28.0,23.0,3,com,0.0,0.0,0.0,10093,4
482,991,0,16,31,1,596,0,6.0,9.0,6.0,...,87.0,58.0,49.0,3,com,0.0,0.0,0.0,22506,3


In [38]:
dframe.describe()

Unnamed: 0,original_index,exclamations,commas,dots,questions,token_count,polarity,positive,negative,subjective,...,nouns,verbs,adverbs,determiners,ad_count,fb_shares,fb_like,linkedin_shares,alexa_rank,rating
count,484.0,484.0,484.0,484.0,484.0,484.0,484.0,440.0,366.0,440.0,...,482.0,477.0,481.0,478.0,484.0,13.0,0.0,2.0,484.0,484.0
mean,510.342975,0.667355,55.52686,52.210744,2.237603,1205.210744,0.733471,16.340909,8.800546,16.340909,...,239.854772,127.419287,121.293139,103.598326,1.78719,1.923077,,8.0,495702.7,3.679752
std,294.92479,4.59666,133.15685,111.908482,9.130713,2639.479357,0.442601,36.837229,14.296128,36.837229,...,523.378458,274.435678,278.092384,226.571275,3.46562,2.253203,,0.0,1512452.0,1.050842
min,0.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,,8.0,11.0,1.0
25%,245.5,0.0,2.0,3.0,0.0,54.5,0.0,2.0,1.0,2.0,...,11.0,8.0,5.0,4.0,0.0,1.0,,8.0,480.0,3.0
50%,506.0,0.0,16.0,16.0,0.0,376.0,1.0,5.0,4.0,5.0,...,70.5,37.0,35.0,32.5,0.0,1.0,,8.0,9506.0,4.0
75%,769.25,0.0,46.25,51.0,1.0,1152.0,1.0,15.0,9.0,15.0,...,229.0,121.0,113.0,99.75,2.0,1.0,,8.0,108159.0,4.0
max,996.0,88.0,1255.0,970.0,147.0,20590.0,1.0,423.0,111.0,423.0,...,3977.0,2689.0,2212.0,1703.0,21.0,7.0,,8.0,9603127.0,5.0


In [39]:
# Encode the categorical data
dframe.dtypes

original_index       int64
exclamations         int64
commas               int64
dots                 int64
questions            int64
token_count          int64
polarity             int64
positive           float64
negative           float64
subjective         float64
objective          float64
spelling_errors      int64
text_complexity    float64
smog               float64
nouns              float64
verbs              float64
adverbs            float64
determiners        float64
ad_count             int64
domain_type         object
fb_shares          float64
fb_like            float64
linkedin_shares    float64
alexa_rank           int64
rating               int64
dtype: object

In [41]:
dframe["domain_type"] = dframe["domain_type"].astype('category')

In [46]:
dframe["domain_type_cat"] = dframe["domain_type"].cat.codes

In [50]:
dframe

Unnamed: 0,original_index,exclamations,commas,dots,questions,token_count,polarity,positive,negative,subjective,...,adverbs,determiners,ad_count,domain_type,fb_shares,fb_like,linkedin_shares,alexa_rank,rating,domain_type_cat
0,0,7,675,439,0,12093,1,137.0,2.0,137.0,...,1501.0,961.0,0,org,0.0,0.0,0.0,12,5,9
1,3,1,11,6,1,178,1,5.0,2.0,5.0,...,18.0,8.0,0,com,0.0,0.0,0.0,70219,3,3
2,6,0,0,3,1,30,0,0.0,2.0,0.0,...,2.0,1.0,0,com,0.0,0.0,0.0,1668,4,3
3,7,2,3,1,0,57,1,2.0,0.0,2.0,...,8.0,3.0,0,com,0.0,0.0,0.0,70219,3,3
4,8,0,0,3,1,30,0,0.0,2.0,0.0,...,2.0,1.0,0,com,0.0,0.0,0.0,1668,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479,983,0,4,4,0,53,0,0.0,1.0,0.0,...,5.0,2.0,5,com,0.0,0.0,0.0,160808,3,3
480,987,0,57,58,0,1290,1,4.0,2.0,4.0,...,151.0,183.0,14,com,0.0,0.0,0.0,326984,4,3
481,989,0,13,10,0,249,1,7.0,2.0,7.0,...,28.0,23.0,3,com,0.0,0.0,0.0,10093,4,3
482,991,0,16,31,1,596,0,6.0,9.0,6.0,...,58.0,49.0,3,com,0.0,0.0,0.0,22506,3,3


# Training SVM classifier for estimating the ratings

In [51]:
train_dframe = dframe.drop(['original_index', 'domain_type'], axis=1)

In [52]:
train_dframe

Unnamed: 0,exclamations,commas,dots,questions,token_count,polarity,positive,negative,subjective,objective,...,verbs,adverbs,determiners,ad_count,fb_shares,fb_like,linkedin_shares,alexa_rank,rating,domain_type_cat
0,7,675,439,0,12093,1,137.0,2.0,137.0,2.0,...,1050.0,1501.0,961.0,0,0.0,0.0,0.0,12,5,9
1,1,11,6,1,178,1,5.0,2.0,5.0,2.0,...,20.0,18.0,8.0,0,0.0,0.0,0.0,70219,3,3
2,0,0,3,1,30,0,0.0,2.0,0.0,2.0,...,7.0,2.0,1.0,0,0.0,0.0,0.0,1668,4,3
3,2,3,1,0,57,1,2.0,0.0,2.0,0.0,...,4.0,8.0,3.0,0,0.0,0.0,0.0,70219,3,3
4,0,0,3,1,30,0,0.0,2.0,0.0,2.0,...,7.0,2.0,1.0,0,0.0,0.0,0.0,1668,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479,0,4,4,0,53,0,0.0,1.0,0.0,1.0,...,7.0,5.0,2.0,5,0.0,0.0,0.0,160808,3,3
480,0,57,58,0,1290,1,4.0,2.0,4.0,2.0,...,149.0,151.0,183.0,14,0.0,0.0,0.0,326984,4,3
481,0,13,10,0,249,1,7.0,2.0,7.0,2.0,...,22.0,28.0,23.0,3,0.0,0.0,0.0,10093,4,3
482,0,16,31,1,596,0,6.0,9.0,6.0,9.0,...,87.0,58.0,49.0,3,0.0,0.0,0.0,22506,3,3


In [53]:
# Let us do a bit of feature engineering
# Ignore all warnings
import warnings
warnings.filterwarnings('ignore')

# Import helper libraries
from sklearn.externals.joblib import dump, load

# Import libraries
    
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from scipy import stats

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

ModuleNotFoundError: No module named 'sklearn.externals.joblib'