This code is based on the work presented in paper, Toubia, O., Berger, J.A., & Eliashberg, J. (2021). How quantifying the shape of stories predicts their success. Proceedings of the National Academy of Sciences of the United States of America, 118.

They propose quantifying the text by calculating their speed, volume and circuitousness

speed- sum of distance between consecutive points or chunks of text divided by length of text

volume- Volume of the the minimum volume ellipsoid that covers all points

circuitousness- ratio of actual distance travelled between text points to the shortest path


the three quantities are calculated using python and nlp libraries and tested on datasets

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!mkdir ~/.kaggle/
!mv kaggle.json ~/.kaggle/

mkdir: cannot create directory ‘/root/.kaggle/’: File exists


In [None]:
!chmod 600 /root/.kaggle
!kaggle datasets download -d prajwalkanade/sentiment-analysis-word-lists-dataset

Downloading sentiment-analysis-word-lists-dataset.zip to /content
  0% 0.00/21.7k [00:00<?, ?B/s]
100% 21.7k/21.7k [00:00<00:00, 38.2MB/s]


In [None]:
!kaggle datasets download -d adarshsng/googlenewsvectors

Downloading googlenewsvectors.zip to /content
100% 1.63G/1.64G [00:19<00:00, 123MB/s]
100% 1.64G/1.64G [00:19<00:00, 92.2MB/s]


In [None]:
with ZipFile("/content/sentiment-analysis-word-lists-dataset.zip", "r") as f:
  f.extractall("/content")

In [None]:
with ZipFile("/content/googlenewsvectors.zip", "r") as f:
  f.extractall("/content")

In [None]:
import numpy as np
import pandas as pd
import nltk
from zipfile import ZipFile
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
nltk.download('stopwords')
lemma = WordNetLemmatizer()
stop_words = stopwords.words('english')
import string

my_punct = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '.',
           '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_',
           '`', '{', '|', '}', '~', '»', '«', '“', '”', '..', '...', '),', ').', "'."]

#convert text into tokens and convert into lower case
def tokenize(text):
    tokens=wordpunct_tokenize(text)
    tokens=[w.lower() for w in tokens]
    tokens=[token for token in tokens if len(token)>1 and token not in my_punct]
    return tokens

#preprocess data
def preprocess_text(tokens):
    processed_tokens=[t for t in tokens if t not in stop_words]
    lemmatized=[lemma.lemmatize(t) for t in processed_tokens]
    return lemmatized

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
with open("positive-words.txt", 'r') as f:
    positive_words=f.readlines()
positive_words=[w.strip("\n") for w in positive_words]


with open("negative-words.txt", encoding = "ISO-8859-1") as f:
    negative_words=f.readlines()
negative_words=[w.strip("\n") for w in negative_words]

def pos_count(words):
    return len([i for i in words if i in positive_words])

def neg_count(words):
    return len([i for i in words if i in negative_words])

def sentiment_count(words):
    if pos_count(words)==0 and neg_count(words)==0:
        return 0
    else:
        pos=pos_count(words)
        neg=neg_count(words)
        return (pos-neg)/len(words)


In [None]:
#import Google News word2vec model and use it to find the vectors of words in 300 dimensional
#vector space
from gensim.models import KeyedVectors

w2v_model = KeyedVectors.load_word2vec_format('/content/GoogleNews-vectors-negative300.bin', binary=True)

def convert_to_vectors(tokens):
    vectors=[]
    for x in tokens:
        if x.isspace():
            x.strip()
        if x in w2v_model:
            vectors.append(w2v_model[x])
        else:
            continue     #if token not present in w2v vocab, continue

    return vectors


In [None]:
def break_into_windows(document, window_size=250):  #break text into windows of approximately same size
#tokenise into sentences and add to windows
    sentences=nltk.sent_tokenize(document)
    size=len(tokenize(document))
    if 50<size & size<=100:
        window_size=10
    elif 100<size & size<=250:
        window_size=16
    elif 250<size & size<=500:
        window_size=50
    else: pass


    window_len=0  #initialise window length
    windows=[]
    current_window=[]

    for sentence in sentences:
        sent_len=len(sentence.split())

        if window_len+sent_len>window_size:
            windows.append(". ".join(current_window))
            current_window=[]
            window_len=0


        current_window.append(sentence)
        window_len+=sent_len

    if current_window:
        windows.append(". ".join(current_window))

    return windows


In [None]:
# calculate average vector from vectors of text windows
def avg_text_window(vectors):
    if len(vectors)==0:
        return [0]*300

    vec_array=np.array(vectors)
    avg_vector=np.sum(vec_array, axis=0)
    avg_vector_=avg_vector/len(vectors)
    return avg_vector_

In [None]:
#to calculate distance between two chunks by calculating distance between their average vectors
def distance(x1, x2):
    return np.linalg.norm(np.subtract(x1,x2))

#calculating pace or speed of text.
#speed is calculated as sum of distance between consecutive points or chunks of text divided by length of text

#consecutive chunks which are farther away are more likely to cover different themes and topics

def speed(avg_vectors):
    T=len(avg_vectors)
    dist=0
    for i in range(1,T):
        dist+=distance(avg_vectors[i],avg_vectors[i-1])
    if (T-1)==0:
        return 0
    speed= float(dist)/(T-1)
    return speed

In [None]:
import math
import scipy
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

max_value=1.0e+5

#measuring volume is done to analyse how much ground is covered by text, i.e,
#different themes covered
#volume is calculated by finding the minimum volume ellipsoid that covers all points
def calculate_volume(avg_vectors):
    geo_mean=1
    P=np.array(avg_vectors, dtype='float')
    if len(avg_vectors)==301:
        #create a Convex Hull Object
        convex_str=scipy.spatial.ConvexHull(P, qhull_options='QJ')
        #find centroid of convex Hull
        centroid=np.mean(P[convex_str.vertices])
        #Cavraince matrix of the convex hull
        covariance_matrix=np.cov(P[convex_str.vertices].T)
        w,v=np.linalg.eig(covariance_matrix)

        dimension=len(w)
        volume=dimension*math.pi
        for x in w:
            if np.isclose(x,0):
                continue
            volume*=1/np.sqrt(abs(x))

            geo_mean*=x**(1/dimension)

#volume is normalised by its dimensionality
        normalised_volume=volume/geo_mean

    else:
        pca=PCA(n_components=P.shape[0]-1)
        reduced_P=pca.fit_transform(P)
        #create a Convex Hull Object
        convex_str=scipy.spatial.ConvexHull(reduced_P, qhull_options='QJ')
        #find centroid of convex Hull
        centroid=np.mean(reduced_P[convex_str.vertices])
        #Cavraince matrix of the convex hull
        covariance_matrix=np.cov(reduced_P[convex_str.vertices].T)
        #eigen values and eigen vectors of covariance matrix
        w,v=np.linalg.eig(covariance_matrix)
        dimension=len(w)
        volume=dimension*math.pi
        for x in w:
            if np.isclose(x,0):
                continue
            volume*=1/np.sqrt(abs(x))
            geo_mean*=x**(1/dimension)

#         volume is normalised by its dimensionality
        normalised_volume=volume*geo_mean


    return normalised_volume


#covering a lot of ground allows audience to connect and see a wide range of topics but increases
#cognitive burden

In [None]:
import itertools

#creating an adjacency matrix

def create_adj_matrix(avg_vectors):
    adj_matrix=np.zeros((len(avg_vectors), len(avg_vectors)))
    for i, j in itertools.combinations(range(len(avg_vectors)), 2):
        adj_matrix[i][j]=distance(avg_vectors[i], avg_vectors[j])
        adj_matrix[j][i]=adj_matrix[i][j]

    return adj_matrix



In [None]:
#find shortest path between all vectors with minimal spanning tree
def prim_alg(matrix):
    N=matrix.shape[0]         #number of nodes
    nodes=[0]*N
    nodes[0]=True  #selected node
    no_of_nodes=0
    w=0 #weight of minimum spanning tree
    while(no_of_nodes<N-1):
        minimum=9999999
        for i in range(N):
            if nodes[i]:
                for j in range(N):
                    if (not nodes[j]) and matrix[i][j]:  #if node is not selected and there is an edge with selected node
                        if matrix[i][j]<minimum:
                            minimum=matrix[i][j]
                            nodes[j]=True
                            w+=minimum
                            no_of_nodes+=1
                            break  #only one node to be selected so that there is no cycle

    return w   #return weight of the minimum path


In [None]:
#circuitousness is the ratio of actual distance travelled between text points to the shortest path
#it is an optimzation problem and a modified version of TSP
def circuitousness(avg_vectors):
    dist=0
    for i in range(len(avg_vectors)):
        dist+=distance(avg_vectors[i], avg_vectors[i-1])

    adj_matrix=create_adj_matrix(avg_vectors)
    weight=prim_alg(adj_matrix)
    circuitousness=dist/weight
    return circuitousness

###### This is an open source dataset composed of millions of news articles mostly scraped from a curated list of 1001 domains from http://www.opensources.co/. Because the list does not contain many reliable  websites, additionally NYTimes and WebHose English News Articles articles has been included to better balance the classes.

In [None]:
!wget https://github.com/several27/FakeNewsCorpus/releases/download/v1.0/news.csv.z01
!wget https://github.com/several27/FakeNewsCorpus/releases/download/v1.0/news.csv.z02
!wget https://github.com/several27/FakeNewsCorpus/releases/download/v1.0/news.csv.z03
!wget https://github.com/several27/FakeNewsCorpus/releases/download/v1.0/news.csv.z04
!wget https://github.com/several27/FakeNewsCorpus/releases/download/v1.0/news.csv.z05
!wget https://github.com/several27/FakeNewsCorpus/releases/download/v1.0/news.csv.z06
!wget https://github.com/several27/FakeNewsCorpus/releases/download/v1.0/news.csv.z07
!wget https://github.com/several27/FakeNewsCorpus/releases/download/v1.0/news.csv.z08
!wget https://github.com/several27/FakeNewsCorpus/releases/download/v1.0/news.csv.z09

!wget https://github.com/several27/FakeNewsCorpus/releases/download/v1.0/news.csv.zip


--2024-02-11 08:42:15--  https://github.com/several27/FakeNewsCorpus/releases/download/v1.0/news.csv.z01
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/119894144/6c9e1400-3ec5-11ea-8eab-9942584ac3db?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240211%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240211T084215Z&X-Amz-Expires=300&X-Amz-Signature=cc083bef6b0eee5eaf5cd0214038863647d9286f67156c448fe1fec454b03816&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=119894144&response-content-disposition=attachment%3B%20filename%3Dnews.csv.z01&response-content-type=application%2Foctet-stream [following]
--2024-02-11 08:42:15--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/119894144/6c9e1400-3ec5-11ea-8eab-9942584ac3db?X-Amz-Al

In [None]:
!zip -F news.csv.zip --out fixed.zip

In [None]:
!unzip fixed.zip

In [None]:
df = pd.read_csv("/content/news_cleaned_2018_02_13.csv", nrows = 100000, engine = 'python', encoding = 'utf-8', encoding_errors = 'ignore', on_bad_lines = 'skip')
df.head()

Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0,0,2,express.co.uk,rumor,https://www.express.co.uk/news/science/738402/...,"Life is an illusion, at least on a quantum lev...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Is life an ILLUSION? Researchers prove 'realit...,Sean Martin,,[''],THE UNIVERSE ceases to exist when we are not l...,,,
1,1,6,barenakedislam.com,hate,http://barenakedislam.com/category/donald-trum...,"Unfortunately, he hasn’t yet attacked her for ...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",,[''],,,,
2,2,7,barenakedislam.com,hate,http://barenakedislam.com/category/donald-trum...,The Los Angeles Police Department has been den...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Donald Trump,"Linda Rivera, Conrad Calvano, Az Gal, Lincoln ...",,[''],,,,
3,3,8,barenakedislam.com,hate,http://barenakedislam.com/2017/12/24/more-winn...,The White House has decided to quietly withdra...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"MORE WINNING! Israeli intelligence source, DEB...","Cleavis Nowell, Cleavisnowell, Clarence J. Fei...",,[''],,,,
4,4,9,barenakedislam.com,hate,http://barenakedislam.com/2017/12/25/oh-trump-...,“The time has come to cut off the tongues of t...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"“Oh, Trump, you coward, you just wait, we will...","F.N. Lehner, Don Spilman, Clarence J. Feinour,...",,[''],,,,


In [None]:
df_news = df[["domain", "type", "content", "title"]]
df_news.isna().sum()

domain        0
type       3511
content       0
title        31
dtype: int64

In [None]:
df_news.dropna(axis = 0, inplace = True)
df_news.reset_index()
df_news.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_news.dropna(axis = 0, inplace = True)


domain     0
type       0
content    0
title      0
dtype: int64

In [None]:
print("-----------------------------------NEWS WITH FAKE TAG----------------------------------------")

fake = df_news.loc[df_news["type"] == "fake"]


speed__=[]
sentiment=[]
circuitousness__=[]
volume=[]
for i, data in fake.iterrows():
    avg_vectors=[]
    windows=break_into_windows(data["content"].strip())
    for window in windows:
        tokens=tokenize(window)
        vectors=convert_to_vectors(tokens)
        avg_vectors.append(avg_text_window(vectors))
        preprocessed_text=preprocess_text(tokens)
    speed_=speed(avg_vectors)
    speed__.append(speed_)

    sentiment_=sentiment_count(preprocessed_text)
    sentiment.append(sentiment_)

    circuitousness_=circuitousness(avg_vectors)
    circuitousness__.append(circuitousness_)

    if len(avg_vectors)>2:
        volume_=calculate_volume(avg_vectors)
    else: volume_=0
    volume.append(volume_)


    print(data["title"])
    print(f"Speed: {speed_}")
    print(f"Circuitousness: {circuitousness_}")
    print(f"Positive Count: {pos_count(preprocessed_text)}")
    print(f"Negative Count: {neg_count(preprocessed_text)}")
    print(f"Sentiment Score: {sentiment_}")
    print(f"volume: {volume_}")

    print("")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Instagram App For Android Is Better Than iPhone One, Say Developers
Speed: 0.6014182865619659
Circuitousness: 1.085145336162161
Positive Count: 0
Negative Count: 0
Sentiment Score: 0
volume: 68976706558.59047

Download Android Apps APK To Desktop Using APK Downloader Extension For Chrome
Speed: 0.6178776296702299
Circuitousness: 1.1238982752136626
Positive Count: 0
Negative Count: 2
Sentiment Score: -0.08333333333333333
volume: 6965956849.401455

Samsung Galaxy S 3 Display Will Be A Super AMOLED HD Plus Of 4.8″
Speed: 0.5938021513548765
Circuitousness: 1.1553841736667043
Positive Count: 0
Negative Count: 0
Sentiment Score: 0
volume: 4749435840.156563

PC Koubou launches the Lesance NB P3534-SP
Speed: 0.7474018692970276
Circuitousness: 1.0518811505586538
Positive Count: 0
Negative Count: 2
Sentiment Score: -0.08333333333333333
volume: 66269718.568380006

Fujitsu launches the LIFEBOOK AH56/G
Speed: 0.7733812093734741
Circui

In [None]:
fake_scores = pd.DataFrame()
fake_scores['title']= fake['title']
fake_scores["content"] = fake['content']
fake_scores['speed']= speed__
fake_scores['circuitousness'] = circuitousness__
fake_scores['sentiment_score'] = sentiment
fake_scores['volume'] = volume

# fake.to_csv("scores.csv")

In [None]:
print("-----------------------------------NEWS WITH HATE TAG----------------------------------------")

hate = df_news.loc[df_news["type"] == "hate"]


speed__=[]
sentiment=[]
circuitousness__=[]
volume=[]
for text, title in zip(hate['content'], hate['title']):
    avg_vectors=[]
    windows=break_into_windows(text.strip())
    for window in windows:
        tokens=tokenize(window)
        vectors=convert_to_vectors(tokens)
        avg_vectors.append(avg_text_window(vectors))
        preprocessed_text=preprocess_text(tokens)
    speed_=speed(avg_vectors)
    speed__.append(speed_)

    sentiment_=sentiment_count(preprocessed_text)
    sentiment.append(sentiment_)

    circuitousness_=circuitousness(avg_vectors)
    circuitousness__.append(circuitousness_)

    if len(avg_vectors)>2:
        volume_=calculate_volume(avg_vectors)
    else: volume_=0
    volume.append(volume_)


    print(title)
    print(f"Speed: {speed_}")
    print(f"Circuitousness: {circuitousness_}")
    print(f"Positive Count: {pos_count(preprocessed_text)}")
    print(f"Negative Count: {neg_count(preprocessed_text)}")
    print(f"Sentiment Score: {sentiment_}")
    print(f"volume: {volume_}")
    print("")

-----------------------------------NEWS WITH HATE TAG----------------------------------------
Donald Trump
Speed: 0.9824905769049499
Circuitousness: 1.3966052227176344
Positive Count: 0
Negative Count: 1
Sentiment Score: -0.16666666666666666
volume: 23.4763127562263

Donald Trump
Speed: 0.9539715143670847
Circuitousness: 1.358464991869609
Positive Count: 1
Negative Count: 0
Sentiment Score: 0.1111111111111111
volume: 23.42965445821037

MORE WINNING! Israeli intelligence source, DEBKAfile, confirms the Trump Administration to cut off ties with the Palestinians, which means no peace plan, no more financial aid
Speed: 0.6216347034160907
Circuitousness: 1.0377856876279115
Positive Count: 0
Negative Count: 3
Sentiment Score: -0.1
volume: 574416146194.7621

“Oh, Trump, you coward, you just wait, we will dig your grave by means of the Islamic Caliphate”
Speed: 0
Circuitousness: nan
Positive Count: 3
Negative Count: 2
Sentiment Score: 0.05263157894736842
volume: 0

Following Guatemala’s decisi

In [None]:
hate_scores=pd.DataFrame()
hate_scores['title']=hate['title']
hate_scores['content'] = hate['content']
hate_scores['speed']=speed__
hate_scores['circuitousness']=circuitousness__
hate_scores['sentiment_score']=sentiment
hate_scores['volume']=volume
# hate_scores.to_csv("C:/Users/PC/OneDrive/Desktop/corpus/hate_scores.csv")

In [None]:
print("-----------------------------------NEWS WITH JUNKSCIENCE TAG----------------------------------------")

junkscience=df_news.loc[df_news["type"] == "junksci"]


speed__=[]
sentiment=[]
circuitousness__=[]
volume=[]
for text, title in zip(junkscience['content'], junkscience['title']):
    avg_vectors=[]
    windows=break_into_windows(text.strip())
    for window in windows:
        tokens=tokenize(window)
        vectors=convert_to_vectors(tokens)
        avg_vectors.append(avg_text_window(vectors))
        preprocessed_text=preprocess_text(tokens)
    speed_=speed(avg_vectors)
    speed__.append(speed_)

    sentiment_=sentiment_count(preprocessed_text)
    sentiment.append(sentiment_)

    circuitousness_=circuitousness(avg_vectors)
    circuitousness__.append(circuitousness_)

    if len(avg_vectors)>2:
        volume_=calculate_volume(avg_vectors)
    else: volume_=0
    volume.append(volume_)


    print(title)
    print(f"Speed: {speed_}")
    print(f"Circuitousness: {circuitousness_}")
    print(f"Positive Count: {pos_count(preprocessed_text)}")
    print(f"Negative Count: {neg_count(preprocessed_text)}")
    print(f"Sentiment Score: {sentiment_}")
    print(f"volume: {volume_}")
    print("")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Pine bark news, articles and information:
Speed: 1.0317048761579726
Circuitousness: 1.0885047662786334
Positive Count: 0
Negative Count: 0
Sentiment Score: 0
volume: 760045.1985717936

pesticide articles and information on the NaturalNews Network, the independent health news source
Speed: 0.3312976658344269
Circuitousness: 1.5184840019412575
Positive Count: 6
Negative Count: 0
Sentiment Score: 0.09836065573770492
volume: 6.283185307179586

Washington rejects restriction on honeybee-killing garden pesticide despite evidence of its deadliness
Speed: 0.39617596566677094
Circuitousness: 1.484753957854454
Positive Count: 0
Negative Count: 6
Sentiment Score: -0.2
volume: 6.283185307179587

Argentinians protest Monsanto as pesticide usage increases rates of birth defects, cancer
Speed: 0.3493105322122574
Circuitousness: 1.5596690122468029
Positive Count: 4
Negative Count: 7
Sentiment Score: -0.04
volume: 6.283185307179586

Diabe

In [None]:
junkscience_scores=pd.DataFrame()
junkscience_scores['title']=junkscience['title']
junkscience_scores['speed']=speed__
junkscience_scores['circuitousness']=circuitousness__
junkscience_scores['sentiment_score']=sentiment
junkscience_scores['volume']=volume
# junkscience_scores.to_csv("C:/Users/PC/OneDrive/Desktop/corpus/junkscience_scores.csv")

In [None]:
print("-----------------------------------NEWS WITH CLICKBAIT TAG----------------------------------------")

clickbait=df_news.loc[df_news["type"] == "clickbait"]


speed__=[]
sentiment=[]
circuitousness__=[]
volume=[]
for text, title in zip(clickbait['content'], clickbait['title']):
    avg_vectors=[]
    windows=break_into_windows(text.strip())
    for window in windows:
        tokens=tokenize(window)
        vectors=convert_to_vectors(tokens)
        avg_vectors.append(avg_text_window(vectors))
        preprocessed_text=preprocess_text(tokens)
    speed_=speed(avg_vectors)
    speed__.append(speed_)

    sentiment_=sentiment_count(preprocessed_text)
    sentiment.append(sentiment_)

    circuitousness_=circuitousness(avg_vectors)
    circuitousness__.append(circuitousness_)

    if len(avg_vectors)>2:
        volume_=calculate_volume(avg_vectors)
    else: volume_=0
    volume.append(volume_)


    print(title)
    print(f"Speed: {speed_}")
    print(f"Circuitousness: {circuitousness_}")
    print(f"Positive Count: {pos_count(preprocessed_text)}")
    print(f"Negative Count: {neg_count(preprocessed_text)}")
    print(f"Sentiment Score: {sentiment_}")
    print(f"volume: {volume_}")
    print("")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
portal.liberalamerica.org
Speed: 0
Circuitousness: nan
Positive Count: 2
Negative Count: 1
Sentiment Score: 0.08333333333333333
volume: 0

portal.liberalamerica.org
Speed: 1.5330308145691507
Circuitousness: 1.6172246109082866
Positive Count: 0
Negative Count: 0
Sentiment Score: 0
volume: 16.34168409184703

Christian Militia On Facebook Claims Authority To Shoot & Kill Obama
Speed: 1.2411083404558703
Circuitousness: 1.2844687148147007
Positive Count: 0
Negative Count: 0
Sentiment Score: 0
volume: 2140.9907240429984

Could Google Maps Be A Threat To National Security? Just Ask The Secret Service
Speed: 0.29452815651893616
Circuitousness: 1.5929342007195746
Positive Count: 6
Negative Count: 7
Sentiment Score: -0.008928571428571428
volume: 6.283185307179586

Internet security Archives
Speed: 0.9034642228991567
Circuitousness: 1.1820485970189945
Positive Count: 2
Negative Count: 0
Sentiment Score: 0.18181818181818182
volume: 3

In [None]:
clickbait_scores=pd.DataFrame()
clickbait_scores['title']=clickbait['title']
clickbait_scores['speed']=speed__
clickbait_scores['circuitousness']=circuitousness__
clickbait_scores['sentiment_score']=sentiment
clickbait_scores['volume']=volume
# clickbait_scores.to_csv("C:/Users/PC/OneDrive/Desktop/corpus/clickbait_scores.csv")

In [None]:
print("-----------------------------------NEWS WITH POLITICAL TAG----------------------------------------")

political = df_news.loc[df_news["type"] == "political"]


speed__=[]
sentiment=[]
circuitousness__=[]
volume=[]
for text, title in zip(political['content'], political['title']):
    avg_vectors=[]
    windows=break_into_windows(text.strip())
    for window in windows:
        tokens=tokenize(window)
        vectors=convert_to_vectors(tokens)
        avg_vectors.append(avg_text_window(vectors))
        preprocessed_text=preprocess_text(tokens)
    speed_=speed(avg_vectors)
    speed__.append(speed_)

    sentiment_=sentiment_count(preprocessed_text)
    sentiment.append(sentiment_)

    circuitousness_=circuitousness(avg_vectors)
    circuitousness__.append(circuitousness_)

    if len(avg_vectors)>2:
        volume_=calculate_volume(avg_vectors)
    else: volume_=0
    volume.append(volume_)


    print(title)
    print(f"Speed: {speed_}")
    print(f"Circuitousness: {circuitousness_}")
    print(f"Positive Count: {pos_count(preprocessed_text)}")
    print(f"Negative Count: {neg_count(preprocessed_text)}")
    print(f"Sentiment Score: {sentiment_}")
    print(f"volume: {volume_}")
    print("")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Reject and Protect (#NoKXL): Strength in Community on the National Mall, Sat 4
Speed: 0.3803880959749222
Circuitousness: 1.4231891381770692
Positive Count: 0
Negative Count: 0
Sentiment Score: 0
volume: 767.0271198155233

Insomniacs' Venthole, hooray for Ducks edition
Speed: 0.5687292218208313
Circuitousness: 1.1754432505334038
Positive Count: 1
Negative Count: 1
Sentiment Score: 0.0
volume: 651994905.9572816

The Seven Wonders of the Modern World
Speed: 0.7422399401664734
Circuitousness: 1.1161497895688055
Positive Count: 0
Negative Count: 0
Sentiment Score: 0
volume: 55806082.936524026

Overnight News Digest: June 29
Speed: 0.40844133868813515
Circuitousness: 1.062813793911169
Positive Count: 4
Negative Count: 1
Sentiment Score: 0.07142857142857142
volume: 15519784.457942937

Please rec this diary!
Speed: 0
Circuitousness: nan
Positive Count: 6
Negative Count: 1
Sentiment Score: 0.22727272727272727
volume: 0

Daily Kos:

In [None]:
political_scores=pd.DataFrame()
political_scores['title']=political['title']
political_scores['speed']=speed__
political_scores['circuitousness']=circuitousness__
political_scores['sentiment_score']=sentiment
political_scores['voolume']=volume
# political_scores.to_csv("C:/Users/PC/OneDrive/Desktop/corpus/political_scores.csv")

In [None]:
print("-----------------------------------NEWS WITH SATIRE TAG----------------------------------------")

satire = df_news.loc[df_news["type"] == "satire"]


speed__=[]
sentiment=[]
circuitousness__=[]
volume=[]
for text, title in zip(satire['content'], satire['title']):
    avg_vectors=[]
    windows=break_into_windows(text.strip())
    for window in windows:
        tokens=tokenize(window)
        vectors=convert_to_vectors(tokens)
        avg_vectors.append(avg_text_window(vectors))
        preprocessed_text=preprocess_text(tokens)
    speed_=speed(avg_vectors)
    speed__.append(speed_)

    sentiment_=sentiment_count(preprocessed_text)
    sentiment.append(sentiment_)

    circuitousness_=circuitousness(avg_vectors)
    circuitousness__.append(circuitousness_)

    if len(avg_vectors)>2:
        volume_=calculate_volume(avg_vectors)
    else: volume_=0
    volume.append(volume_)


    print(title)
    print(f"Speed: {speed_}")
    print(f"Circuitousness: {circuitousness_}")
    print(f"Positive Count: {pos_count(preprocessed_text)}")
    print(f"Negative Count: {neg_count(preprocessed_text)}")
    print(f"Sentiment Score: {sentiment_}")
    print(f"volume: {volume_}")
    print("")

-----------------------------------NEWS WITH SATIRE TAG----------------------------------------
malcolm turnbull satire – The Shovel
Speed: 0
Circuitousness: nan
Positive Count: 0
Negative Count: 2
Sentiment Score: -0.18181818181818182
volume: 0

malcolm turnbull jokes – The Shovel
Speed: 0
Circuitousness: nan
Positive Count: 0
Negative Count: 2
Sentiment Score: -0.18181818181818182
volume: 0

Australian political satire – The Shovel
Speed: 0
Circuitousness: nan
Positive Count: 0
Negative Count: 0
Sentiment Score: 0
volume: 0

comedy website australia – The Shovel
Speed: 0
Circuitousness: nan
Positive Count: 1
Negative Count: 0
Sentiment Score: 0.07692307692307693
volume: 0

Serendipity: This Man Made Up An Entire Personality For His Regular Barista, And She’s Perfect For Him
Speed: 0.27271299064159393
Circuitousness: 1.6543643770231071
Positive Count: 6
Negative Count: 1
Sentiment Score: 0.16666666666666666
volume: 6.283185307179587

Australian satire – The Shovel
Speed: 0
Circuitousn

In [None]:
satire_scores=pd.DataFrame()
satire_scores['title']=satire['title']
satire_scores['speed']=speed__
satire_scores['circuitousness']=circuitousness__
satire_scores['sentiment_score']=sentiment
satire_scores['volume']=volume
# satire_scores.to_csv("C:/Users/PC/OneDrive/Desktop/corpus/satire_scores.csv")

In [None]:
print("-----------------------------------NEWS WITH CONSPIRACY THEORY TAG----------------------------------------")

conspiracy_theory = df_news.loc[df_news["type"] == "conspiracy"]


speed__=[]
sentiment=[]
circuitousness__=[]
volume=[]
for text, title in zip(conspiracy_theory['content'], conspiracy_theory['title']):
    avg_vectors=[]
    windows=break_into_windows(text.strip())
    for window in windows:
        tokens=tokenize(window)
        vectors=convert_to_vectors(tokens)
        avg_vectors.append(avg_text_window(vectors))
        preprocessed_text=preprocess_text(tokens)
    speed_=speed(avg_vectors)
    speed__.append(speed_)

    sentiment_=sentiment_count(preprocessed_text)
    sentiment.append(sentiment_)

    circuitousness_=circuitousness(avg_vectors)
    circuitousness__.append(circuitousness_)

    if len(avg_vectors)>2:
        volume_=calculate_volume(avg_vectors)
    else: volume_=0
    volume.append(volume_)


    print(title)
    print(f"Speed: {speed_}")
    print(f"Circuitousness: {circuitousness_}")
    print(f"Positive Count: {pos_count(preprocessed_text)}")
    print(f"Negative Count: {neg_count(preprocessed_text)}")
    print(f"Sentiment Score: {sentiment_}")
    print(f"volume: {volume_}")
    print("")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Extremely Leveraged
Speed: 0
Circuitousness: nan
Positive Count: 1
Negative Count: 3
Sentiment Score: -0.2222222222222222
volume: 0

Financial Institutions In The United States
Speed: 0
Circuitousness: nan
Positive Count: 1
Negative Count: 3
Sentiment Score: -0.2222222222222222
volume: 0

Leveraged
Speed: 0.3768829759210348
Circuitousness: 1.2280774011135092
Positive Count: 1
Negative Count: 1
Sentiment Score: 0.0
volume: 38975771.4826354

Simon Black
Speed: 0.31405453085899354
Circuitousness: 1.2390565749072184
Positive Count: 2
Negative Count: 1
Sentiment Score: 0.022727272727272728
volume: 15589.240825500669

Debt Not Payable
Speed: 0
Circuitousness: nan
Positive Count: 1
Negative Count: 3
Sentiment Score: -0.2222222222222222
volume: 0

Finances
Speed: 0.3326485926906268
Circuitousness: 1.1941020090072263
Positive Count: 1
Negative Count: 2
Sentiment Score: -0.0625
volume: 310661.6034577012

Debt Crisis In Puerto Rico


In [None]:
conspiracy_theory_scores=pd.DataFrame()
conspiracy_theory_scores['title']=conspiracy_theory['title']
conspiracy_theory_scores['speed']=speed__
conspiracy_theory_scores['circuitousness']=circuitousness__
conspiracy_theory_scores['sentiment_score']=sentiment
conspiracy_theory_scores['volume']=volume
# conspiracy_theory_scores.to_csv("C:/Users/PC/OneDrive/Desktop/corpus/conspiracy_theory_scores.csv")

In [None]:
print("-----------------------------------NEWS WITH RELIABLE TAG------------------------------------")

reliable = df_news.loc[df_news["type"] == "reliable"]


speed__=[]
sentiment=[]
circuitousness__=[]
volume=[]
for text, title in zip(reliable['content'], reliable['title']):
    avg_vectors=[]
    windows=break_into_windows(str(text))
    for window in windows:
        tokens=tokenize(window)
        vectors=convert_to_vectors(tokens)
        avg_vectors.append(avg_text_window(vectors))
        preprocessed_text=preprocess_text(tokens)
    speed_=speed(avg_vectors)
    speed__.append(speed_)

    sentiment_=sentiment_count(preprocessed_text)
    sentiment.append(sentiment_)

    circuitousness_=circuitousness(avg_vectors)
    circuitousness__.append(circuitousness_)

    if len(avg_vectors)>2:
        volume_=calculate_volume(avg_vectors)
    else: volume_=0
    volume.append(volume_)


    print(title)
    print(f"Speed: {speed_}")
    print(f"Circuitousness: {circuitousness_}")
    print(f"Positive Count: {pos_count(preprocessed_text)}")
    print(f"Negative Count: {neg_count(preprocessed_text)}")
    print(f"Sentiment Score: {sentiment_}")
    print(f"volume: {volume_}")
    print("")

-----------------------------------NEWS WITH RELIABLE TAG------------------------------------
'Destiny 2' News: Bungie Addresses Player Complaints Around Faction Rally's Redundant Rewards
Speed: 0.5150249627503481
Circuitousness: 1.1007248709985926
Positive Count: 2
Negative Count: 0
Sentiment Score: 0.16666666666666666
volume: 13969293533.708353

Professor Conducts Libation to Summon Spirits of Blacks Killed by Police in Hollywood Church
Speed: 0.43737322092056274
Circuitousness: 1.341246010774833
Positive Count: 2
Negative Count: 3
Sentiment Score: -0.02702702702702703
volume: 51.77971235004652

'Ghost Ship' With 8 Dead N. Koreans Washes Up in Japan During 'Unified Korea' Flag Announcement
Speed: 0.4601002037525177
Circuitousness: 1.5568538711476843
Positive Count: 0
Negative Count: 0
Sentiment Score: 0
volume: 6.283185307179586

Facebook Lifts Ban on Paid Ads for Pro-Life Film Produced by MLK's Niece After Appeal
Speed: 0.3228447238604228
Circuitousness: 1.3449504883381085
Positive 

In [None]:
reliable_scores=pd.DataFrame()
reliable_scores['title']=reliable['title']
reliable_scores['speed']=speed__
reliable_scores['circuitousness']=circuitousness__
reliable_scores['sentiment_score']=sentiment
reliable_scores['voolume']=volume
# reliable_scores.to_csv("C:/Users/PC/OneDrive/Desktop/corpus/reliable_scores.csv")

In [None]:
print("-----------------------------------NEWS WITH UNRELIABLE TAG----------------------------------")

unreliable = df_news.loc[df_news["type"] == "unreliable"]


speed__=[]
sentiment=[]
circuitousness__=[]
volume=[]
for text, title in zip(unreliable['content'], unreliable['title']):
    avg_vectors=[]
    windows=break_into_windows(text.strip())
    for window in windows:
        tokens=tokenize(window)
        vectors=convert_to_vectors(tokens)
        avg_vectors.append(avg_text_window(vectors))
        preprocessed_text=preprocess_text(tokens)
    speed_=speed(avg_vectors)
    speed__.append(speed_)

    sentiment_=sentiment_count(preprocessed_text)
    sentiment.append(sentiment_)

    circuitousness_=circuitousness(avg_vectors)
    circuitousness__.append(circuitousness_)

    if len(avg_vectors)>2:
        volume_=calculate_volume(avg_vectors)
    else: volume_=0
    volume.append(volume_)


    print(title)
    print(f"Speed: {speed_}")
    print(f"Circuitousness: {circuitousness_}")
    print(f"Positive Count: {pos_count(preprocessed_text)}")
    print(f"Negative Count: {neg_count(preprocessed_text)}")
    print(f"Sentiment Score: {sentiment_}")
    print(f"volume: {volume_}")
    print("")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
BreakPoint: Why Oprah Might Be President
Speed: 0.3622129261493683
Circuitousness: 1.4392962262407571
Positive Count: 10
Negative Count: 8
Sentiment Score: 0.01818181818181818
volume: 6.283185307179586

Roberto Rivera, Author at Break Point
Speed: 0
Circuitousness: nan
Positive Count: 0
Negative Count: 0
Sentiment Score: 0
volume: 0

Jim Tonkowich, Author at Break Point
Speed: 0
Circuitousness: nan
Positive Count: 0
Negative Count: 0
Sentiment Score: 0
volume: 0

J. Warner Wallace, Author at Break Point
Speed: 0
Circuitousness: nan
Positive Count: 0
Negative Count: 1
Sentiment Score: -0.08333333333333333
volume: 0

Glenn Stanton, Author at Break Point
Speed: 0
Circuitousness: nan
Positive Count: 3
Negative Count: 0
Sentiment Score: 0.2
volume: 0

Emily Colson, Author at Break Point
Speed: 0
Circuitousness: nan
Positive Count: 0
Negative Count: 0
Sentiment Score: 0
volume: 0

Heritage Chooses, Chick-fil-A Serves, Religion 

In [None]:
unreliable_scores=pd.DataFrame()
unreliable_scores['title']=unreliable['title']
unreliable_scores['speed']=speed__
unreliable_scores['circuitousness']=circuitousness__
unreliable_scores['sentiment_score']=sentiment
unreliable_scores['voolume']=volume
# unreliable_scores.to_csv("C:/Users/PC/OneDrive/Desktop/corpus/unreliable_scores.csv")

In [None]:
df_news["type"].value_counts()

fake          45768
political     27368
bias           9123
conspiracy     6875
junksci        2204
clickbait      1979
unknown        1057
unreliable     1031
satire          344
hate            298
reliable        289
rumor           124
Name: type, dtype: int64

In [None]:
#some text ideas for analysis-
#speeches
#youtube vidoes (more views, more popular)
#compare different news articles or transcripts of news videos and compare which of them are more viral and if the speed, volume, circuitousness has anything to do woth it
#courses, and if their popularity have anything to do with these facros(speed, volume, circuitousness)
