Run each command listed in requirements.txt manually; the standard requirements format generated by pipreqs was not working.

Also uncomment and run the nltk import and download lines in the first code cell exactly once; they are needed to use newspaper's NLP features.

In [73]:
import newspaper
from nela_features.nela_features import NELAFeatureExtractor
import numpy as np
import pandas as pd
import jellyfish
import tldextract
import pickle
# Run the below once
#import nltk
#nltk.download('punkt_tab')
#nltk.download('punkt')

In [74]:
def get_article_text(url):
    """Return the ``.text`` of the article that ``newspaper.article`` builds for ``url``."""
    article = newspaper.article(url)
    return article.text

def smog_to_text(smog):
    """Translate a numeric SMOG grade into a coarse schooling-level label.

    The bands are checked from highest to lowest; the first lower bound that
    ``smog`` meets or exceeds decides the label. Anything below 5 (or any
    value that fails every comparison) is labelled "Elementary School".
    """
    grade_bands = (
        (17, "Graduate"),
        (13, "Undergraduate"),
        (9, "High School"),
        (5, "Middle School"),
    )
    for lower_bound, label in grade_bands:
        if smog >= lower_bound:
            return label
    return "Elementary School"
        
def get_nela_smog_text(text):
    """Return the schooling-level label for ``text`` from NELA complexity features."""
    extractor = NELAFeatureExtractor()
    complexity_values, _complexity_names = extractor.extract_complexity(text)
    # Position 4 is treated as the SMOG score (per this function's name);
    # TODO confirm against the names returned alongside the values.
    smog_score = complexity_values[4]
    return smog_to_text(smog_score)
    
def get_nela_smog(url):
    """Fetch the article at ``url`` and return its schooling-level label."""
    return get_nela_smog_text(get_article_text(url))

In [75]:
# Build a {word: float32 vector} lookup table from the GloVe text file.
glove_embeddings = {}
print("Loading glove embeddings")
with open('glove.6B.100d.txt', 'r', encoding='utf8') as glove_file:
    for raw_line in glove_file:
        # First space-separated field is the word; the remaining fields are its coefficients.
        token, *coefficients = raw_line.split(' ')
        glove_embeddings[token] = np.asarray(coefficients, "float32")
# Word-pair axes used as embedding features. Each entry has a 'name' and the
# words at its two poles ('dir1' and 'dir2'). find_embedding_features turns an
# entry into a direction vector: dir1 vectors are added, dir2 vectors are
# subtracted, and the result is divided by the total number of listed words.
# Every word listed here is looked up directly in glove_embeddings.
test_embeddings = [
    # Socioeconomic status
    {'name': 'rich/poor', 'dir1': ["rich", "wealthy", "affluent"], "dir2": ["poor", "impoverished", "destitute"]},
    
    # Age bias
    {'name': 'young/old', 'dir1': ["young", "youthful", "vibrant"], "dir2": ["old", "elderly", "aged"]},
    
    # Gender stereotypes (roles)
    {'name': 'male/female stereotypes', 'dir1': ["leader", "strong", "assertive"], "dir2": ["nurturing", "caring", "supportive"]},
    
    # Rural vs. Urban bias
    # NOTE(review): the name reads 'rural/urban' but dir1 holds the urban words, so the
    # name order is reversed compared with the other entries (which read 'dir1/dir2').
    {'name': 'rural/urban', 'dir1': ["urban", "city"], "dir2": ["rural", "countryside"]},
    
    # Employment bias (white-collar vs. blue-collar)
    {'name': 'white-collar/blue-collar', 'dir1': ["professional", "educated", "executive"], "dir2": ["manual", "laborer", "working-class"]},
    
    # Intelligence perception
    {'name': 'smart/dumb', 'dir1': ["smart", "intelligent"], 'dir2': ["dumb", "stupid"]},
]        
def find_embedding_features(inp, glove_embeddings, directions=None):
    """Score a text against word-pair axes using averaged word vectors.

    The text is split on whitespace; the vectors of all lower-cased tokens
    found in ``glove_embeddings`` are summed and divided by the total token
    count. For each axis the direction is (sum of ``dir1`` vectors minus sum
    of ``dir2`` vectors) divided by the number of words on the axis. The
    feature is the cosine similarity between the text vector and that
    direction.

    Args:
        inp: Text to score.
        glove_embeddings: Mapping from word to a 1-D numeric vector. All
            vectors are expected to have the same length.
        directions: Optional list of axis dicts with keys ``'name'``,
            ``'dir1'`` and ``'dir2'``. When omitted, the module-level
            ``test_embeddings`` list is used.

    Returns:
        A pair ``(embedding_features, embedding_names)`` of parallel lists,
        one entry per axis. A feature is ``0.0`` when the cosine similarity
        is undefined (empty text, no token found in the vocabulary, or a
        zero-length direction) rather than NaN.

    Raises:
        ValueError: If ``glove_embeddings`` is empty.
        KeyError: If a word listed on an axis is missing from ``glove_embeddings``.
    """
    if directions is None:
        directions = test_embeddings

    # Take the vector size from any stored vector instead of relying on a
    # specific word being present in the vocabulary.
    try:
        dim = len(next(iter(glove_embeddings.values())))
    except StopIteration:
        raise ValueError("glove_embeddings is empty") from None

    # Average embedding of the text (unknown tokens contribute nothing).
    words = inp.split()
    embedding = np.zeros(dim)
    for word in words:
        vector = glove_embeddings.get(word.lower())
        if vector is not None:
            embedding += vector
    if words:
        # Guard: dividing by zero tokens would fill the vector with NaN.
        embedding /= len(words)
    embedding_norm = np.linalg.norm(embedding)

    # Cosine similarity of the text vector to each dir1-minus-dir2 direction.
    embedding_features = []
    embedding_names = []
    for axis in directions:
        net_dir = np.zeros(dim)
        for word in axis['dir1']:
            net_dir += glove_embeddings[word]
        for word in axis['dir2']:
            net_dir -= glove_embeddings[word]
        axis_word_count = len(axis['dir1']) + len(axis['dir2'])
        if axis_word_count:
            net_dir /= axis_word_count

        denominator = embedding_norm * np.linalg.norm(net_dir)
        if denominator == 0:
            # Undefined similarity: report "no signal" instead of NaN so that
            # downstream consumers receive a finite number.
            cos_sim = 0.0
        else:
            cos_sim = np.dot(embedding, net_dir) / denominator

        embedding_features.append(cos_sim)
        embedding_names.append(axis['name'])

    return embedding_features, embedding_names
    
def embedding_vector(text,glove_embeddings):
    """Return only the feature values from ``find_embedding_features``, dropping the names."""
    return find_embedding_features(text, glove_embeddings)[0]

Loading glove embeddings


In [96]:
# using allsides ratings dataset found at https://www.kaggle.com/datasets/supratimhaldar/allsides-ratings-of-bias-in-electronic-media
def get_allsides(url):
    """Map the news source behind ``url`` to a numeric lean score from the AllSides table.

    The registered domain of ``url`` is matched against column 0 of
    ``dataset/allsides.csv`` in three stages:

    1. rows whose column-0 text contains the lower-cased domain;
    2. if none, rows containing the first three characters of the domain
       (skipped when those characters are ``'the'``);
    3. if still none, the single row whose column-0 text has the highest
       Jaro similarity to the domain.

    For stages 1 and 2 the row with the largest column-2 value wins. The
    rating in column 1 of the chosen row is translated to a score
    (left = 5, left-center = 25, center = 50, right-center = 75, right = 95).

    Raises:
        KeyError: If the chosen row's rating is not one of the five known labels.
    """
    table = pd.read_csv('dataset/allsides.csv').values
    site = tldextract.extract(url).domain.lower()
    lean_scores = {'center': 50, 'left-center': 25, 'left': 5, 'right-center': 75, 'right': 95}

    # Stage 1: substring match on the full domain.
    matches = [row for row in table if site in row[0].lower()]
    if not matches:
        # Stage 2: substring match on the three-character prefix.
        prefix = site[:3]
        matches = [row for row in table if prefix in row[0].lower() and prefix != 'the']
    if matches:
        best = sorted(matches, key=lambda row: row[2], reverse=True)[0]
        return lean_scores[best[1]]

    # Stage 3: fall back to the closest source name by Jaro similarity.
    similarities = [jellyfish.jaro_similarity(site, name.lower()) for name in table[:, 0]]
    closest = similarities.index(max(similarities))
    return lean_scores[table[closest][1]]
    
def get_knn_class_text(text):
    """Return True when the pickled KNN model predicts class 1 for ``text``.

    The model input is five values picked from the NELA feature vector
    extended with the embedding features. What class 1 stands for is not
    visible in this notebook.
    """
    # NOTE: pickle.load can execute arbitrary code; only load a trusted model file.
    with open('knnfakenews.pkl', 'rb') as model_file:
        classifier = pickle.load(model_file)

    extractor = NELAFeatureExtractor()
    nela_values, _nela_names = extractor.extract_all(text)
    combined = nela_values + embedding_vector(text, glove_embeddings)

    # Positions of the features fed to the model, in the order used here.
    selected_positions = (89, 92, 4, 59, 24)
    sample = [[combined[position] for position in selected_positions]]
    return bool(classifier.predict(sample) == [1])
    
def get_knn_class(url):
    """Fetch the article at ``url`` and return the KNN model's verdict for its text."""
    return get_knn_class_text(get_article_text(url))

In [97]:
# Print the AllSides lean score for a nytimes.com article.
# NOTE(review): the recorded output of this cell contains extra lines (a table row,
# 'hello', '8') that the current get_allsides never prints; presumably left over
# from an earlier version with debug prints. Re-run to refresh.
print(get_allsides("https://www.nytimes.com/2024/11/26/world/middleeast/israel-oct-7-inquiry.html"))

['New York Times (News)' 'left-center' 67516 28834 38682
 0.7454113024145598 'Somewhat Disagrees'
 'https://www.allsides.com/news-source/new-york-times']
left-center
New York Times (News)
8
hello
25


In [98]:
# nytimes.com article: print the KNN verdict, then the AllSides lean score.
url = "https://www.nytimes.com/2024/11/26/world/middleeast/israel-oct-7-inquiry.html"
print(get_knn_class(url))
print(get_allsides(url))

True
['New York Times (News)' 'left-center' 67516 28834 38682
 0.7454113024145598 'Somewhat Disagrees'
 'https://www.allsides.com/news-source/new-york-times']
left-center
New York Times (News)
8
hello
25


In [99]:
# theonion.com article: print the KNN verdict, the schooling-level label and the AllSides lean score.
url = "https://theonion.com/announcement-of-fourth-child-contains-conspicuous-lack-of-exclamation-points/"
print(get_knn_class(url))
print(get_nela_smog(url))
print(get_allsides(url))

False
Middle School
['The Nation' 'left' 2091 1599 492 3.25 'Absolutely Agrees'
 'https://www.allsides.com/news-source/nation-media-bias']
left
The Nation
8
hello
5


In [100]:
# cbsnews.com article: print the KNN verdict, then the AllSides lean score.
url = "https://www.cbsnews.com/news/tom-homan-greg-abbott-texas-border-visit/"
print(get_knn_class(url))
print(get_allsides(url))

True
['CBS News (Online)' 'left-center' 30930 18873 12057 1.5653147549141575
 'Agrees' 'https://www.allsides.com/news-source/cbs-news-media-bias']


In [101]:
# skepticalinquirer.org article: print the KNN verdict, then the AllSides lean score.
url = "https://skepticalinquirer.org/exclusive/are-saunas-good-for-you-yes-but/"
print(get_knn_class(url))
print(get_allsides(url))

False
['Splinter' 'left' 381 257 124 2.0725806451612905 'Strongly Agrees'
 'https://www.allsides.com/news-source/splinter-media-bias']
left
Splinter
8
hello
5


In [102]:
# jacobin.com article: print the KNN verdict, then the AllSides lean score.
url = 'https://jacobin.com/2024/11/trump-biden-israel-middle-east'
print(get_knn_class(url))
print(get_allsides(url))

True
['Jacobin' 'left' 1210 974 236 4.127118644067797 'Absolutely Agrees'
 'https://www.allsides.com/news-source/jacobin-media-bias']


In [103]:
# cnn.com article: print the KNN verdict, then the AllSides lean score.
url = 'https://www.cnn.com/2023/10/24/politics/supreme-court-florida-anti-drag-law/index.html'
print(get_knn_class(url))
print(get_allsides(url))

True
['CNN (Online News)' 'left' 98448 50713 47735 1.0623860898711637
 'Somewhat Agrees' 'https://www.allsides.com/news-source/cnn-media-bias']


In [104]:
# forbes.com article: print the KNN verdict, then the AllSides lean score.
url = 'https://www.forbes.com/sites/alexkonrad/2024/11/27/elon-musk-doge-attracts-young-coders-and-tech-ceos/'
print(get_knn_class(url))
print(get_allsides(url))

True
['Forbes' 'center' 21999 13638 8361 1.6311445999282383 'Agrees'
 'https://www.allsides.com/news-source/forbes']


In [105]:
# climatesciencenews.com article: print the KNN verdict, then the AllSides lean score.
url = 'https://climatesciencenews.com/2024-11-26-appalachia-hurricane-helene-lithium-green-new-deal.html'
print(get_knn_class(url))
print(get_allsides(url))

False
['Deseret News' 'right-center' 3225 1316 1909 0.6893661602933473
 'Somewhat Disagrees' 'https://www.allsides.com/news-source/deseret-news']
right-center
Deseret News
8
hello
75
