In [3]:
import joblib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import contractions
import nltk
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from urllib.error import HTTPError
from urllib.error import URLError
from http.client import IncompleteRead

In [6]:
# Load the trained model and TF-IDF Vectorizer from files in the working directory.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary code;
# only load model.pkl / tf.pkl obtained from a trusted source.
# Presumably tf.pkl is the vectorizer that was fitted together with model.pkl —
# confirm the two files belong to the same training run.
model = joblib.load('model.pkl')
tfidf_vectorizer = joblib.load('tf.pkl')

In [10]:
def get_title_and_body(URL):
    """Download a web page and return its title text followed by its paragraph text.

    Args:
        URL: Address of the page to fetch; passed straight to ``urlopen``.

    Returns:
        A single string: the text of the first ``<title>`` element immediately
        followed by the text of every ``<p>`` element, concatenated in document
        order with no separator inserted between the pieces.

    Raises:
        AttributeError: If the page has no ``<title>`` element (``find`` returns
            ``None``).
        UnicodeDecodeError: If the response body is not valid UTF-8.
        Any exception raised by ``urlopen`` / ``read`` is propagated unchanged.
    """
    # The context manager closes the HTTP response even when read()/decode() fails,
    # so repeated calls do not leak open connections.
    with urlopen(URL) as response:
        html = response.read().decode("utf-8")

    page = BeautifulSoup(html, 'html.parser')
    title = page.find("title").get_text()
    # Paragraphs are joined without a separator on purpose: the downstream text
    # must stay identical to what this function has always produced.
    body = "".join(paragraph.get_text() for paragraph in page.find_all("p"))
    return title + body

def get_text(URL):
    """Fetch the article text for ``URL``, returning ``None`` when it cannot be read.

    Args:
        URL: Address of the article to scrape.

    Returns:
        The non-empty string produced by ``get_title_and_body``, or ``None`` when
        the page could not be downloaded, could not be decoded as UTF-8, has no
        ``<title>`` element, or yielded an empty string.

    Raises:
        AttributeError: Any AttributeError other than the "missing <title>" one
            is re-raised, since it signals a programming error rather than an
            unreadable page.
    """
    try:
        text = get_title_and_body(URL)
    except (HTTPError, URLError, IncompleteRead, UnicodeDecodeError):
        # Network failure, truncated response, or a body that is not UTF-8:
        # all mean "no usable text" for the caller.
        return None
    except AttributeError as err:
        # BeautifulSoup's find("title") returned None, i.e. the page has no title.
        if str(err) == "'NoneType' object has no attribute 'get_text'":
            return None
        # Previously this branch downloaded the page a second time, only for the
        # same error to escape from inside the handler; re-raise directly instead.
        raise
    # Treat an empty scrape the same as a failed one.
    return text or None
    
def fix_contractions(text):
    """Expand contractions in ``text`` one whitespace-separated token at a time.

    Each token is passed through ``contractions.fix`` and the results are joined
    back together with single spaces, so any run of whitespace in the input
    collapses to one space in the output.
    """
    return " ".join(contractions.fix(token) for token in text.split())
#spacy.cli.download('en_core_web_sm')
# Lazily-created spaCy pipeline shared by every lemmatize() call. Loading the
# model is expensive, so it is done once on first use instead of on every call.
_LEMMATIZER_NLP = None


def lemmatize(words):
    """Return ``words`` with every token replaced by its spaCy lemma.

    Args:
        words: A single string of text to lemmatize.

    Returns:
        The lemmas of all tokens spaCy finds in ``words``, joined by single spaces.
    """
    global _LEMMATIZER_NLP
    if _LEMMATIZER_NLP is None:
        # The parser and NER components are disabled, exactly as before.
        _LEMMATIZER_NLP = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    doc = _LEMMATIZER_NLP(words)
    return ' '.join(token.lemma_ for token in doc)
#nltk.download('all', halt_on_error=False)
def remove_stopwords(words):
    """Drop English stopwords from ``words`` while keeping a few meaningful ones.

    The NLTK English stopword list is used, except that 'no', 'not', 'nor',
    'against' and 'now' are never removed.

    Args:
        words: A string of whitespace-separated tokens. Matching is exact and
            case-sensitive, so callers should lowercase the text first.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Words that would normally be filtered out but must survive.
    kept_words = {'no', 'not', 'nor', 'against', 'now'}
    # A set gives O(1) membership tests per token, and set difference does not
    # raise if a kept word happens to be absent from the installed corpus
    # (list.remove would raise ValueError in that case).
    stopword_set = set(nltk.corpus.stopwords.words('english')) - kept_words
    return " ".join(token for token in words.split() if token not in stopword_set)
def clean(text):
    """Normalize raw article text into the form fed to the TF-IDF vectorizer.

    Steps, in order:
      1. Replace non-breaking spaces with regular spaces.
      2. Remove every character that is neither a word character nor whitespace,
         except an apostrophe sitting between two lowercase ASCII letters.
      3. Lowercase the text and collapse all whitespace runs to single spaces.
      4. Expand contractions, remove stopwords, then lemmatize.

    NOTE(review): step 2 runs before lowercasing, so an apostrophe that follows
    an uppercase letter (e.g. "I'm") is stripped and the contraction is never
    expanded. Left as-is here because the saved vectorizer was presumably fitted
    on text cleaned this way — confirm before changing the order.
    """
    normalized = text.replace('\xa0', " ")
    normalized = re.sub(r"(?!(?<=[a-z])'[a-z])[^\w\s]", '', normalized).lower()
    # split() with no argument already discards leading/trailing whitespace.
    normalized = " ".join(normalized.split())
    for step in (fix_contractions, remove_stopwords, lemmatize):
        normalized = step(normalized)
    return normalized
    
def show_features(vec, vec_test_features, top_n=15):
    """Return the names of the highest-weighted features in a transformed matrix.

    Args:
        vec: Fitted vectorizer exposing ``get_feature_names_out()``.
        vec_test_features: Matrix returned by the vectorizer's ``transform``;
            must expose ``toarray()``. Weights are summed over all rows.
        top_n: How many feature names to return (default 15, the value this
            function previously hard-coded). ``None`` returns every feature.

    Returns:
        A list of feature names ordered from highest to lowest summed weight.
        Features with zero weight are included when fewer than ``top_n``
        features are non-zero.
    """
    feature_names = vec.get_feature_names_out()
    # Column-wise sum: one total weight per feature across all rows.
    feature_weights = vec_test_features.toarray().sum(axis=0)
    ranked = Counter(dict(zip(feature_names, feature_weights)))
    return [name for name, _weight in ranked.most_common(top_n)]
        
def predict_leaning(url):
    """Scrape an article, clean it, and predict its political leaning.

    Args:
        url: Address of the article to classify.

    Returns:
        ``None`` when no text could be scraped from ``url``. Otherwise a 4-tuple:
        ``("Article Content:", raw_text)``,
        ``("Cleaned Article Content:", cleaned_text, 'TFIDF Features:')``,
        the list of top feature names from ``show_features``, and
        ``("Predicted Political Leaning: ", label)`` where ``label`` is the
        model's first prediction.
    """
    # Step 1: fetch the raw article text.
    raw_text = get_text(url)
    scrape = ("Article Content:", raw_text)
    if raw_text is None:
        return None

    # Step 2: preprocess the text and turn it into TF-IDF features.
    cleaned_text = clean(raw_text)
    cleaned = ("Cleaned Article Content:", cleaned_text, 'TFIDF Features:')
    tfidf_features = tfidf_vectorizer.transform([cleaned_text])
    features = show_features(tfidf_vectorizer, tfidf_features)

    # Step 3: run the trained model on the single-document feature matrix.
    predicted_label = model.predict(tfidf_features)[0]
    political_leaning = ("Predicted Political Leaning: ", predicted_label)

    # Step 4: hand back every intermediate result alongside the prediction.
    return scrape, cleaned, features, political_leaning
        
def output(user_input_url):
    """Return the prediction for ``user_input_url`` or a two-part error message.

    The result of ``predict_leaning`` is returned when it is truthy; otherwise a
    tuple of two strings explaining that the article text could not be read.
    """
    failure_message = (
        "Error: Unable to access the text from the given URL. ",
        "Possible reasons include subscription costs and pop-up windows.",
    )
    return predict_leaning(user_input_url) or failure_message


In [12]:
# Example usage: classify a single article by its URL.
# output(url) is the last expression in the cell, so the notebook displays the
# returned tuple (the prediction details, or the two-part error message).
url = "https://www.alternet.org/msn/why-voters-are-likely-stuck-with-george-santos-for-2-years-law-professor/"
output(url)

(('Article Content:',
  "Why voters are 'likely' stuck with 'absurd' George Santos for 2 years: law professor - Alternet.orgAfter Rep. Kevin McCarthy of California finally won his uphill battle to become House speaker, a new GOP-controlled majority was seated in the U.S. House of Representatives. And one of the new House Republicans who was sworn in was Rep. George Santos, the Queens/Long Island congressman who told one life after another during his 2022 campaign — from fabricating his employment history to falsely claiming that his grandparents were Holocaust survivors. In an article published by The Bulwark on January 13, University of Baltimore law professor and former assistant U.S. attorney Kimberly Wehle laments that voters are probably going to be stuck with Santos for two years — fabrications and all.“The George Santos scandal is so absurd that the late-night hosts are struggling to mock it, the ‘Saturday Night Live’ team is probably wishing it was back on the air already, and 