In [1]:
from urllib.parse import urlparse

from googletrans import Translator
from summarizer import Summarizer
from waybackpy import WaybackMachineSaveAPI

# from core.decorators import clean_text


def get_domain(url: str) -> str:
    """
    get_domain returns the domain of the url.

    Parameters
    ----------
    url : str
        The url to be checked.

    Returns
    -------
    str
        The domain of the url.
    """

    return urlparse(url).netloc


# @clean_text
def to_english(text: str) -> str:
    """
    to_english translates text to english if it is not already in english.

    Parameters
    ----------
    text : str
        The text to be translated.

    Returns
    -------
    str
        The text translated to english.
    """

    translator = Translator()
    if translator.detect(text).lang == "en":
        return text
    obj = translator.translate(text)
    return obj.text


# @clean_text
def summarize(text: str) -> str:
    """
    summarize summarizes text.

    Parameters
    ----------
    text : str
        The text to be summarized.

    Returns
    -------
    str
        The 3 sentence summary of the text.
    """

    model = Summarizer()
    summary = model(text, ratio=0.25, use_first=False)
    return summary


def archiveURL(url: str) -> str:
    """
    archiveURL returns the archive url of given url

    Parameters
    ----------
    url : str
        The url to be archived.

    Returns
    -------
    str
        The archive url of the given url.
    """

    user_agent = "Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405"
    save_api = WaybackMachineSaveAPI(url=url, user_agent=user_agent, max_tries=12)
    archive_url = save_api.save()
    return archive_url


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
get_domain("https://mui.com/material-ui/guides/next-js-app-router/")

'mui.com'

In [3]:
to_english("Hola, como estas?")

'Hello how are you?'

In [4]:
summary = summarize("By November 1932, the Nazi Party held the most seats in the German Reichstag but did not have a majority. As a result, no party was able to form a majority parliamentary coalition in support of a candidate for chancellor. The former chancellor Franz von Papen and other conservative leaders persuaded President Paul von Hindenburg to appoint Hitler as chancellor on 30 January 1933. Shortly after, the Reichstag passed the Enabling Act of 1933 which began the process of transforming the Weimar Republic into Nazi Germany, a one-party dictatorship based on the totalitarian and autocratic ideology of Nazism. On 2 August 1934, Hindenburg died and Hitler replaced him as the head of state and government. Hitler aimed to eliminate Jews from Germany and establish a New Order to counter what he saw as the injustice of the post-World War I international order dominated by Britain and France. His first six years in power resulted in rapid economic recovery from the Great Depression, the abrogation of restrictions imposed on Germany after World War I, and the annexation of territories inhabited by millions of ethnic Germans, which initially gave him significant popular support.")
print(summary)
print(type(summary))
print(len(summary))

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  super()._check_params_vs_input(X, default_n_init=10)


The former chancellor Franz von Papen and other conservative leaders persuaded President Paul von Hindenburg to appoint Hitler as chancellor on 30 January 1933.
<class 'str'>
160


In [5]:
archiveURL("https://blog.siddhesh.tech/")

'https://web.archive.org/web/20230717094455/https://blog.siddhesh.tech/'

In [7]:
import pickle
import pandas as pd


def isPhishing(url: str) -> bool:
    """
    isPhishing checks if the url is a phishing url.

    Parameters
    ----------
    url : str
        The url to be checked.

    Returns
    -------
    bool
        True if the url is a phishing url, False otherwise.
    """

    model = pickle.load(open("./models/model.pkl", "rb"))
    prediction = model.predict([url])
    return prediction[0] == "good"


def isCredible(url: str) -> bool:
    """
    isCredible checks if the url is a credible url.

    Parameters
    ----------
    url : str
        The url to be checked.

    Returns
    -------
    bool
        True if the url is a credible url, False otherwise.
    """

    domain = get_domain(url)
    df: pd.DataFrame = pd.read_csv("./assets/sources.csv", engine="pyarrow")[["domain"]]
    row = df.loc[df["domain"] == domain]
    if row.empty:
        return isPhishing(url)
    return "fake" not in [
        str(row["type1"]).lower(),
        str(row["type2"]).lower(),
        str(row["type3"]).lower(),
    ]


In [9]:
isPhishing("https://wikipedia.org")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


True