### Setup

In [None]:
# Install the Tesseract OCR system package via apt; `-y` answers the confirmation prompt automatically.
# NOTE(review): presumably this is the engine behind the `pytesseract` call in `get_data` — confirm.
!sudo apt install tesseract-ocr -y

In [None]:
!pip install textstat spacytextblob transformers

### Importing Dependencies

In [8]:
# Standard library
import os
import re
import socket
import urllib
import urllib.request
from http.client import HTTPConnection, HTTPSConnection
from io import BytesIO
from pathlib import Path
from unicodedata import normalize
from urllib.parse import urlparse

# Third-party
import cairosvg
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytesseract
import requests
import spacy
from bs4 import BeautifulSoup
# Importing SpacyTextBlob makes the 'spacytextblob' pipeline factory available to nlp.add_pipe.
from spacytextblob.spacytextblob import SpacyTextBlob
from tqdm import tqdm
from tqdm._tqdm_notebook import tqdm_notebook
from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import AutoModelForSeq2SeqLM

tqdm_notebook.pandas()

In [None]:
# Unpack the annotation archive into the current working directory.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive —
# no mount cell is visible in this notebook; confirm.
!unzip /content/drive/MyDrive/Deceptive-Research/annotation.zip

In [None]:
# Shared spaCy pipeline: the small English model with its parser and NER
# components disabled, followed by the TextBlob component whose results are
# read through `doc._.blob` elsewhere in this notebook.
disabled_components = ['parser', 'ner']
nlp = spacy.load('en_core_web_sm', disable=disabled_components)

nlp.add_pipe('spacytextblob')

### Cleaning the Text Data


In [7]:
def clean_text(text):
    """Normalize a string and strip punctuation and redundant whitespace.

    Steps, in order:
      1. Unicode NFKD normalization.
      2. Removal of every character that is neither a word character
         (letter, digit or underscore) nor whitespace.
      3. Collapse of each run of whitespace into a single space.
      4. Trimming of leading and trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = normalize("NFKD", text)  # Normalization

    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation

    # Raw string literal: a backslash-s in a plain literal is an invalid
    # escape sequence and triggers a warning on current Python versions.
    text = re.sub(r"\s+", " ", text)

    return text.strip()

### Cleaning the image

In [9]:
def clean_img(img):
    """Resize an image to 300x300 and divide its values by 255.

    Args:
        img: Image array accepted by ``cv2.resize``.

    Returns:
        The resized image with every value divided by 255.0.
    """
    target_size = (300, 300)
    resized = cv2.resize(img, target_size)

    # Normalization step: scale every value by 1/255.
    return resized / 255.0

### Features Extraction

Summarize the webpage using T5

In [None]:
# Tokenizer and model used by `summarize_webpage`.
# `AutoModelWithLMHead` is deprecated in transformers; `AutoModelForSeq2SeqLM`
# is the documented auto class for encoder-decoder models such as T5.
tokenizer = AutoTokenizer.from_pretrained('T5-base')
model = AutoModelForSeq2SeqLM.from_pretrained('T5-base', return_dict=True)

In [None]:
def summarize_webpage(sequence):
    """Summarize a piece of text with the module-level T5 `tokenizer` and `model`.

    Args:
        sequence: The text to summarize (a ``str``). It is truncated to at
            most 512 tokens, task prefix included, before generation.

    Returns:
        The generated summary as a string, with special tokens removed.
        Generation is constrained to between 80 and 100 tokens.
    """
    # "summarize: " is the task prefix T5 expects; the original misspelled it,
    # so the model did not receive its summarization prompt.
    # return_tensors='pt': `model.generate` on a PyTorch model needs torch
    # tensors, not NumPy arrays.
    inputs = tokenizer.encode(
        "summarize: " + sequence,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )
    output = model.generate(inputs, min_length=80, max_length=100)

    # Drop markers such as <pad> and </s> so they do not leak into the
    # downstream similarity and sentiment features.
    summary = tokenizer.decode(output[0], skip_special_tokens=True)

    return summary

### OCR - Image to Text

In [12]:
def get_data(image):
    """Run Tesseract OCR on an image and return its text, one paragraph per line.

    Blank-line-separated blocks of the OCR output are treated as paragraphs.
    Inside a paragraph, line breaks are replaced by a single space; paragraphs
    are then joined with a newline.

    Args:
        image: The image to read, in any form accepted by
            ``pytesseract.image_to_string``.

    Returns:
        The recognised text, or the string "No Information" when the OCR
        output is empty or contains only whitespace.
    """
    raw = pytesseract.image_to_string(image, lang="eng")

    paragraphs = []
    for block in re.split(r"\n{2,}", raw):
        # Join wrapped lines with a space so the last word of one line is not
        # glued to the first word of the next.
        flattened = re.sub(r"\s*\n\s*", " ", block).strip()
        if flattened:
            paragraphs.append(flattened)

    # Whitespace-only output yields no paragraphs and falls back to the
    # placeholder text.
    return "\n".join(paragraphs) or "No Information"

### Cosine Similarity

In [11]:
def cos_similarity(text1, text2):
    """Return the spaCy similarity score between two texts.

    Both texts are run through the module-level ``nlp`` pipeline and compared
    with ``Doc.similarity``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The value returned by ``Doc.similarity`` for the two parsed texts.
    """
    first_doc, second_doc = nlp(text1), nlp(text2)
    return first_doc.similarity(second_doc)

### Text Sentiment

In [10]:
def sentiment(text):
    """Return the TextBlob polarity of a text.

    Args:
        text: The text to score.

    Returns:
        ``doc._.blob.polarity`` for the text as parsed by the module-level
        ``nlp`` pipeline.
    """
    parsed = nlp(text)
    blob = parsed._.blob
    return blob.polarity

### Is Secure

In [None]:
def check_https_url(url):
    """Report whether a URL answers a HEAD request over HTTPS.

    Args:
        url: Host, optionally followed by a path. A leading ``http://`` or
            ``https://`` is tolerated and ignored.

    Returns:
        1 if an HTTPS connection could be established and a response was
        received within the 2-second timeout, otherwise 0. Every failure
        (DNS, TLS, timeout, malformed URL) yields 0.
    """
    connection = None
    try:
        # Drop any scheme the caller already supplied; otherwise the URL
        # would become "https://https://..." and never connect.
        bare = re.sub(r"^https?://", "", url.strip(), flags=re.IGNORECASE)
        parsed = urlparse(f"https://{bare}")

        connection = HTTPSConnection(parsed.netloc, timeout=2)
        connection.request("HEAD", parsed.path or "/")
        # Any response at all counts as "secure"; the status is not inspected.
        connection.getresponse()
        return 1
    except Exception:
        # Best-effort probe. Unlike a bare `except`, this still lets
        # KeyboardInterrupt stop a long-running loop.
        return 0
    finally:
        if connection is not None:
            connection.close()


### Host Name Feature

In [None]:
def get_host(url):
    """Report whether the host part of a URL resolves to an IP address.

    A single leading ``www.``, ``http://`` or ``https://`` is removed, then
    only the host name (without user info, port, path or query) is passed to
    ``socket.gethostbyname``.

    Args:
        url: A URL or bare host name.

    Returns:
        1 if the host name resolves, 0 if it does not, is empty, or the URL
        cannot be parsed.
    """
    try:
        stripped = re.sub(
            r"^(www\.|https://|http://)", "", url.strip(), flags=re.IGNORECASE
        )
        # The "//" prefix makes urlparse treat the remainder as a network
        # location, so port and path are separated from the host name.
        host = urlparse(f"//{stripped}").hostname
        if not host:
            return 0
        socket.gethostbyname(host)
        return 1
    except Exception:
        # Best-effort check; KeyboardInterrupt is not swallowed.
        return 0


### Active/Inactive

In [None]:
def is_active(url):
    """Report whether a URL answers a HEAD request with status 200.

    Args:
        url: The URL to probe. ``http://`` is prepended when the URL does not
            already start with ``http://`` or ``https://``.

    Returns:
        1 if the HEAD request returns status code 200 within the 3-second
        timeout, otherwise 0 (including on any request error).
    """
    try:
        # The original compared url[:8] with the 7-character "http://", which
        # can never match, so "http://" URLs got a second scheme prepended.
        if not url.startswith(("http://", "https://")):
            url = "http://" + url

        r = requests.head(url, timeout=3)
        return 1 if r.status_code == 200 else 0
    except Exception:
        # Best-effort probe; KeyboardInterrupt is not swallowed.
        return 0


### Count the Number of Redirects a URL Has

In [None]:
def check_redirect(url):
    """Count the redirects followed when fetching a URL.

    Args:
        url: The URL to fetch. ``http://`` is prepended when the URL does not
            already start with ``http://`` or ``https://``.

    Returns:
        The length of ``response.history`` for a GET request with a 3-second
        timeout, or 0 on any request error.
    """
    try:
        # The original compared url[:8] with the 7-character "http://", which
        # can never match, so "http://" URLs got a second scheme prepended.
        if not url.startswith(("http://", "https://")):
            url = "http://" + url

        r = requests.get(url, timeout=3)
        return len(r.history)
    except Exception:
        # Best-effort probe; KeyboardInterrupt is not swallowed.
        return 0

### Extract the Annotated Data and Derive the Features

In [None]:
def _first_image_link(image):
    """Return the first non-empty source attribute of an <img> tag, or None."""
    for attr in ("data-srcset", "data-src", "data-fallback-src", "src"):
        link = image.get(attr)
        if link:
            return link
    return None


def _load_image(image_link):
    """Read an image (rasterising SVGs first) and return it as a uint8 array."""
    if Path(image_link).suffix == ".svg":
        png_bytes = cairosvg.svg2png(url=image_link)
        img = plt.imread(BytesIO(png_bytes))[:, :, :3]  # drop the alpha channel
    else:
        img = plt.imread(image_link)

    # matplotlib returns PNG data as floats in [0, 1]; convert to 0-255 uint8
    # so OCR accepts the array and `clean_img` does not divide by 255 twice.
    if np.issubdtype(img.dtype, np.floating):
        img = (img * 255).astype(np.uint8)
    return img


def extract_data(soup):
    """Build per-image features from one annotated page.

    The page text is summarized once. Every ``<img>`` whose ``alt`` attribute
    is "deceptive" or "normal" is then loaded, run through OCR, cleaned and
    compared with the summary. Images that cannot be loaded or processed are
    reported with ``print`` and skipped.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        A 6-tuple of parallel lists, one entry per successfully processed
        labelled image:
        ``(imgs, img_txts, cos_sims, y_true, sentiments, read_scores)`` where
        ``y_true`` is 1 for "deceptive" and 0 for "normal". ``sentiments`` and
        ``read_scores`` repeat the page-level value for every image so all six
        lists stay aligned and can be passed to ``list.extend``.
    """
    label_ids = {"deceptive": 1, "normal": 0}

    imgs = []
    img_txts = []
    cos_sims = []
    y_true = []
    sentiments = []
    read_scores = []

    # The summarizer concatenates its argument to a str, so it needs the
    # page's text rather than the soup object itself.
    page_text = soup.get_text(separator=" ", strip=True)
    text = summarize_webpage(page_text)

    # Page-level features are computed once, outside the image loop. The
    # local is named `polarity` so it does not shadow the `sentiment` function.
    polarity = sentiment(text)
    # NOTE(review): `readability` is not defined in the visible notebook.
    # `textstat` is installed but never imported, so presumably one of its
    # metrics was intended — confirm which and define/import it.
    read = readability(text)

    for image in soup.find_all('img', alt=True):
        alt_text = image["alt"]
        if alt_text not in label_ids:
            continue

        image_link = _first_image_link(image)
        if image_link is None:
            print("Error: <img> tag has no usable source attribute")
            continue

        try:
            img = _load_image(image_link)
            img_txt = get_data(img)
            img = clean_img(img)
            cos_sim = cos_similarity(text, img_txt)
        except Exception as e:
            print(f"Error: {e}")
            continue

        # Append only after every step succeeded so the lists stay aligned.
        imgs.append(img)
        img_txts.append(img_txt)
        cos_sims.append(cos_sim)
        y_true.append(label_ids[alt_text])
        sentiments.append(polarity)
        read_scores.append(read)

    return imgs, img_txts, cos_sims, y_true, sentiments, read_scores

### Main function to build the dataset

In [None]:
def main(f):
    """Parse one saved HTML page and run feature extraction on it.

    Args:
        f: Path of the HTML file. It is read as bytes and decoded as UTF-8.

    Returns:
        Whatever ``extract_data`` returns for the parsed page.
    """
    with open(f, "rb") as html_file:
        raw_bytes = html_file.read()

    markup = raw_bytes.decode('utf-8')
    page = BeautifulSoup(markup, 'html.parser')

    # Hand the parsed page to the feature extractor.
    return extract_data(page)


# Build the dataset by running `main` on every annotated HTML page.
ANNOTATION_DIR = Path("/content/annotation")

# NOTE(review): presumably the saved pages reference their images with paths
# relative to this directory, hence the chdir — confirm.
os.chdir(ANNOTATION_DIR)

X_img = []
X_txt = []
X_cos = []
y = []
# Named X_sentiment, not `sentiment`: a module-level list called `sentiment`
# would overwrite the `sentiment()` function that `extract_data` calls.
X_sentiment = []
read_score = []

# Sorted so the row order of the dataset is reproducible between runs.
for html_path in sorted(ANNOTATION_DIR.glob('*.html')):
    img, txt, cos_sim, label, polarity, score = main(html_path)
    X_img.extend(img)
    X_txt.extend(txt)
    X_cos.extend(cos_sim)
    y.extend(label)
    X_sentiment.extend(polarity)
    read_score.extend(score)

### Extras

In [None]:
# Number of labels collected in `y` by the dataset-building loop.
len(y)

In [None]:
# Display the similarity values collected in `X_cos` by the dataset-building loop.
X_cos

In [None]:
# Scratch check: print the visible text of a single saved page.
# The file is opened explicitly as UTF-8, the encoding `main` uses when
# decoding pages, instead of relying on the platform default.
with open("/content/annotation/What is The Rock Workout Routine_ - SET FOR SET.html", "r", encoding="utf-8") as f:
    html = f.read()

soup = BeautifulSoup(html, 'html.parser')

# Remove <script> and <style> elements so their contents are not part of the text.
for script in soup(["script", "style"]):
    script.extract()

# get text
text = soup.get_text()

# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())

# split each line on double spaces so multi-part headlines become separate chunks
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

# drop blank chunks and join the rest with single spaces
text = ' '.join(chunk for chunk in chunks if chunk)

print(text)
