In [1]:
!pip install unidecode
!pip install tldextract -q

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting unidecode
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 8.4 MB/s 
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.4
[K     |████████████████████████████████| 93 kB 1.5 MB/s 
[?25h

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pickle
import pandas as pd
import numpy as np
import random
import sys
import os
import tldextract
import warnings
import regex as re
from typing import *

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import svm
from sklearn.svm import LinearSVC
from imblearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns 
from nltk.tokenize import RegexpTokenizer
from urllib.parse import urlparse
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
smote = SMOTE()

warnings.filterwarnings("ignore")

## Functions

In [4]:
def parse_url(url: str):
    """Split a URL into its components, tolerating a missing scheme.

    Args:
        url: The URL to parse. Inputs that do not start with ``http://`` or
            ``https://`` are parsed as if ``http://`` had been prepended.

    Returns:
        A dict with the keys ``scheme``, ``netloc``, ``path``, ``params``,
        ``query`` and ``fragment``. ``scheme`` is ``None`` when the input
        carried no http(s) scheme. Returns ``None`` when ``url`` is not a
        string (e.g. NaN / None) or cannot be parsed.
    """
    try:
        has_scheme = url.startswith(('https://', 'http://'))
        # urlparse only fills in netloc when a scheme is present, so a
        # placeholder scheme is prepended to bare "example.com/path" inputs.
        parsed_url = urlparse(url if has_scheme else f"http://{url}")
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input; ValueError: malformed URL.
        # Best-effort parsing: the caller filters out the resulting None rows.
        return None

    return {
        # The placeholder scheme is not reported back to the caller.
        "scheme": parsed_url.scheme if has_scheme else None,
        "netloc": parsed_url.netloc,
        "path": parsed_url.path,
        "params": parsed_url.params,
        "query": parsed_url.query,
        "fragment": parsed_url.fragment,
    }

In [5]:
def get_num_subdomains(netloc: str) -> int:
    """Count the dot-separated labels in the subdomain part of ``netloc``.

    Returns 0 when tldextract reports an empty subdomain.
    """
    subdomain = tldextract.extract(netloc).subdomain
    return subdomain.count('.') + 1 if subdomain else 0

In [6]:
# Shared tokenizer: keeps runs of ASCII letters and discards everything else.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
def tokenize_domain(netloc: str) -> str:
    """Return the alphabetic tokens of ``netloc`` (public suffix excluded), space-joined."""
    parts = tldextract.extract(netloc)
    host_without_suffix = f"{parts.subdomain}.{parts.domain}"
    words = tokenizer.tokenize(host_without_suffix)
    return " ".join(str(word) for word in words)

In [7]:
class Converter(BaseEstimator, TransformerMixin):
    """Stateless transformer that flattens a DataFrame selection to a 1-D array.

    In this notebook it is placed ahead of ``TfidfVectorizer`` inside a pipeline.
    """

    def fit(self, x, y=None):
        """Nothing to learn; return the transformer unchanged."""
        return self

    def transform(self, data_frame):
        """Return the values of ``data_frame`` flattened to one dimension."""
        flattened = data_frame.values.ravel()
        return flattened

# Load Data

In [8]:
# Load the labelled link dataset from Google Drive.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if its origin is trusted.
with open("/content/drive/MyDrive/Master_Thesis_D_Amico/pickle/data.pkl", "rb") as f:
    data = pickle.load(f)

In [9]:
url_data = data.copy()
url_data = url_data[['Link', 'Label']]

# Perform Extraction

In [10]:
url_data["parsed_url"] = url_data.Link.apply(parse_url)

In [11]:
url_data = pd.concat([
    url_data.drop(['parsed_url'], axis=1),
    url_data['parsed_url'].apply(pd.Series)
], axis=1)

In [12]:
url_data = url_data[~url_data.netloc.isnull()]

In [13]:
# Surface-level URL statistics. `assign` builds a new frame instead of writing
# columns into the filtered slice produced by the previous cell, and the digit
# pattern is a raw string so that `\d` is not treated as a string escape.
url_data = url_data.assign(
    length=url_data.Link.str.len(),
    # An empty public suffix is stored as the literal string 'None'.
    tld=url_data.netloc.apply(lambda nl: tldextract.extract(nl).suffix).replace('', 'None'),
    digit=url_data.Link.str.count(r'\d'),
    slashes=url_data.path.str.count('/'),
    hypen=url_data.Link.str.count('-'),
)

In [14]:
# Number of subdomain labels in each host.
url_data['num_subdomains'] = url_data['netloc'].apply(get_num_subdomains)

In [15]:
# Alphabetic tokens of the host and of the path, each stored as one space-joined string.
url_data['domain_tokens'] = url_data['netloc'].apply(tokenize_domain)
url_data['path_tokens'] = url_data['path'].apply(
    lambda path: " ".join(str(tok) for tok in tokenizer.tokenize(path))
)

In [16]:
url_data.head(5)

Unnamed: 0,Link,Label,scheme,netloc,path,params,query,fragment,length,tld,digit,slashes,hypen,num_subdomains,domain_tokens,path_tokens
0,https://dailyhive.com/vancouver/ryan-reynolds-...,1,https,dailyhive.com,/vancouver/ryan-reynolds-covid-19-vaccination,,,,66,com,2,2,4,0,dailyhive,vancouver ryan reynolds covid vaccination
1,https://www.livescience.com/china-coronavirus-...,0,https,www.livescience.com,/china-coronavirus-death-toll-climbs.html,,,,68,com,0,1,4,1,www livescience,china coronavirus death toll climbs html
2,https://cenerva.com/the-5g-covid-19-quackery/,0,https,cenerva.com,/the-5g-covid-19-quackery/,,,,45,com,3,2,4,0,cenerva,the g covid quackery
3,https://pubmed.ncbi.nlm.nih.gov/32064853/,0,https,pubmed.ncbi.nlm.nih.gov,/32064853/,,,,41,gov,8,2,0,3,pubmed ncbi nlm nih,
4,https://journals.usm.ac.id/index.php/the-messe...,0,https,journals.usm.ac.id,/index.php/the-messenger/article/view/2282,,,,68,ac.id,4,5,1,1,journals usm,index php the messenger article view


In [17]:
domain_fake = url_data.groupby("Label").get_group(1)['domain_tokens']

In [18]:
domain_real = url_data.groupby("Label").get_group(0)['domain_tokens']

In [26]:
# Keep the target separately, then remove it together with the raw URL parts
# that have already been turned into features.
url_data_y = url_data['Label']
url_data = url_data.drop(
    columns=['Label', 'Link', 'scheme', 'netloc', 'path', 'params', 'query', 'fragment']
)

In [27]:
url_data.head(5)

Unnamed: 0,length,tld,digit,slashes,hypen,num_subdomains,domain_tokens,path_tokens
0,66,com,2,2,4,0,dailyhive,vancouver ryan reynolds covid vaccination
1,68,com,0,1,4,1,www livescience,china coronavirus death toll climbs html
2,45,com,3,2,4,0,cenerva,the g covid quackery
3,41,gov,8,2,0,3,pubmed ncbi nlm nih,
4,68,ac.id,4,5,1,1,journals usm,index php the messenger article view


In [28]:
# Load the precomputed emotion / sentiment frame from Google Drive.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if its origin is trusted.
with open("/content/drive/MyDrive/Master_Thesis_D_Amico/pickle/emotional_df_stack_groupped_pos.pkl", "rb") as f:
    dfemotion = pickle.load(f)

In [29]:
df_emo_sent = dfemotion.copy()
df_emo_sent = df_emo_sent[['Happy', 'Angry', 'Surprise', 'Sad', 'Fear', 'neg', 'neu', 'pos']]

In [30]:
df_emo_sent.head(5)

Unnamed: 0,Happy,Angry,Surprise,Sad,Fear,neg,neu,pos
0,0.0,0.0,0.33,0.33,0.33,0.0,0.833,0.167
1,0.0,0.0,0.33,0.33,0.33,0.316,0.684,0.0
2,0.0,0.0,0.17,0.17,0.67,0.257,0.743,0.0
3,0.0,0.0,0.0,0.25,0.75,0.0,0.827,0.173
4,0.0,0.0,0.25,0.25,0.5,0.0,1.0,0.0


In [31]:
# Rows with an unparsable URL were removed from url_data earlier, so the
# emotion/sentiment frame is aligned to url_data's index before concatenating.
# A plain axis=1 concat outer-joins on the index and could add rows that have
# no entry in url_data_y.
df_final = pd.concat([url_data, df_emo_sent.reindex(url_data.index)], axis=1)

In [32]:
df_final

Unnamed: 0,length,tld,digit,slashes,hypen,num_subdomains,domain_tokens,path_tokens,Happy,Angry,Surprise,Sad,Fear,neg,neu,pos
0,66,com,2,2,4,0,dailyhive,vancouver ryan reynolds covid vaccination,0.00,0.00,0.33,0.33,0.33,0.000,0.833,0.167
1,68,com,0,1,4,1,www livescience,china coronavirus death toll climbs html,0.00,0.00,0.33,0.33,0.33,0.316,0.684,0.000
2,45,com,3,2,4,0,cenerva,the g covid quackery,0.00,0.00,0.17,0.17,0.67,0.257,0.743,0.000
3,41,gov,8,2,0,3,pubmed ncbi nlm nih,,0.00,0.00,0.00,0.25,0.75,0.000,0.827,0.173
4,68,ac.id,4,5,1,1,journals usm,index php the messenger article view,0.00,0.00,0.25,0.25,0.50,0.000,1.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1157,127,com,8,5,12,0,ohiocapitaljournal,she says vaccines make you magnetized this law...,0.00,0.00,0.00,0.33,0.67,0.489,0.511,0.000
1158,109,com,6,4,11,0,leadstories,hoax alert fake news bill gates did not help f...,0.00,0.00,0.50,0.00,0.50,0.000,0.621,0.379
1159,95,com,5,1,9,1,www businessinsider,wuhan coronavirus china patients plastic tube ...,0.00,0.33,0.00,0.00,0.67,0.196,0.804,0.000
1160,108,com,0,2,12,1,www wantedinrome,news italy likely to offer third dose of covid...,0.00,0.00,0.25,0.00,0.75,0.000,0.909,0.091


# Classification

In [33]:
X_train, X_test, y_train, y_test = train_test_split(df_final, url_data_y, test_size=0.2, random_state = 1, stratify = url_data_y)

In [34]:
# Numeric columns: URL statistics plus emotion and sentiment scores, min-max scaled.
numeric_features = [
    'length', 'slashes', 'digit', 'hypen',
    'Happy', 'Angry', 'Surprise', 'Sad', 'Fear',
    'neg', 'neu', 'pos',
]
numeric_transformer = Pipeline([('scaler', MinMaxScaler())])

# Categorical column: one-hot encoded; categories unseen during fit are ignored.
categorical_features = ['tld']
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Text columns: flattened to 1-D by Converter, then TF-IDF vectorised.
# NOTE(review): this list is not referenced by the ColumnTransformer below, so
# 'num_subdomains' does not appear to reach the model - confirm this is intended.
vectorizer_features = ['domain_tokens', 'path_tokens', 'num_subdomains']
vectorizer_transformer = Pipeline([('con', Converter()), ('tf', TfidfVectorizer())])

In [39]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('domvec', vectorizer_transformer, ['domain_tokens']),
        ('pathvec', vectorizer_transformer, ['path_tokens'])
    ])


In [425]:

def _build_classifier_pipeline(classifier):
    """Chain the shared preprocessor, SMOTE oversampling and ``classifier``."""
    return Pipeline(steps=[('preprocessor', preprocessor),
                           # Seeded so the synthetic samples, and therefore the
                           # reported scores, are the same on every run.
                           ('smote', SMOTE(random_state=1)),
                           ('classifier', classifier)])


svc_clf = _build_classifier_pipeline(SVC(random_state=1, C = 1, kernel = 'rbf'))

log_clf = _build_classifier_pipeline(LogisticRegression(solver='lbfgs', C = 1, penalty = 'l2'))

nb_clf = _build_classifier_pipeline(MultinomialNB())

rf_clf = _build_classifier_pipeline(RandomForestClassifier(random_state=1, n_estimators = 200))

In [426]:
svc_clf.fit(X_train, y_train)
log_clf.fit(X_train, y_train)
nb_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   MinMaxScaler())]),
                                                  ['length', 'slashes', 'digit',
                                                   'hypen', 'Happy', 'Angry',
                                                   'Surprise', 'Sad', 'Fear',
                                                   'neg', 'neu', 'pos']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['tld']),
                                                 ('domvec',
                                                  Pipeline(steps=[('con'

In [421]:
def results(name: str, model: BaseEstimator) -> None:
    """Print the test-set score and classification report of a fitted model.

    Reads the module-level ``X_test`` / ``y_test`` split.

    Args:
        name: Label printed in front of the score.
        model: Fitted estimator exposing ``predict`` and ``score``.
    """
    predictions = model.predict(X_test)
    test_score = model.score(X_test, y_test)

    print(name + " score: %.3f" % test_score)
    print(classification_report(y_test, predictions))

In [427]:
# Report every fitted model on the held-out split, in a fixed order.
for model_name, fitted_model in (
    ("SVC", svc_clf),
    ("Logistic Regression", log_clf),
    ("Naive Bayes", nb_clf),
    ("Random Forest", rf_clf),
):
    results(model_name, fitted_model)

SVC score: 0.940
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       206
           1       0.88      0.56      0.68        27

    accuracy                           0.94       233
   macro avg       0.91      0.77      0.82       233
weighted avg       0.94      0.94      0.93       233

Logistic Regression score: 0.931
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       206
           1       0.74      0.63      0.68        27

    accuracy                           0.93       233
   macro avg       0.85      0.80      0.82       233
weighted avg       0.93      0.93      0.93       233

Naive Bayes score: 0.893
              precision    recall  f1-score   support

           0       0.97      0.90      0.94       206
           1       0.52      0.81      0.64        27

    accuracy                           0.89       233
   macro avg       0.75      0.86      0.79       233

# TEST

In [69]:
!pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [73]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from unidecode import unidecode
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
stop_words = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [74]:
def clean_text(text):
    """Normalise raw text into a space-joined string of lemmatised tokens.

    Steps: lowercase, drop ordinals and every non-letter (except the "5" of
    "5g"), rewrite "5g" as "fiveg", tokenise, drop English stop words and
    short tokens, lemmatise.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens (each longer than two characters) joined by spaces.
    """
    # Lowercase and trim surrounding whitespace.
    text = text.lower().strip()
    # Remove ordinal numbers such as "1st" or "22nd".
    text = re.sub(r"([0-9]+(?:st|nd|rd|th)+)", " ", text)
    # Remove digits and punctuation. The lookahead spares the "5" of "5g"
    # because 5g is part of the main search keyword.
    text = re.sub(r"(?!5g)[^a-zA-Z]", " ", text)
    # Collapse runs of whitespace.
    text = re.sub(r"\s+", " ", text)
    # Any digit left at this point (the "5" of "5g") becomes "num".
    text = re.sub(r"\d+(\.\d+)?", "num", text)
    # Remove single-letter words.
    text = re.sub(r"\b[a-zA-Z]\b", " ", text)

    # NOTE(review): non-ASCII letters have already been replaced by spaces
    # above, so unidecode has nothing left to transliterate here - confirm
    # whether it was meant to run before the letter filter.
    text = unidecode(re.sub(r"\s+", " ", text.strip()))
    text = text.strip()

    # "5g" became "numg" above; give it a readable token so it survives as a feature.
    text = text.replace("numg", "fiveg")

    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words and len(token) > 1
    ]

    # Lemmatisation can shorten a token, so the length filter runs on the lemmas.
    return ' '.join(w for w in lemmas if len(w) > 2)

In [None]:
!pip install text2emotion
import text2emotion as te
def emotion_detection(sents):
    """Return the text2emotion scores for the text ``sents``.

    Thin wrapper that passes the text to ``te.get_emotion`` and returns its
    result unchanged.
    """
    return te.get_emotion(sents)

In [102]:
# Generating the Emotions 
def generate_emotions(data_dup):
    """Append the emotion scores of each row's text to the input frame.

    Args:
        data_dup: DataFrame whose second column (position 1) holds the text
            to score.

    Returns:
        ``data_dup`` with its index moved into an ``index`` column, joined
        with one emotion-score column per emotion and a ``Text_cleaned``
        column repeating the scored text.
    """
    emotion_list = []
    for _, row in data_dup.iterrows():
        # Explicit positional access: a bare integer key on a label-indexed
        # Series is deprecated in pandas.
        text = row.iloc[1]
        emotion_dict = emotion_detection(text)
        emotion_dict['Text_cleaned'] = text
        emotion_list.append(emotion_dict)

    # Give the emotion rows the same index as their source rows so the merge
    # below pairs each text with its own scores even when data_dup does not
    # carry a default 0..n-1 index.
    emotion_df = pd.DataFrame(emotion_list, index=data_dup.index).reset_index()
    data_dup = data_dup.reset_index()
    horizontal_stack = data_dup.merge(emotion_df, how='inner', on='index')

    return horizontal_stack

In [460]:
url_ = 'https://www.washingtonpost.com/world/asia_pacific/china-virus-surge-in-new-cases-raises-concerns-about-human-transmission-ahead-of-holiday-travel-season/2020/01/20/06d077fc-3b6a-11ea-971f-4ce4f94494b4_story.html'
# 'https://rumble.com/v10mnew-live-world-premiere-watch-the-water.html'

In [461]:
# importing the modules
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the page. The context manager closes the connection, and the timeout
# keeps a stalled server from hanging the notebook.
with urlopen(url_, timeout=30) as response:
    # An explicit parser avoids bs4's "no parser specified" warning and keeps
    # parsing consistent across environments.
    soup = BeautifulSoup(response.read(), "html.parser")

# displaying the title
print("Title of the website is : ")
print(soup.title.get_text())

Title of the website is : 
China virus: Coronavirus cases surge ahead of Spring Festival travel - The Washington Post


In [462]:
prova = [url_]
prova = pd.DataFrame(prova, columns = ['url'])
prova["parsed_url"] = prova.url.apply(parse_url)

In [463]:
prova['title'] = soup.title.get_text()

In [464]:
prova['title'] = prova['title'].apply(lambda x: clean_text(x))

In [465]:
prova = pd.concat([
    prova.drop(['parsed_url'], axis=1),
    prova['parsed_url'].apply(pd.Series)
], axis=1)
prova

Unnamed: 0,url,title,scheme,netloc,path,params,query,fragment
0,https://www.washingtonpost.com/world/asia_paci...,china virus coronavirus case surge ahead sprin...,https,www.washingtonpost.com,/world/asia_pacific/china-virus-surge-in-new-c...,,,


In [466]:
# Score the cleaned title of every row. The column is selected by name rather
# than by its position in the row, which is deprecated in pandas and breaks
# silently if the column order changes.
emotion_list = [emotion_detection(title) for title in prova['title']]

In [467]:
emotion_list

[{'Angry': 0.0, 'Fear': 0.0, 'Happy': 0.0, 'Sad': 0.5, 'Surprise': 0.5}]

In [468]:
emotion_df = pd.DataFrame(emotion_list)

In [469]:
emotion_df

Unnamed: 0,Happy,Angry,Surprise,Sad,Fear
0,0.0,0.0,0.5,0.5,0.0


In [470]:
prova = pd.concat([prova, emotion_df], axis = 1)

In [471]:
prova

Unnamed: 0,url,title,scheme,netloc,path,params,query,fragment,Happy,Angry,Surprise,Sad,Fear
0,https://www.washingtonpost.com/world/asia_paci...,china virus coronavirus case surge ahead sprin...,https,www.washingtonpost.com,/world/asia_pacific/china-virus-surge-in-new-c...,,,,0.0,0.0,0.5,0.5,0.0


In [472]:
# The import must come before the first use, otherwise a fresh kernel raises NameError.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [473]:
# VADER polarity of the cleaned title of every row. The column is selected by
# name rather than by its position in the row, which is deprecated in pandas.
polarity_Score = [sid.polarity_scores(title) for title in prova['title']]

In [474]:
polarity_Score

[{'compound': 0.4939, 'neg': 0.0, 'neu': 0.758, 'pos': 0.242}]

In [475]:
polarity_Score = pd.DataFrame(polarity_Score)

In [476]:
prova = pd.concat([prova, polarity_Score], axis = 1)

In [477]:
prova = prova.drop(['compound'], axis = 1)

In [478]:
prova

Unnamed: 0,url,title,scheme,netloc,path,params,query,fragment,Happy,Angry,Surprise,Sad,Fear,neg,neu,pos
0,https://www.washingtonpost.com/world/asia_paci...,china virus coronavirus case surge ahead sprin...,https,www.washingtonpost.com,/world/asia_pacific/china-virus-surge-in-new-c...,,,,0.0,0.0,0.5,0.5,0.0,0.0,0.758,0.242


In [479]:
# Same surface-level URL statistics as computed for the training data. The
# digit pattern is a raw string so that `\d` is not treated as a string escape.
prova = prova.assign(
    length=prova.url.str.len(),
    # An empty public suffix is stored as the literal string 'None'.
    tld=prova.netloc.apply(lambda nl: tldextract.extract(nl).suffix).replace('', 'None'),
    slashes=prova.path.str.count('/'),
    digit=prova.url.str.count(r'\d'),
    hypen=prova.url.str.count('-'),
)

In [480]:
prova

Unnamed: 0,url,title,scheme,netloc,path,params,query,fragment,Happy,Angry,...,Sad,Fear,neg,neu,pos,length,tld,slashes,digit,hypen
0,https://www.washingtonpost.com/world/asia_paci...,china virus coronavirus case surge ahead sprin...,https,www.washingtonpost.com,/world/asia_pacific/china-virus-surge-in-new-c...,,,,0.0,0.0,...,0.5,0.0,0.0,0.758,0.242,211,com,7,28,19


In [482]:
# Host-derived and path-derived features for the URL under test.
prova['num_subdomains'] = prova['netloc'].apply(get_num_subdomains)
prova['domain_tokens'] = prova['netloc'].apply(tokenize_domain)
prova['path_tokens'] = prova['path'].apply(
    lambda path: " ".join(str(tok) for tok in tokenizer.tokenize(path))
)

In [483]:
# Remove the raw URL parts and the title now that their features are extracted.
prova = prova.drop(
    columns=['url', 'scheme', 'netloc', 'path', 'params', 'query', 'fragment', 'title']
)

In [484]:
prova

Unnamed: 0,Happy,Angry,Surprise,Sad,Fear,neg,neu,pos,length,tld,slashes,digit,hypen,num_subdomains,domain_tokens,path_tokens
0,0.0,0.0,0.5,0.5,0.0,0.0,0.758,0.242,211,com,7,28,19,1,www washingtonpost,world asia pacific china virus surge in new ca...


In [485]:
# Domains seen during training, built once instead of on every iteration.
known_fake_domains = set(domain_fake)
known_real_domains = set(domain_real)

for i in range(len(prova)):
    domain = prova['domain_tokens'].iloc[i]
    # A domain already labelled in the training data is answered directly
    # (fake takes precedence); the classifier is used only for unseen domains.
    if domain in known_fake_domains:
        print('Keep Attention!')
    elif domain in known_real_domains:
        print('Ok, it is safe')
    else:
        # Predict this row only, not the whole frame.
        print(svc_clf.predict(prova.iloc[[i]]))

Ok, it is safe




---

