# Creating a misinformation detection model using TF-IDF vectorization 
1. Compile training data from Kaggle dataset of fake and real news articles. Use the article urls to fetch the article text
2. Clean and tokenize the data
3. Create TF-IDF vectors to use in the model
4. Evaluate the performance of different classification models using the TF-IDF model
5. Implement the optimal model on the YouTube captions dataset

In [20]:
import pandas as pd

import json
import logging
import time

import requests
from newspaper import Article

import re 
import nltk.corpus
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import PassiveAggressiveClassifier

### Fetch both real and fake news articles to train the model

In [5]:
fake_df = pd.read_csv("../src/model/politifact_fake.csv")
real_df = pd.read_csv("../src/model/politifact_real.csv")

### Gather article texts 

In [6]:
def crawl_link_article(url):
    result_json = None

    try:
        if 'http' not in url:
            if url[0] == '/':
                url = url[1:]
            try:
                article = Article('http://' + url)
                article.download()
                time.sleep(2)
                article.parse()
                flag = True
            except:
                flag = False
                pass
            if flag == False:
                try:
                    article = Article('https://' + url)
                    article.download()
                    time.sleep(2)
                    article.parse()
                    flag = True
                except:
                    flag = False
                    pass
            if flag == False:
                return None
        else:
            try:
                article = Article(url)
                article.download()
                time.sleep(2)
                article.parse()
            except:
                return None

        if not article.is_parsed:
            return None

        visible_text = article.text
        top_image = article.top_image
        images = article.images
        keywords = article.keywords
        authors = article.authors
        canonical_link = article.canonical_link
        title = article.title
        meta_data = article.meta_data
        movies = article.movies
        publish_date = article.publish_date
        source = article.source_url
        summary = article.summary

        result_json = {'url': url, 'text': visible_text, 'images': list(images), 'top_img': top_image,
                       'keywords': keywords,
                       'authors': authors, 'canonical_link': canonical_link, 'title': title, 'meta_data': meta_data,
                       'movies': movies, 'publish_date': get_epoch_time(publish_date), 'source': source,
                       'summary': summary}
    except:
        return None

    return visible_text

In [7]:
def get_epoch_time(time_obj):
    if time_obj:
        return time_obj.timestamp()

    return None

In [8]:
def get_web_archieve_results(search_url):
    try:
        archieve_url = "http://web.archive.org/cdx/search/cdx?url={}&output=json".format(search_url)

        response = requests.get(archieve_url)
        response_json = json.loads(response.content)

        response_json = response_json[1:]

        return response_json

    except:
        return None

In [9]:
def get_website_url_from_arhieve(url):
    """ Get the url from http://web.archive.org/ for the passed url if exists."""
    archieve_results = get_web_archieve_results(url)
    if archieve_results:
        modified_url = "https://web.archive.org/web/{}/{}".format(archieve_results[0][1], archieve_results[0][2])
        return modified_url
    else:
        return None

In [10]:
def crawl_news_article(url):
    news_article = crawl_link_article(url)

    # If the news article could not be fetched from original website, fetch from archieve if it exists.
    if news_article is None:
        archieve_url = get_website_url_from_arhieve(url)
        if archieve_url is not None:
            news_article = crawl_link_article(archieve_url)

    return news_article

In [11]:
fake_df["article_text"] = fake_df["news_url"].apply(crawl_news_article)
fake_df["label"] = "fake"

In [12]:
fake_df.head()

Unnamed: 0,id,news_url,title,tweet_ids,article_text,label
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,,fake
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,"The West Texas Federal Appeals Court, operatin...",fake
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,Read original article here\n\nLiberals sure ar...,fake
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,The former Paralympic athlete reportedly tried...,fake
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,It's possible that Travis Barker's new profile...,fake


In [13]:
real_df["article_text"] = real_df["news_url"].apply(crawl_news_article)
real_df["label"] = "real"

news_df = pd.concat([fake_df, real_df])

In [15]:
def text_cleaning(video_caption):
    if video_caption is not None:
        # normalize case
        video_caption = video_caption.lower()
        # remove punctuation
        video_caption = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", video_caption)
        # remove stopwords
        stop = stopwords.words('english')
        video_caption = " ".join([word for word in video_caption.split() if word not in (stop)])
        # lemmenization
        wn = nltk.WordNetLemmatizer()
        video_caption = " ".join([wn.lemmatize(word) for word in video_caption.split()])
    return video_caption

def create_model_df(row):
    row = row[["article_text", "title", "label"]]
    row["article_text"] = text_cleaning(row["article_text"])
    row["title"] = text_cleaning(row["title"])
    row["label"] = int(row["label"] == "real")
    return row

model_data = news_df.apply(create_model_df, axis=1).dropna().reset_index(drop=True)
Data = model_data["article_text"]
Target = model_data["label"]

### Create TF-IDF vectors for every text

In [18]:
count_vectorizer = CountVectorizer()
count_vectorizer.fit_transform(Data)
freq_term_matrix = count_vectorizer.transform(Data)
tfidf = TfidfTransformer(norm = "l2")
tfidf.fit(freq_term_matrix)
tf_idf_matrix = tfidf.fit_transform(freq_term_matrix)

### Test/Train split of the data

In [21]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf_matrix, Target, test_size=0.2, random_state=0, stratify=Target)

### Evaluate Logistic Regression, Naive Bayes, Descision Tree, and Passive Agressive Classifiers to find the best model 

In [37]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Accuracy = logreg.score(X_test, y_test)
Accuracy

0.7048780487804879

In [27]:
NB = MultinomialNB()
NB.fit(X_train, y_train)
Accuracy = NB.score(X_test, y_test)
Accuracy

0.6341463414634146

In [28]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
Accuracy = clf.score(X_test, y_test)
Accuracy

0.7853658536585366

In [34]:
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(X_train,y_train)
y_pred=pac.predict(X_test)
Accuracy=accuracy_score(y_test,y_pred)
Accuracy

0.7902439024390244

### Use the best model (Passive Agressive Classifier) to classify YouTube captions as misinformation or not

In [None]:
video_captions = pd.read_csv("../data/caption_data.csv")
vid_data  = video_captions.dropna()['text'].apply(text_cleaning)

count_vectorizer = CountVectorizer()
count_vectorizer.fit_transform(vid_data)
freq_term_matrix = count_vectorizer.transform(vid_data)
tfidf = TfidfTransformer(norm = "l2")
tfidf.fit(freq_term_matrix)
vid_tf_idf_matrix = tfidf.fit_transform(freq_term_matrix)

vid_pred = pac.predict(vid_tf_idf_matrix)
misinformation_pct = 1 - (sum(vid_pred) / len(vid_pred))

print(f'The percentage of YouTube captions that were classified as misinformation is {misinformation_pct}')