## Feature Selection

In [3]:
import os
import re
import nltk
import numpy as np
import pandas as pd
from sklearn import feature_extraction
from tqdm import tqdm

In [4]:
_wnl = nltk.WordNetLemmatizer()


def normalize_word(w):
    return _wnl.lemmatize(w).lower()


def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]


def clean(s):
    # Cleans a string: Lowercasing, trimming, removing non-alphanumeric

    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()


def remove_stopwords(l):
    # Removes stopwords from a list of tokens
    return [w for w in l if w not in feature_extraction.text.ENGLISH_STOP_WORDS]


def gen_or_load_feats(feat_fn, headlines, bodies, feature_file):
    if not os.path.isfile(feature_file):
        feats = feat_fn(headlines, bodies)
        np.save(feature_file, feats)

    return np.load(feature_file)

In [151]:
def word_overlap_features(headlines, bodies):
    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        clean_headline = clean(headline)
        clean_body = clean(body)
        clean_headline = get_tokenized_lemmas(clean_headline)
        clean_body = get_tokenized_lemmas(clean_body)
        features = [
            len(set(clean_headline).intersection(clean_body)) / float(len(set(clean_headline).union(clean_body)))]
        X.append(features)
    return X

In [34]:

def refuting_features(headlines, bodies):
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        # 'refute',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]
    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        clean_headline = clean(headline)
        clean_headline = get_tokenized_lemmas(clean_headline)
        features = [1 if word in clean_headline else 0 for word in _refuting_words]
        X.append(features)
    return X

In [35]:
def polarity_features(headlines, bodies):
    _refuting_words = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'deny', 'denies',
        'not',
        'despite',
        'nope',
        'doubt', 'doubts',
        'bogus',
        'debunk',
        'pranks',
        'retract'
    ]

    def calculate_polarity(text):
        tokens = get_tokenized_lemmas(text)
        return sum([t in _refuting_words for t in tokens]) % 2
    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        clean_headline = clean(headline)
        clean_body = clean(body)
        features = []
        features.append(calculate_polarity(clean_headline))
        features.append(calculate_polarity(clean_body))
        X.append(features)
    return np.array(X)

In [36]:

def ngrams(input, n):
    input = input.split(' ')
    output = []
    for i in range(len(input) - n + 1):
        output.append(input[i:i + n])
    return output


def chargrams(input, n):
    output = []
    for i in range(len(input) - n + 1):
        output.append(input[i:i + n])
    return output

In [37]:

def append_chargrams(features, text_headline, text_body, size):
    grams = [' '.join(x) for x in chargrams(" ".join(remove_stopwords(text_headline.split())), size)]
    grams_hits = 0
    grams_early_hits = 0
    grams_first_hits = 0
    for gram in grams:
        if gram in text_body:
            grams_hits += 1
        if gram in text_body[:255]:
            grams_early_hits += 1
        if gram in text_body[:100]:
            grams_first_hits += 1
    features.append(grams_hits)
    features.append(grams_early_hits)
    features.append(grams_first_hits)
    return features


def append_ngrams(features, text_headline, text_body, size):
    grams = [' '.join(x) for x in ngrams(text_headline, size)]
    grams_hits = 0
    grams_early_hits = 0
    for gram in grams:
        if gram in text_body:
            grams_hits += 1
        if gram in text_body[:255]:
            grams_early_hits += 1
    features.append(grams_hits)
    features.append(grams_early_hits)
    return features

In [38]:

def hand_features(headlines, bodies):

    def binary_co_occurence(headline, body):
        # Count how many times a token in the title
        # appears in the body text.
        bin_count = 0
        bin_count_early = 0
        for headline_token in clean(headline).split(" "):
            if headline_token in clean(body):
                bin_count += 1
            if headline_token in clean(body)[:255]:
                bin_count_early += 1
        return [bin_count, bin_count_early]

    def binary_co_occurence_stops(headline, body):
        # Count how many times a token in the title
        # appears in the body text. Stopwords in the title
        # are ignored.
        bin_count = 0
        bin_count_early = 0
        for headline_token in remove_stopwords(clean(headline).split(" ")):
            if headline_token in clean(body):
                bin_count += 1
                bin_count_early += 1
        return [bin_count, bin_count_early]

    def count_grams(headline, body):
        # Count how many times an n-gram of the title
        # appears in the entire body, and intro paragraph

        clean_body = clean(body)
        clean_headline = clean(headline)
        features = []
        features = append_chargrams(features, clean_headline, clean_body, 2)
        features = append_chargrams(features, clean_headline, clean_body, 8)
        features = append_chargrams(features, clean_headline, clean_body, 4)
        features = append_chargrams(features, clean_headline, clean_body, 16)
        features = append_ngrams(features, clean_headline, clean_body, 2)
        features = append_ngrams(features, clean_headline, clean_body, 3)
        features = append_ngrams(features, clean_headline, clean_body, 4)
        features = append_ngrams(features, clean_headline, clean_body, 5)
        features = append_ngrams(features, clean_headline, clean_body, 6)
        return features

    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        X.append(binary_co_occurence(headline, body)
                 + binary_co_occurence_stops(headline, body)
                 + count_grams(headline, body))


    return X

## Fake News Challenge

In [39]:
import sys
import numpy as np
import random
import re
import argparse

from collections import defaultdict
from sklearn.ensemble import GradientBoostingClassifier
from csv import DictReader

#Import from other files
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission
from utils.system import parse_params, check_version

### Reading Dataset

In [40]:
class DataSet():
    def __init__(self, name="train", path="fnc-1"):
        self.path = path

        print("Reading dataset")
        bodies = name+"_bodies.csv"
        stances = name+"_stances.csv"

        self.stances = self.read(stances)
        
        articles = self.read(bodies)
        self.articles = dict()

        #make the body ID an integer value
        for s in self.stances:
            s['Body ID'] = int(s['Body ID'])
        
        #copy all bodies into a dictionary
        for article in articles:
            self.articles[int(article['Body ID'])] = article['articleBody']

        print("Total stances: " + str(len(self.stances)))
        print("Total bodies: " + str(len(self.articles)))



    def read(self,filename):
        rows = []
        with open(self.path + "/" + filename, "rt", encoding='utf-8-sig') as table:
            r = DictReader(table)
            for line in r:
                rows.append(line)
        return rows

### Generate test splits 

In [41]:
def generate_hold_out_split (dataset, training = 0.8, base_dir="splits"):
    r = random.Random()
    r.seed(1489215)

    article_ids = list(dataset.articles.keys())  # get a list of article ids
    r.shuffle(article_ids)  # and shuffle that list


    training_ids = article_ids[:int(training * len(article_ids))]
    hold_out_ids = article_ids[int(training * len(article_ids)):]

    # write the split body ids out to files for future use
    with open(base_dir+ "/"+ "training_ids.txt", "w+") as f:
        f.write("\n".join([str(id) for id in training_ids]))

    with open(base_dir+ "/"+ "hold_out_ids.txt", "w+") as f:
        f.write("\n".join([str(id) for id in hold_out_ids]))



def read_ids(file,base):
    ids = []
    with open(base+"/"+file,"r") as f:
        for line in f:
           ids.append(int(line))
        return ids


def kfold_split(dataset, training = 0.8, n_folds = 10, base_dir="splits"):
    if not (os.path.exists(base_dir+ "/"+ "training_ids.txt")
            and os.path.exists(base_dir+ "/"+ "hold_out_ids.txt")):
        generate_hold_out_split(dataset,training,base_dir)

    training_ids = read_ids("training_ids.txt", base_dir)
    hold_out_ids = read_ids("hold_out_ids.txt", base_dir)

    folds = []
    for k in range(n_folds):
        folds.append(training_ids[int(k*len(training_ids)/n_folds):int((k+1)*len(training_ids)/n_folds)])

    return folds,hold_out_ids


def get_stances_for_folds(dataset,folds,hold_out):
    stances_folds = defaultdict(list)
    stances_hold_out = []
    for stance in dataset.stances:
        if stance['Body ID'] in hold_out:
            stances_hold_out.append(stance)
        else:
            fold_id = 0
            for fold in folds:
                if stance['Body ID'] in fold:
                    stances_folds[fold_id].append(stance)
                fold_id += 1

    return stances_folds,stances_hold_out

### Score

In [42]:
#Adapted from https://github.com/FakeNewsChallenge/fnc-1/blob/master/scorer.py
#Original credit - @bgalbraith

LABELS = ['agree', 'disagree', 'discuss', 'unrelated']
LABELS_RELATED = ['unrelated','related']
RELATED = LABELS[0:3]

def score_submission(gold_labels, test_labels):
    score = 0.0
    cm = [[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]]

    for i, (g, t) in enumerate(zip(gold_labels, test_labels)):
        g_stance, t_stance = g, t
        if g_stance == t_stance:
            score += 0.25
            if g_stance != 'unrelated':
                score += 0.50
        if g_stance in RELATED and t_stance in RELATED:
            score += 0.25

        cm[LABELS.index(g_stance)][LABELS.index(t_stance)] += 1

    return score, cm


def print_confusion_matrix(cm):
    lines = []
    header = "|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format('', *LABELS)
    line_len = len(header)
    lines.append("-"*line_len)
    lines.append(header)
    lines.append("-"*line_len)

    hit = 0
    total = 0
    for i, row in enumerate(cm):
        hit += row[i]
        total += sum(row)
        lines.append("|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format(LABELS[i],
                                                                   *row))
        lines.append("-"*line_len)
    print('\n'.join(lines))


def report_score(actual,predicted):
    score,cm = score_submission(actual,predicted)
    best_score, _ = score_submission(actual,actual)

    print_confusion_matrix(cm)
    print("Score: " +str(score) + " out of " + str(best_score) + "\t("+str(score*100/best_score) + "%)")
    return score*100/best_score


if __name__ == "__main__":
    actual = [0,0,0,0,1,1,0,3,3]
    predicted = [0,0,0,0,1,1,2,3,3]

    report_score([LABELS[e] for e in actual],[LABELS[e] for e in predicted])

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |     4     |     0     |     1     |     0     |
-------------------------------------------------------------
| disagree  |     0     |     2     |     0     |     0     |
-------------------------------------------------------------
|  discuss  |     0     |     0     |     0     |     0     |
-------------------------------------------------------------
| unrelated |     0     |     0     |     0     |     2     |
-------------------------------------------------------------
Score: 6.75 out of 7.5	(90.0%)


### Systems

In [43]:
def parse_params():
    parser = argparse.ArgumentParser(description='FakeNewsChallenge fnc-1-baseline')
    parser.add_argument('-c', '--clean-cache', action='store_true', default=False, help="clean cache files")
#    params = parser.parse_args()
#    if not params.clean_cache:
#        return

    dr = "features"
    for f in os.listdir(dr):
        if re.search('\.npy$', f):
            fname = os.path.join(dr, f)
            os.remove(fname)
    for f in ['hold_out_ids.txt', 'training_ids.txt']:
        fname = os.path.join('splits', f)
        if os.path.isfile(fname):
            os.remove(fname)
    print("All clear")

def check_version():
    if sys.version_info.major < 3:
        sys.stderr.write('Please use Python version 3 and above\n')
        sys.exit(1)

#### Generate Features

In [44]:
def generate_features(stances,dataset,name):
    h, b, y = [],[],[]

    for stance in stances:
        y.append(LABELS.index(stance['Stance']))
        h.append(stance['Headline'])
        b.append(dataset.articles[stance['Body ID']])

    X_overlap = gen_or_load_feats(word_overlap_features, h, b, "features/overlap."+name+".npy")
    X_refuting = gen_or_load_feats(refuting_features, h, b, "features/refuting."+name+".npy")
    X_polarity = gen_or_load_feats(polarity_features, h, b, "features/polarity."+name+".npy")
    X_hand = gen_or_load_feats(hand_features, h, b, "features/hand."+name+".npy")

    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X,y



In [75]:
if __name__ == "__main__":
    check_version()
    parse_params()


    #Load the training dataset and generate folds
    d = DataSet()
    folds,hold_out = kfold_split(d,n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d,folds,hold_out)
    
        # Load the competition dataset
    competition_dataset = DataSet("competition_test")
    X_competition, y_competition = generate_features(competition_dataset.stances, competition_dataset, "competition")

    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout,y_holdout = generate_features(hold_out_stances,d,"holdout")
    for fold in fold_stances:
        Xs[fold],ys[fold] = generate_features(fold_stances[fold],d,str(fold))


    best_score = 0
    best_fold = None
    
# Classifier for each fold
## Uncomment this for loop code when you run this notebook for the first time. Takes over 30 mins to run.
    for fold in fold_stances:
        ids = list(range(len(folds)))
        del ids[fold]

        X_train = np.vstack(tuple([Xs[i] for i in ids]))
        y_train = np.hstack(tuple([ys[i] for i in ids]))

        X_test = Xs[fold]
        y_test = ys[fold]

        clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=False)
        clf.fit(X_train, y_train)

        predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual = [LABELS[int(a)] for a in y_test]

        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)

        score = fold_score/max_fold_score

        print("Score for fold "+ str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf
#-----------------



    #Run on Holdout set and report the final score on the holdout set
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS[int(a)] for a in y_holdout]

    print("Scores on the dev set")
    report_score(actual,predicted)
    print("")
    print("")

    #Run on competition dataset
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_competition)]
    actual = [LABELS[int(a)] for a in y_competition]

    print("Scores on the test set")
    report_score(actual,predicted)
            

All clear
Reading dataset
Total stances: 49972
Total bodies: 1683
Reading dataset



0it [00:00, ?it/s][A
32it [00:00, 315.20it/s][A

Total stances: 25413
Total bodies: 904



65it [00:00, 318.38it/s][A
95it [00:00, 312.48it/s][A
134it [00:00, 331.81it/s][A
164it [00:00, 319.21it/s][A
201it [00:00, 331.76it/s][A
240it [00:00, 344.39it/s][A
273it [00:00, 326.92it/s][A
305it [00:00, 300.27it/s][A
337it [00:01, 304.45it/s][A
373it [00:01, 318.82it/s][A
405it [00:01, 315.25it/s][A
440it [00:01, 323.67it/s][A
473it [00:01, 310.86it/s][A
505it [00:01, 309.62it/s][A
543it [00:01, 326.62it/s][A
576it [00:01, 305.87it/s][A
611it [00:01, 315.63it/s][A
649it [00:02, 331.92it/s][A
683it [00:02, 314.95it/s][A
716it [00:02, 302.47it/s][A
750it [00:02, 312.03it/s][A
789it [00:02, 331.84it/s][A
828it [00:02, 347.34it/s][A
864it [00:02, 340.45it/s][A
899it [00:02, 335.27it/s][A
933it [00:02, 324.73it/s][A
969it [00:02, 332.85it/s][A
1003it [00:03, 295.19it/s][A
1034it [00:03, 285.73it/s][A
1064it [00:03, 272.67it/s][A
1092it [00:03, 267.80it/s][A
1121it [00:03, 272.98it/s][A
1149it [00:03, 243.34it/s][A
1179it [00:03, 257.82it/s][A
1207it [

18004it [00:59, 365.05it/s][A
18045it [00:59, 373.95it/s][A
18083it [00:59, 324.17it/s][A
18122it [00:59, 339.08it/s][A
18158it [01:00, 338.35it/s][A
18196it [01:00, 343.66it/s][A
18231it [01:00, 323.73it/s][A
18278it [01:00, 356.49it/s][A
18316it [01:00, 356.88it/s][A
18353it [01:00, 360.21it/s][A
18398it [01:00, 380.97it/s][A
18437it [01:00, 378.68it/s][A
18479it [01:00, 388.42it/s][A
18524it [01:00, 404.87it/s][A
18565it [01:01, 398.23it/s][A
18606it [01:01, 365.15it/s][A
18649it [01:01, 382.26it/s][A
18688it [01:01, 380.38it/s][A
18729it [01:01, 376.58it/s][A
18768it [01:01, 370.76it/s][A
18807it [01:01, 375.15it/s][A
18845it [01:01, 369.19it/s][A
18885it [01:01, 369.21it/s][A
18932it [01:02, 388.85it/s][A
18977it [01:02, 405.07it/s][A
19018it [01:02, 403.43it/s][A
19059it [01:02, 400.02it/s][A
19100it [01:02, 395.02it/s][A
19141it [01:02, 390.91it/s][A
19181it [01:02, 363.12it/s][A
19218it [01:02, 346.46it/s][A
19254it [01:02, 341.09it/s][A
19290it 

10426it [00:30, 313.22it/s][A
10458it [00:30, 306.07it/s][A
10493it [00:30, 317.79it/s][A
10535it [00:30, 342.67it/s][A
10571it [00:30, 342.75it/s][A
10616it [00:31, 368.51it/s][A
10654it [00:31, 342.38it/s][A
10690it [00:31, 347.34it/s][A
10726it [00:31, 336.00it/s][A
10761it [00:31, 331.02it/s][A
10797it [00:31, 338.32it/s][A
10832it [00:31, 336.87it/s][A
10868it [00:31, 343.29it/s][A
10903it [00:31, 332.07it/s][A
10937it [00:31, 321.79it/s][A
10970it [00:32, 308.51it/s][A
11009it [00:32, 329.10it/s][A
11053it [00:32, 354.23it/s][A
11090it [00:32, 352.90it/s][A
11126it [00:32, 350.47it/s][A
11163it [00:32, 348.35it/s][A
11201it [00:32, 356.64it/s][A
11237it [00:32, 341.22it/s][A
11273it [00:32, 345.50it/s][A
11308it [00:33, 320.65it/s][A
11345it [00:33, 328.19it/s][A
11383it [00:33, 339.92it/s][A
11418it [00:33, 338.98it/s][A
11453it [00:33, 335.21it/s][A
11493it [00:33, 351.59it/s][A
11531it [00:33, 359.56it/s][A
11568it [00:33, 358.10it/s][A
11605it 

1563it [00:08, 149.94it/s][A
1588it [00:08, 170.32it/s][A
1613it [00:08, 187.05it/s][A
1635it [00:08, 193.87it/s][A
1656it [00:08, 195.16it/s][A
1677it [00:08, 195.05it/s][A
1703it [00:08, 210.18it/s][A
1728it [00:08, 216.94it/s][A
1753it [00:08, 224.75it/s][A
1779it [00:09, 232.67it/s][A
1803it [00:09, 233.61it/s][A
1829it [00:09, 235.43it/s][A
1859it [00:09, 247.88it/s][A
1885it [00:09, 235.30it/s][A
1909it [00:09, 214.85it/s][A
1932it [00:09, 211.37it/s][A
1960it [00:09, 226.36it/s][A
1984it [00:09, 224.75it/s][A
2007it [00:10, 220.37it/s][A
2030it [00:10, 183.05it/s][A
2056it [00:10, 199.92it/s][A
2078it [00:10, 177.72it/s][A
2101it [00:10, 186.51it/s][A
2125it [00:10, 196.10it/s][A
2146it [00:10, 188.64it/s][A
2169it [00:10, 197.20it/s][A
2190it [00:11, 196.50it/s][A
2211it [00:11, 191.56it/s][A
2231it [00:11, 188.20it/s][A
2263it [00:11, 214.70it/s][A
2286it [00:11, 212.88it/s][A
2318it [00:11, 235.10it/s][A
2343it [00:11, 223.36it/s][A
2367it [00

16266it [01:08, 189.12it/s][A
16295it [01:08, 208.61it/s][A
16320it [01:08, 219.05it/s][A
16345it [01:08, 139.36it/s][A
16374it [01:08, 164.43it/s][A
16406it [01:09, 192.29it/s][A
16433it [01:09, 210.12it/s][A
16461it [01:09, 225.69it/s][A
16499it [01:09, 252.79it/s][A
16528it [01:09, 213.54it/s][A
16558it [01:09, 233.12it/s][A
16585it [01:09, 233.57it/s][A
16611it [01:09, 240.73it/s][A
16640it [01:09, 251.50it/s][A
16667it [01:10, 250.19it/s][A
16697it [01:10, 260.45it/s][A
16724it [01:10, 155.07it/s][A
16758it [01:10, 184.84it/s][A
16785it [01:10, 203.88it/s][A
16811it [01:10, 216.91it/s][A
16839it [01:10, 232.54it/s][A
16867it [01:11, 244.56it/s][A
16894it [01:11, 225.10it/s][A
16919it [01:11, 222.98it/s][A
16943it [01:11, 215.77it/s][A
16966it [01:11, 205.28it/s][A
16988it [01:11, 209.09it/s][A
17010it [01:11, 191.88it/s][A
17030it [01:11, 187.07it/s][A
17057it [01:11, 206.04it/s][A
17087it [01:12, 226.47it/s][A
17111it [01:12, 222.70it/s][A
17144it 

7519it [00:22, 323.91it/s][A
7553it [00:22, 320.50it/s][A
7589it [00:22, 330.38it/s][A
7626it [00:22, 340.96it/s][A
7661it [00:22, 317.88it/s][A
7694it [00:22, 306.77it/s][A
7726it [00:22, 283.31it/s][A
7756it [00:23, 278.44it/s][A
7786it [00:23, 282.80it/s][A
7815it [00:23, 283.30it/s][A
7852it [00:23, 303.06it/s][A
7883it [00:23, 296.03it/s][A
7914it [00:23, 290.85it/s][A
7946it [00:23, 298.58it/s][A
7977it [00:23, 296.70it/s][A
8009it [00:23, 297.77it/s][A
8048it [00:23, 318.40it/s][A
8081it [00:24, 320.32it/s][A
8118it [00:24, 333.56it/s][A
8159it [00:24, 352.17it/s][A
8197it [00:24, 354.82it/s][A
8233it [00:24, 348.24it/s][A
8269it [00:24, 345.66it/s][A
8309it [00:24, 357.49it/s][A
8346it [00:24, 352.68it/s][A
8389it [00:24, 372.35it/s][A
8427it [00:25, 372.19it/s][A
8465it [00:25, 357.78it/s][A
8502it [00:25, 356.72it/s][A
8538it [00:25, 345.61it/s][A
8575it [00:25, 352.38it/s][A
8611it [00:25, 340.51it/s][A
8648it [00:25, 344.80it/s][A
8683it [00

3707it [00:17, 219.21it/s][A
3730it [00:17, 216.71it/s][A
3755it [00:17, 225.66it/s][A
3784it [00:17, 240.16it/s][A
3809it [00:18, 241.57it/s][A
3836it [00:18, 248.06it/s][A
3862it [00:18, 232.87it/s][A
3886it [00:18, 233.98it/s][A
3910it [00:18, 232.89it/s][A
3934it [00:18, 224.37it/s][A
3957it [00:18, 217.99it/s][A
3984it [00:18, 229.40it/s][A
4009it [00:18, 231.07it/s][A
4033it [00:19, 225.88it/s][A
4056it [00:19, 226.35it/s][A
4080it [00:19, 224.99it/s][A
4104it [00:19, 228.76it/s][A
4127it [00:19, 210.51it/s][A
4150it [00:19, 215.22it/s][A
4172it [00:19, 210.68it/s][A
4194it [00:19, 209.15it/s][A
4217it [00:19, 211.47it/s][A
4247it [00:19, 230.20it/s][A
4271it [00:20, 219.54it/s][A
4294it [00:20, 220.14it/s][A
4321it [00:20, 231.34it/s][A
4345it [00:20, 231.26it/s][A
4370it [00:20, 228.71it/s][A
4394it [00:20, 230.81it/s][A
4419it [00:20, 235.92it/s][A
4452it [00:20, 254.97it/s][A
4479it [00:20, 250.63it/s][A
4508it [00:21, 260.53it/s][A
4535it [00

491it [00:02, 220.59it/s][A
514it [00:02, 204.75it/s][A
539it [00:02, 210.56it/s][A
564it [00:02, 219.28it/s][A
587it [00:02, 203.89it/s][A
611it [00:02, 212.54it/s][A
633it [00:02, 203.14it/s][A
657it [00:02, 211.38it/s][A
679it [00:03, 196.11it/s][A
700it [00:03, 198.85it/s][A
724it [00:03, 209.43it/s][A
746it [00:03, 212.33it/s][A
768it [00:03, 206.17it/s][A
793it [00:03, 215.66it/s][A
816it [00:03, 212.77it/s][A
838it [00:03, 197.80it/s][A
867it [00:03, 214.84it/s][A
890it [00:03, 211.92it/s][A
913it [00:04, 215.78it/s][A
935it [00:04, 216.66it/s][A
957it [00:04, 211.65it/s][A
981it [00:04, 208.10it/s][A
1005it [00:04, 216.12it/s][A
1027it [00:04, 201.46it/s][A
1048it [00:04, 203.04it/s][A
1078it [00:04, 224.75it/s][A
1102it [00:04, 207.51it/s][A
1124it [00:05, 207.61it/s][A
1146it [00:05, 184.27it/s][A
1175it [00:05, 202.32it/s][A
1197it [00:05, 202.23it/s][A
1219it [00:05, 201.93it/s][A
1247it [00:05, 216.38it/s][A
1272it [00:05, 217.61it/s][A
12

1844it [00:09, 154.11it/s][A
1862it [00:09, 136.45it/s][A
1878it [00:09, 139.66it/s][A
1897it [00:09, 148.93it/s][A
1916it [00:09, 157.61it/s][A
1933it [00:09, 146.00it/s][A
1954it [00:09, 158.03it/s][A
1975it [00:10, 170.67it/s][A
2009it [00:10, 197.42it/s][A
2036it [00:10, 206.83it/s][A
2062it [00:10, 218.78it/s][A
2090it [00:10, 229.92it/s][A
2115it [00:10, 220.64it/s][A
2138it [00:10, 210.56it/s][A
2164it [00:10, 220.48it/s][A
2193it [00:10, 237.19it/s][A
2218it [00:11, 234.40it/s][A
2242it [00:11, 233.19it/s][A
2266it [00:11, 227.67it/s][A
2293it [00:11, 236.89it/s][A
2323it [00:11, 252.26it/s][A
2349it [00:11, 253.66it/s][A
2375it [00:11, 253.81it/s][A
2401it [00:11, 227.85it/s][A
2425it [00:11, 209.18it/s][A
2453it [00:12, 225.93it/s][A
2477it [00:12, 228.94it/s][A
2507it [00:12, 246.12it/s][A
2534it [00:12, 252.78it/s][A
2563it [00:12, 261.62it/s][A
2590it [00:12, 252.58it/s][A
2616it [00:12, 246.03it/s][A
2645it [00:12, 253.47it/s][A
2671it [00

2688it [00:10, 345.63it/s][A
2725it [00:10, 351.79it/s][A
2763it [00:10, 356.50it/s][A
2799it [00:10, 347.99it/s][A
2837it [00:10, 356.36it/s][A
2873it [00:10, 328.87it/s][A
2910it [00:10, 338.52it/s][A
2945it [00:10, 330.71it/s][A
2979it [00:11, 324.16it/s][A
3012it [00:11, 302.36it/s][A
3047it [00:11, 314.49it/s][A
3079it [00:11, 308.83it/s][A
3111it [00:11, 303.61it/s][A
3148it [00:11, 313.48it/s][A
3184it [00:11, 325.62it/s][A
3219it [00:11, 327.23it/s][A
3253it [00:11, 330.60it/s][A
3287it [00:12, 323.80it/s][A
3320it [00:12, 317.59it/s][A
3352it [00:12, 316.74it/s][A
3384it [00:12, 316.65it/s][A
3388it [00:12, 274.32it/s][A
0it [00:00, ?it/s][A
642it [00:00, 6419.23it/s][A
1292it [00:00, 6441.12it/s][A
1925it [00:00, 6405.14it/s][A
2529it [00:00, 6290.49it/s][A
3151it [00:00, 6267.02it/s][A
3388it [00:00, 6271.81it/s][A
0it [00:00, ?it/s][A
40it [00:00, 393.67it/s][A
76it [00:00, 379.95it/s][A
107it [00:00, 355.07it/s][A
133it [00:00, 312.90it/s]

1541it [00:06, 225.73it/s][A
1565it [00:06, 224.59it/s][A
1590it [00:07, 230.54it/s][A
1616it [00:07, 238.65it/s][A
1644it [00:07, 249.12it/s][A
1670it [00:07, 248.45it/s][A
1696it [00:07, 238.97it/s][A
1721it [00:07, 218.36it/s][A
1744it [00:07, 217.59it/s][A
1773it [00:07, 230.49it/s][A
1801it [00:07, 242.75it/s][A
1826it [00:07, 242.81it/s][A
1851it [00:08, 235.14it/s][A
1877it [00:08, 240.43it/s][A
1903it [00:08, 245.14it/s][A
1934it [00:08, 261.51it/s][A
1962it [00:08, 264.80it/s][A
1989it [00:08, 249.69it/s][A
2016it [00:08, 254.79it/s][A
2052it [00:08, 278.58it/s][A
2081it [00:08, 271.16it/s][A
2109it [00:09, 264.36it/s][A
2136it [00:09, 265.54it/s][A
2163it [00:09, 260.25it/s][A
2190it [00:09, 258.58it/s][A
2217it [00:09, 253.71it/s][A
2248it [00:09, 264.95it/s][A
2276it [00:09, 260.73it/s][A
2303it [00:09, 255.06it/s][A
2329it [00:09, 255.12it/s][A
2360it [00:10, 267.27it/s][A
2387it [00:10, 240.14it/s][A
2412it [00:10, 241.45it/s][A
2447it [00

160it [00:00, 169.83it/s][A
177it [00:01, 148.92it/s][A
194it [00:01, 153.78it/s][A
218it [00:01, 171.56it/s][A
249it [00:01, 197.47it/s][A
288it [00:01, 230.95it/s][A
318it [00:01, 247.68it/s][A
349it [00:01, 262.82it/s][A
378it [00:01, 256.62it/s][A
411it [00:01, 273.33it/s][A
445it [00:01, 289.76it/s][A
476it [00:02, 282.50it/s][A
506it [00:02, 271.72it/s][A
540it [00:02, 287.82it/s][A
570it [00:02, 284.54it/s][A
601it [00:02, 290.48it/s][A
631it [00:02, 278.88it/s][A
661it [00:02, 281.13it/s][A
690it [00:02, 269.11it/s][A
718it [00:02, 258.77it/s][A
745it [00:03, 259.07it/s][A
782it [00:03, 280.25it/s][A
811it [00:03, 249.13it/s][A
838it [00:03, 253.74it/s][A
868it [00:03, 266.02it/s][A
897it [00:03, 272.16it/s][A
925it [00:03, 251.86it/s][A
956it [00:03, 265.33it/s][A
984it [00:03, 264.66it/s][A
1011it [00:04, 256.15it/s][A
1037it [00:04, 245.84it/s][A
1070it [00:04, 265.51it/s][A
1098it [00:04, 256.61it/s][A
1131it [00:04, 274.69it/s][A
1167it [0

2955it [00:08, 334.82it/s][A
2991it [00:08, 340.42it/s][A
3026it [00:08, 337.03it/s][A
3061it [00:08, 329.23it/s][A
3095it [00:08, 319.91it/s][A
3128it [00:08, 310.91it/s][A
3160it [00:08, 303.91it/s][A
3191it [00:08, 305.01it/s][A
3222it [00:09, 292.48it/s][A
3257it [00:09, 306.75it/s][A
3292it [00:09, 317.07it/s][A
3325it [00:09, 316.10it/s][A
3365it [00:09, 336.30it/s][A
3400it [00:09, 337.43it/s][A
3435it [00:09, 325.86it/s][A
3468it [00:09, 316.70it/s][A
3508it [00:09, 337.26it/s][A
3548it [00:09, 352.26it/s][A
3588it [00:10, 364.57it/s][A
3626it [00:10, 364.22it/s][A
3669it [00:10, 378.47it/s][A
3710it [00:10, 386.81it/s][A
3750it [00:10, 365.69it/s][A
3790it [00:10, 374.64it/s][A
3828it [00:10, 364.43it/s][A
3872it [00:10, 382.63it/s][A
3913it [00:10, 389.40it/s][A
3953it [00:11, 392.40it/s][A
3999it [00:11, 407.91it/s][A
4041it [00:11, 395.31it/s][A
4081it [00:11, 371.49it/s][A
4119it [00:11, 364.67it/s][A
4156it [00:11, 351.41it/s][A
4192it [00

3142it [00:12, 275.70it/s][A
3172it [00:12, 281.60it/s][A
3201it [00:12, 226.38it/s][A
3226it [00:12, 229.15it/s][A
3251it [00:12, 233.94it/s][A
3276it [00:12, 219.90it/s][A
3306it [00:12, 239.05it/s][A
3344it [00:13, 267.66it/s][A
3375it [00:13, 278.30it/s][A
3405it [00:13, 267.36it/s][A
3443it [00:13, 289.89it/s][A
3474it [00:13, 277.72it/s][A
3503it [00:13, 266.80it/s][A
3531it [00:13, 269.03it/s][A
3565it [00:13, 280.37it/s][A
3594it [00:13, 267.27it/s][A
3622it [00:14, 252.26it/s][A
3649it [00:14, 256.14it/s][A
3675it [00:14, 253.34it/s][A
3701it [00:14, 251.05it/s][A
3727it [00:14, 237.00it/s][A
3751it [00:14, 227.73it/s][A
3775it [00:14, 176.18it/s][A
3807it [00:14, 203.37it/s][A
3845it [00:15, 235.20it/s][A
3881it [00:15, 262.07it/s][A
3918it [00:15, 286.21it/s][A
3953it [00:15, 300.47it/s][A
3986it [00:15, 293.83it/s][A
4018it [00:15, 257.08it/s][A
4039it [00:15, 256.89it/s][A
0it [00:00, ?it/s][A
30it [00:00, 260.14it/s][A
55it [00:00, 251.26i

3916it [00:15, 264.50it/s][A
3944it [00:15, 261.37it/s][A
[A

Score for fold 6 was - 0.7740591783970123
Score for fold 0 was - 0.790634959548909
Score for fold 7 was - 0.8065337293169283
Score for fold 5 was - 0.7642120765832106
Score for fold 2 was - 0.8175341669089852
Score for fold 8 was - 0.820952380952381
Score for fold 9 was - 0.7873290538654758
Score for fold 3 was - 0.8108217514505465
Score for fold 1 was - 0.7939656376588909
Score for fold 4 was - 0.7953927600515095
Scores on the dev set
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    118    |     3     |    556    |    85     |
-------------------------------------------------------------
| disagree  |    14     |     3     |    130    |    15     |
-------------------------------------------------------------
|  discuss  |    58     |     5     |   1527    |    210    |
-------------------------------------------------------------
| 

## Generating features for the Fake News dataset for Stance Detection 

In [175]:
def generate_fn_features(dataset,name):
    h, b = [],[]

    for d in dataset:
        h.append(d[0]) #title
        b.append(d[1]) #text

    X_overlap = gen_or_load_feats(word_overlap_features, h, b, "features/overlap."+name+".npy")
    X_refuting = gen_or_load_feats(refuting_features, h, b, "features/refuting."+name+".npy")
    X_polarity = gen_or_load_feats(polarity_features, h, b, "features/polarity."+name+".npy")
    X_hand = gen_or_load_feats(hand_features, h, b, "features/hand."+name+".npy")

    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X

In [176]:
df_Orig = pd.read_csv('fake_real_dataset.csv')
df = pd.read_csv('fake_real_dataset.csv')

In [177]:
import math
def title_column(tuple1):
    #print(tuple1[2])
    if(type(tuple1[0]) == float or type(tuple1[0]) == int):
        if(math.isnan(tuple1[0])):
            tuple1[0] = ''
            
    if(pd.notna(tuple1[0])):
        if(tuple1[0].strip(' \t\n\r') == ''):
            return re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", tuple1[1])
        else:
            return re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", tuple1[0])
    else:
            return re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", tuple1[0])
        
        
def text_column(tuple1):
    #print(tuple1[2])
    if(type(tuple1[1]) == float or type(tuple1[1]) == int):
        if(math.isnan(tuple1[1])):
            tuple1[1] = ''
            
    if(pd.notna(tuple1[1])):
        if(tuple1[1].strip(' \t\n\r') == ''):
            return re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", tuple1[0])
        else:
            return re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", tuple1[1]) 
    else:
            return re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", tuple1[1])


df['title'] = df[['title', 'text','uuid']].apply(title_column, axis=1)
df['text'] = df[['title', 'text', 'uuid']].apply(text_column, axis=1)

In [178]:
fn_dataset = df[['title', 'text']].head(100)
fn_dataset.shape

(100, 2)

In [179]:
fn_dataset = fn_dataset.values
len(fn_dataset)

100

In [180]:
X_fn = generate_fn_features(fn_dataset, "FakeNewsStance")


0it [00:00, ?it/s][A
35it [00:00, 348.52it/s][A
46it [00:00, 200.27it/s][A
59it [00:00, 171.10it/s][A
74it [00:00, 162.34it/s][A
89it [00:00, 144.76it/s][A
100it [00:00, 179.11it/s][A
0it [00:00, ?it/s][A
100it [00:00, 1095.84it/s][A
0it [00:00, ?it/s][A
37it [00:00, 349.21it/s][A
49it [00:00, 215.01it/s][A
60it [00:00, 154.96it/s][A
71it [00:00, 130.99it/s][A
89it [00:00, 127.32it/s][A
100it [00:00, 162.59it/s][A
0it [00:00, ?it/s][A
14it [00:00, 135.92it/s][A
37it [00:00, 152.51it/s][A
47it [00:00, 116.16it/s][A
56it [00:00, 99.02it/s] [A
65it [00:00, 70.94it/s][A
73it [00:00, 48.79it/s][A
79it [00:01, 26.58it/s][A
84it [00:01, 25.84it/s][A
89it [00:13,  1.39it/s][A
100it [00:13,  7.56it/s][A

In [181]:
X_fn.shape

(100, 44)

In [182]:
X_fn

array([[6.00000000e+00, 4.00000000e+00, 4.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 7.59493671e-02],
       [1.00000000e+01, 1.00000000e+01, 7.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 5.35714286e-02],
       [9.00000000e+00, 9.00000000e+00, 7.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 6.08108108e-02],
       ...,
       [6.10000000e+01, 4.70000000e+01, 4.10000000e+01, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [6.10000000e+01, 4.70000000e+01, 4.10000000e+01, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00]])

In [183]:
fn_predicted = [LABELS[int(a)] for a in best_fold.predict(X_fn)]

In [184]:
fn_predicted

['discuss',
 'discuss',
 'discuss',
 'discuss',
 'unrelated',
 'discuss',
 'agree',
 'discuss',
 'discuss',
 'discuss',
 'agree',
 'discuss',
 'unrelated',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'disagree',
 'agree',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'agree',
 'discuss',
 'unrelated',
 'discuss',
 'agree',
 'agree',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'agree',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'unrelated',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'agree',
 'discuss',
 'discuss',
 'unrelated',
 'discuss',
 'discuss',
 'discuss',
 'discuss',
 'agree',
 'agree',
 'agree',
 'agree',
 'agree',
 'agree',
 'agree',
 'agree',
 'agree',
 'discuss',
 'agree',
 'agree',
 'agree',
 'agree',
 'agree',
 'agree',
 'agree',
 'agree',
 'agree',
 'agree',
 'agree',
 'agree',
 'agree',
 'agree',
 'discuss',
 'agree',
 'discuss',
 'agree',

In [186]:
fn_predicted = np.asarray(fn_predicted)

In [189]:
df['stance'] = np.nan
for i in range(100):
        df['stance'][i] = fn_predicted[i]
#df['stance'] = fn_predicted[:,0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [191]:
df.head(100)

Unnamed: 0,uuid,ord_in_thread,author,published,title,text,language,crawled,site_url,country,...,thread_title,spam_score,main_img_url,replies_count,participants_count,likes,comments,shares,type,stance
0,6a175f46bcd24d39b3e962ad0f29936721db70db,0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,Muslims BUSTED They Stole Millions In Gov t...,Print They should pay all the back all the mon...,english,2016-10-27T01:49:27.168+03:00,100percentfedup.com,US,...,Muslims BUSTED: They Stole Millions In Gov‚Äôt...,0.000,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias,discuss
1,2bdc29d12605ef9cf3f09f9875040a7113be5d5b,0,reasoning with facts,2016-10-29T08:47:11.259+03:00,Re Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,english,2016-10-29T08:47:11.259+03:00,100percentfedup.com,US,...,Re: Why Did Attorney General Loretta Lynch Ple...,0.000,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias,discuss
2,c70e149fdd53de5e61c29281100b9de0ed268bc3,0,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,BREAKING Weiner Cooperating With FBI On Hilla...,Red State Fox News Sunday reported this mo...,english,2016-10-31T01:41:49.479+02:00,100percentfedup.com,US,...,BREAKING: Weiner Cooperating With FBI On Hilla...,0.000,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias,discuss
3,7cf7c15731ac2a116dd7f629bd57ea468ed70284,0,Fed Up,2016-11-01T05:22:00.000+02:00,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,english,2016-11-01T15:46:26.304+02:00,100percentfedup.com,US,...,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,0.068,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias,discuss
4,0206b54719c7e241ffe0ad4315b808290dbe6c0f,0,Fed Up,2016-11-01T21:56:00.000+02:00,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,english,2016-11-01T23:59:42.266+02:00,100percentfedup.com,US,...,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,0.865,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias,unrelated
5,8f30f5ea14c9d5914a9fe4f55ab2581772af4c31,0,Barracuda Brigade,2016-11-02T16:31:28.550+02:00,Hillary Goes Absolutely Berserk On Protester A...,Print Hillary goes absolutely berserk! She exp...,english,2016-11-02T16:31:28.550+02:00,100percentfedup.com,US,...,Hillary Goes Absolutely Berserk On Protester A...,0.000,http://bb4sp.com/wp-content/uploads/2016/11/Fu...,0,1,0,0,0,bias,discuss
6,d3cc0fe38f41a59f7c48f8c3528ca5f74193148f,0,Fed Up,2016-11-04T19:40:00.000+02:00,BREAKING! NYPD Ready To Make Arrests In Weiner...,BREAKING! NYPD Ready To Make Arrests In Weiner...,english,2016-11-05T02:13:46.065+02:00,100percentfedup.com,US,...,BREAKING! NYPD Ready To Make Arrests In Weiner...,0.701,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias,agree
7,b4bbf8b5c19e8864f5257832a58b81ef4ed2d4e4,0,Fed Up,2016-11-05T01:19:00.000+02:00,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...,BREAKING! NYPD Ready To Make Arrests In Weiner...,english,2016-11-05T05:59:07.458+02:00,100percentfedup.com,US,...,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...,0.188,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias,discuss
8,a19aabaa5a61eb8bc22fadaaa003e5fbba5c4bf6,0,Fed Up,2016-11-06T23:54:00.000+02:00,BREAKING CLINTON CLEARED Was This A Coordin...,Limbaugh said that the revelations in the Wi...,english,2016-11-07T10:20:06.409+02:00,100percentfedup.com,US,...,BREAKING: CLINTON CLEARED...Was This A Coordin...,0.144,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias,discuss
9,f54d8e13010d0a79893995ee65360ad4b38b5a35,0,Fed Up,2016-11-07T02:43:00.000+02:00,EVIL HILLARY SUPPORTERS Yell F ck Trump Bu...,Email These people are sick and evil They w...,english,2016-11-07T10:20:27.252+02:00,100percentfedup.com,US,...,"EVIL HILLARY SUPPORTERS Yell ""F*ck Trump""‚Ä¶Bu...",0.995,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias,discuss


In [192]:
df.to_csv('fake_real_dataset_spam_category_clickbait_toxicity_politafln_stance.csv')

In [265]:
#     # Load the Fake News Original dataset
# #    competition_dataset = DataSet("competition_test")
# #    X_competition, y_competition = generate_features(competition_dataset.stances, competition_dataset, "competition")
    
# #    dataset = ["title","text"]
    
#     fn_dataset = df[['title', 'text']]
    
#     fn_dataset = fn_dataset.values
    
#     X_fn = generate_fn_features(fn_dataset, "fakenews")
    
#     # Predict on Fake News Original dataset
#     fn_predicted = [LABELS[int(a)] for a in best_fold.predict(X_fn)]