# Fasttext exploration

In [1]:
import os
os.chdir("/Users/antonioloison/Projects/fakenewsdetec")
os.getcwd()

'/Users/antonioloison/Projects/fakenewsdetec'

In [2]:
import pandas as pd
import plotly.express as px
import math
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from typing import Dict, List
import pprint

In [3]:
train_news = pd.read_csv("data/train_berkeley_1.csv")
val_news = pd.read_csv("data/eval_berkeley_1.csv")
test_news = pd.read_csv("data/test_berkeley_1.csv")
train_news = train_news[["title", "text", "label"]]
train_news["processed_text"] = train_news["text"]
train_news.shape

(4138, 4)

# Preprocess

## Clean Reuters bias

## Clean text

In [4]:
# NLP Preprocessing
from gensim.utils import simple_preprocess

unable to import 'smart_open.gcs', disabling that module


In [5]:
train_news["processed_text"] = train_news["text"].apply(lambda x: ' '.join(simple_preprocess(x)))
test_news["processed_text"] = test_news["text"].apply(lambda x: ' '.join(simple_preprocess(x)))

In [6]:
train_news["processed_label"] = train_news["label"].apply(lambda x: '__label__' + str(x))
test_news["processed_label"] = test_news["label"].apply(lambda x: '__label__' + str(x))


In [7]:
train_news.head()

Unnamed: 0,title,text,label,processed_text,processed_label
0,Memorial Day provides respite from VA controve...,Memorial Day is a time to remember those who g...,0,memorial day is time to remember those who gav...,__label__0
1,"Marco Rubio, announcing 2016 campaign, focuses...",It’s not that Americans won’t elect wealthy pr...,0,it not that americans won elect wealthy presid...,__label__0
2,Prepare Yourself For The Higher Energies,Leave a reply \nDylan Harper – Everything and ...,1,leave reply dylan harper everything and everyo...,__label__1
3,Place your bets now. How much does someone’s w...,Dan Kahan and his team at the Yale Law School’...,0,dan kahan and his team at the yale law school ...,__label__0
4,Gay marriage ruling leaves debate about religi...,The Supreme Court made a number of important d...,0,the supreme court made number of important dec...,__label__0


In [8]:
# Saving the CSV file as a text file to train/test the classifier
train_news[['processed_label', 'processed_text']].to_csv('data/train.txt', 
                                          index = False, 
                                          sep = ' ',
                                          header = None, 
                                          quoting = csv.QUOTE_NONE, 
                                          quotechar = "", 
                                          escapechar = " ")

test_news[['processed_label', 'processed_text']].to_csv('data/test.txt', 
                                     index = False, 
                                     sep = ' ',
                                     header = None, 
                                     quoting = csv.QUOTE_NONE, 
                                     quotechar = "", 
                                     escapechar = " ")

# Train model

In [9]:
import fasttext

# Skipgram model :
model = fasttext.train_supervised('data/train.txt')

# Test model

In [10]:
model.test('data/test.txt')

(887, 0.7316798196166855, 0.7316798196166855)

In [11]:
print(test_news.iloc[0,-2])
#print(list(test_df.iloc[:3,-1]))
preds = model.predict(list(test_news.iloc[:3,-1]))[0]
[int(pred[0].split("__")[2]) for pred in preds]

top five clinton donors are jewish how anti semitic is this fact published october source moon of alabama top five clinton donors are jewish campaign tally shows something is wrong with the above statement isn it anti semitic did trump say that readers of that statement may assume somewhat reasonably that there is club of rich jewish people controlling the clinton campaign and maybe clinton herself that sounds like it was taken from the fake protocols of the elders of zion it clearly must be anti semitic it is also true facts have no bias they can be anti semitic or can they but while facts as such can not have racial religious bias openly stating them surely can thus the above statement is anti semitic the fact itself isn bad reporting it publicly is bad bad bad who but an alt right rag would report such at all and for what purpose if not for spreading anti semitism well quot licet jovi jewish papers are of course allowed to report such fact that isn anti semitic it is solely to brag 

[1, 1, 1]

In [12]:
def predict(model, test_data: pd.DataFrame):
    texts = list(test_data["processed_text"])
    print(len(texts))
    #print(texts[:10])
    predictions = model.predict(texts)[0]
    return [int(pred[0].split("__")[2]) for pred in predictions]

In [13]:
preds = predict(model, test_news)

887


In [14]:
labels = list(test_news["label"])

In [20]:
train_news["label"].value_counts()

0    2085
1    2053
Name: label, dtype: int64

In [15]:
from fakenewsdetec.model.fasttext_classifier import FasttextClassifier

In [16]:
clf = FasttextClassifier({"saved_model_path": "",
                          "train": "True"},
                         train_news,
                         val_news,
                         test_news)

In [17]:
clf.train()

In [18]:
clf.compute_metrics()

{'train_metrics': {'accuracy': 0.7535041082648622,
  'recall': 0.5776911836337068,
  'precision': 0.8857356235997013,
  'f1-score': 0.6992924528301887,
  'support': [2085, 2053]},
 'validation_metrics': {'accuracy': 0.7440811724915445,
  'recall': 0.5570469798657718,
  'precision': 0.89568345323741,
  'f1-score': 0.6868965517241379,
  'support': [440, 447]},
 'test_metrics': {'accuracy': 0.7373167981961668,
  'recall': 0.5375586854460094,
  'precision': 0.8641509433962264,
  'f1-score': 0.662807525325615,
  'support': [461, 426]}}

**Difficult dataset!**

# Gensim Fasttext model

In [46]:
os.path.exists("/Volumes/Elements SE/crawl-300d-2M-subword.bin")

False

In [47]:
from gensim.models.wrappers import FastText
from gensim.models.fasttext import load_facebook_model

In [48]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format("/Users/antonioloison/Documents/wiki-news-300d-1M-subword.vec")

In [49]:
print(model.most_similar('desk'))

[('desks', 0.8139737844467163), ('desk-', 0.8030417561531067), ('desk.', 0.778192400932312), ('front-desk', 0.7296270132064819), ('ref-desk', 0.7272905111312866), ('deskside', 0.7197455167770386), ('help-desk', 0.715452253818512), ('writing-desk', 0.7056628465652466), ('refdesk', 0.6872211694717407), ('Desk', 0.6861226558685303)]


In [51]:
words = []
for word in model.vocab:
    words.append(word)
print(len(words))

999994


In [53]:
print("Vector components of a word: {}".format(
    len(model[words[0]])
))

Vector components of a word: 300


In [54]:
import numpy as np
 
def sent_vectorizer(sent, model):
    sent_vec =[]
    numw = 0
    for w in sent:
        try:
            if numw == 0:
                sent_vec = model[w]
            else:
                sent_vec = np.add(sent_vec, model[w])
            numw+=1
        except:
            pass
    
    return np.asarray(sent_vec) / numw

In [55]:
V=[]
for sentence in list():
    V.append(sent_vectorizer(sentence, model)) 

NameError: name 'sentences' is not defined