In [22]:
from nela_features.nela_features import NELAFeatureExtractor
import numpy as np
import pandas as pd
import re
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split 

In [23]:
fake_data=pd.read_csv("dataset/Fake.csv")
real_data=pd.read_csv("dataset/True.csv")
nela = NELAFeatureExtractor()
glove_embeddings = {}
print("Loading glove embeddings")
with open('glove.6B.100d.txt', 'r',encoding='utf8') as f:
    for line in tqdm(f):
        values = line.split(' ')
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        glove_embeddings[word] = vector
test_embeddings = [
    # Socioeconomic status
    {'name': 'rich/poor', 'dir1': ["rich", "wealthy", "affluent"], "dir2": ["poor", "impoverished", "destitute"]},
    
    # Age bias
    {'name': 'young/old', 'dir1': ["young", "youthful", "vibrant"], "dir2": ["old", "elderly", "aged"]},
    
    # Gender stereotypes (roles)
    {'name': 'male/female stereotypes', 'dir1': ["leader", "strong", "assertive"], "dir2": ["nurturing", "caring", "supportive"]},
    
    # Rural vs. Urban bias
    {'name': 'rural/urban', 'dir1': ["urban", "city"], "dir2": ["rural", "countryside"]},
    
    # Employment bias (white-collar vs. blue-collar)
    {'name': 'white-collar/blue-collar', 'dir1': ["professional", "educated", "executive"], "dir2": ["manual", "laborer", "working-class"]},
    
    # Intelligence perception
    {'name': 'smart/dumb', 'dir1': ["smart", "intelligent"], 'dir2': ["dumb", "stupid"]},
]


Loading glove embeddings


0it [00:00, ?it/s]

In [24]:
def find_embedding_features(inp, glove_embeddings):
    # Find the average embedding of the sentence
    words = inp.split()
    embedding = np.zeros(len(glove_embeddings['the']))
    
    for word in words:
        if word.lower() in glove_embeddings:
            embedding += glove_embeddings[word.lower()]
    embedding /= len(words)
    
    # Now find all cosine similarities to the difference between dir1 and dir2
    embedding_features = []
    embedding_names = []
    for test_embedding in test_embeddings:
        net_dir = np.zeros(len(glove_embeddings['the']))
        for word in test_embedding['dir1']:
            net_dir += glove_embeddings[word]
        for word in test_embedding['dir2']:
            net_dir -= glove_embeddings[word]
        net_dir /= len(test_embedding['dir1']) + len(test_embedding['dir2'])
        
        # Find the cosine similarity
        cos_sim = np.dot(embedding, net_dir) / (np.linalg.norm(embedding) * np.linalg.norm(net_dir))

        embedding_features.append(cos_sim)
        embedding_names.append(test_embedding['name'])
        
    return embedding_features, embedding_names

In [25]:
def filter_short(df):
    df["text"] = df.text.astype(str)
    df["WordsCount"]=df["text"].apply(lambda x: len(re.sub('[^a-zA-Z]', ' ', x)))
    df=df[(df["WordsCount"]>= 5)]
    if 'id' in df.columns:
        df = df[df['id'].str.isnumeric()]
    df = df[df['text'].str.isnumeric() == False]
    df = df[df['text'] != None]
    df = df[df['text'] != ' ']
    df = df.drop(["WordsCount"], axis=1)
    return df


fake_data["label"]="fake"
fake_data = filter_short(fake_data)
real_data["label"]="real"
real_data = filter_short(real_data)

real_data = real_data.sample(1000)
fake_data = fake_data.sample(1000)


final_data= pd.concat([fake_data,real_data])

final_data = final_data.drop(["subject","date"], axis=1)
final_data=final_data[["text","label"]]
final_data['label'] = final_data['label'].map({'real':1, 'fake':0})

In [26]:
final_data = final_data.values

In [27]:
def just_vector(text):
    feature_vector, feature_names = nela.extract_all(text)
    return feature_vector
def embedding_vector(text):
    embedding_features, embedding_names = find_embedding_features(text, glove_embeddings)
    return embedding_features

In [28]:
final_data = [[just_vector(x) + embedding_vector(x),y] for x,y in tqdm(final_data)]

  0%|          | 0/2000 [00:00<?, ?it/s]

  cos_sim = np.dot(embedding, net_dir) / (np.linalg.norm(embedding) * np.linalg.norm(net_dir))


In [29]:
data = [x[0]+[x[1]] for x in final_data]


In [34]:
print(np.array(data).shape)

(2000, 94)


In [35]:
feature_vector, feature_names = nela.extract_all("hello")
embedding_features, embedding_names = find_embedding_features("hello", glove_embeddings)
df = pd.DataFrame(data, columns=feature_names+embedding_names+['label'])

In [36]:
X_train, X_test, Y_train, Y_test = train_test_split([x[0] for x in final_data], [x[1] for x in final_data], stratify = [x[1] for x in final_data])

In [37]:
train = [x[0]+[x[1]] for x in list(zip(X_train,Y_train))]

In [38]:
test = [x[0]+[x[1]] for x in list(zip(X_test,Y_test))]

In [40]:
train_df = pd.DataFrame(train, columns=feature_names+embedding_names+['label'])
test_df = pd.DataFrame(test, columns=feature_names+embedding_names+['label'])

In [41]:
train_df.to_csv("dataset/fakenewstrain.csv")
test_df.to_csv("dataset/fakenewstest.csv")