In [9]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize, sent_tokenize
from string import punctuation
import torch.optim as optim
from nltk.corpus import stopwords
import string

In [34]:
# Read the two fake-news CSV files: the training file first, then the test file.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/fake-news/train.csv', '../data/fake-news/test.csv')
)

In [35]:
# Hold out the last 20% of the rows (by position) as the validation split and
# drop every row that has a missing value in any column.
# This has to run while `train` still holds the full file, i.e. before `train`
# is narrowed to its first 80%.
holdout_start = int(0.8 * len(train))
valid = train.iloc[holdout_start:].dropna()
valid.count()

id        3668
title     3668
author    3668
text      3668
label     3668
dtype: int64

In [36]:
# Keep the first 80% of the rows (by position) as the training split and drop
# every row that has a missing value in any column.
# NOTE: this rebinds `train`, so running the cell a second time shrinks the
# training split again; reload the CSV before re-running.
train = train.iloc[:int(0.8 * len(train))].dropna()
train.count()

id        14617
title     14617
author    14617
text      14617
label     14617
dtype: int64

In [37]:
# Drop test rows that have a missing value in any column, then show the
# per-column non-null counts.
test = test.dropna()
test.count()

id        4575
title     4575
author    4575
text      4575
dtype: int64

In [38]:
# Per-column non-null counts of the training rows whose label is 0
# (reported here as real news).
print('real news count')
train.loc[train['label'] == 0].count()

real news count


id        8319
title     8319
author    8319
text      8319
label     8319
dtype: int64

In [40]:
stopWords = stopwords.words('english')


def preprocessing(raw_text):
    """Lowercase a text, strip punctuation and drop English stop words.

    Args:
        raw_text: The text to clean. It is walked item by item (character by
            character for a string) and each item is lowercased on its own.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    # Characters removed outright: ASCII punctuation plus the typographic
    # apostrophe, which ``string.punctuation`` does not contain.
    dropped_chars = set(punctuation) | {'’'}
    # One pass replaces the two earlier join/re-split passes: lowercase each
    # item, then keep only the characters that are not in the drop set.
    cleaned = ''.join(
        lowered
        for item in raw_text
        for lowered in item.lower()
        if lowered not in dropped_chars
    )
    # Set membership keeps the stop-word test constant-time per token. The set
    # is rebuilt on every call so later edits to ``stopWords`` are honoured.
    stop_set = set(stopWords)
    return ' '.join(word for word in word_tokenize(cleaned) if word not in stop_set)

In [46]:
def getTokens(text):
    """Split texts into sentences and return the lowercased tokens of each one.

    Args:
        text: Iterable of strings.

    Returns:
        A flat list with one token list per *sentence* found by
        ``sent_tokenize``, in input order.

    NOTE: because an entry is appended per sentence, the result is not
    guaranteed to have one entry per input text: a text that yields no
    sentences contributes nothing, and a text with several sentences
    contributes several entries.
    """
    return [
        [token.lower() for token in nltk.word_tokenize(sentence)]
        for document in text
        for sentence in sent_tokenize(document)
    ]

In [26]:
def getVocab(text, vocab):
    """Add every token found in ``text`` to ``vocab``.

    Args:
        text: Iterable of token sequences.
        vocab: Set that is extended in place.

    Returns:
        The same ``vocab`` object, now also holding the tokens of ``text``.
    """
    for token_seq in text:
        vocab.update(token_seq)
    return vocab

In [27]:
def wordVec(text, vocab, start_index=0):
    """Replace every token by its integer index in ``vocab``.

    Args:
        text: Iterable of token sequences.
        vocab: Iterable of tokens. A token's index is ``start_index`` plus its
            position in iteration order, so pass an ordered collection if the
            numbering has to be reproducible.
        start_index: Index given to the first vocabulary entry. The default 0
            keeps the existing numbering; pass 1 to leave index 0 free for the
            zero filler that ``padding`` inserts, so padding cannot be mistaken
            for the first vocabulary word.

    Returns:
        A list with one list of integer indices per token sequence.

    Raises:
        KeyError: If a token of ``text`` is not present in ``vocab``.
    """
    word_dict = {word: index for index, word in enumerate(vocab, start_index)}
    return [[word_dict[word] for word in tokens] for tokens in text]

In [59]:
def _zero_like(sample):
    """Return the zero filler matching ``sample``: a zero list for sized items, else 0."""
    if hasattr(sample, '__len__'):
        return [0] * len(sample)
    return 0


def padding(seq, maxlen=80):
    """Left-pad with zeros, or truncate, every sequence to exactly ``maxlen`` items.

    Args:
        seq: Iterable of sequences. Items may be scalars (padded with ``0``)
            or sized vectors (padded with zero lists of the same length).
        maxlen: Target length of every returned sequence.

    Returns:
        A list of new lists, each ``maxlen`` long. Shorter sequences get the
        filler prepended; longer ones keep only their first ``maxlen`` items.
        An empty sequence becomes pure filler instead of raising IndexError.
    """
    # Materialise once so one-shot iterables survive the look-ahead below.
    sequences = list(seq)
    # An empty sequence has no first item to copy the filler shape from, so
    # borrow it from the first non-empty sequence (scalar 0 if there is none).
    fallback = next((item[0] for item in sequences if len(item) > 0), 0)

    final = []
    for lis in sequences:
        missing = maxlen - len(lis)
        if missing > 0:
            sample = lis[0] if len(lis) > 0 else fallback
            # Build a fresh filler per slot so padded positions never share a list.
            filler = [_zero_like(sample) for _ in range(missing)]
            final.append(filler + list(lis))
        else:
            # Truncating: keep the leading ``maxlen`` items.
            final.append([lis[i] for i in range(maxlen)])
    return final

In [41]:
# Clean the title column of every split with `preprocessing` and store the
# result back as str, in the order train, test, valid.
for split_frame in (train, test, valid):
    split_frame['title'] = split_frame['title'].apply(preprocessing).astype(str)

In [52]:
# Copy the cleaned title column of each split into its own numpy array.
train_title, test_title, valid_title = (
    np.array(split_frame['title']) for split_frame in (train, test, valid)
)

In [58]:
# Summary statistics of the length of every cleaned training title.
# NOTE: the titles are strings, so len() counts characters here, not tokens.
count = [len(title) for title in train_title]
pd.DataFrame(count).describe()

Unnamed: 0,0
count,14617.0
mean,60.230622
std,19.146751
min,0.0
25%,49.0
50%,61.0
75%,72.0
max,361.0


In [54]:
# Tokenise the cleaned titles of each split with `getTokens`.
train_tokens, test_tokens, valid_tokens = map(
    getTokens, (train_title, test_title, valid_title)
)

In [55]:
## get vocab

# Collect the distinct tokens of all three splits, then turn them into a
# sorted list so that every token has a stable position.
vocab = set()
for token_lists in (train_tokens, test_tokens, valid_tokens):
    vocab = getVocab(token_lists, vocab)
vocab = sorted(vocab)
len(vocab)

25248

In [60]:
# words

## convert every token list to vocabulary indices, pad/truncate it with the
## default length of `padding`, and pack each split into a float32 array
## NOTE: the indices are stored as float32, so the CSV files hold floats.

word_seq_train, word_seq_test, word_seq_valid = (
    np.array(padding(wordVec(token_lists, vocab)), dtype='float32')
    for token_lists in (train_tokens, test_tokens, valid_tokens)
)

## saving as csv

for seq_matrix, csv_path in (
    (word_seq_train, '../data/fake-news/seq_data/word_seq_train.csv'),
    (word_seq_test, '../data/fake-news//seq_data/word_seq_test.csv'),
    (word_seq_valid, '../data/fake-news//seq_data/word_seq_valid.csv'),
):
    pd.DataFrame(seq_matrix).to_csv(csv_path, index=False)