In [36]:
import numpy as np
import pandas as pd
from nltk import word_tokenize, sent_tokenize
from string import punctuation
from nltk.corpus import stopwords

In [74]:
# Load the full labelled dataset; the cells below slice it by row position.
df = pd.read_csv(filepath_or_buffer='../data/fake-news/train.csv')

In [113]:
# Training split: the first 60% of rows in file order (no shuffling),
# with every row that has a missing value removed.
train = df[:int(0.6 * len(df))].dropna()
# Drop articles whose text is blank. Comparing the stripped text catches every
# whitespace-only value, not only the exact strings ' ', '  ' and '\n'.
train = train[train.text.str.strip() != ''].copy()
train.count()

id        10942
title     10942
author    10942
text      10942
label     10942
dtype: int64

In [114]:
# Validation split: rows from the 60% mark up to the 80% mark in file order,
# with every row that has a missing value removed.
valid = df[int(0.6 * len(df)):int(0.8 * len(df))].dropna()
# Drop articles whose text is blank. Comparing the stripped text catches every
# whitespace-only value, not only the exact strings ' ', '  ' and '\n'.
valid = valid[valid.text.str.strip() != ''].copy()
valid.count()

id        3617
title     3617
author    3617
text      3617
label     3617
dtype: int64

In [115]:
# Test split: the last 20% of rows in file order,
# with every row that has a missing value removed.
test = df[int(0.8 * len(df)):].dropna()
# Drop articles whose text is blank. Comparing the stripped text catches every
# whitespace-only value, not only the exact strings ' ', '  ' and '\n'.
test = test[test.text.str.strip() != ''].copy()
test.count()

id        3652
title     3652
author    3652
text      3652
label     3652
dtype: int64

In [100]:
# Per-column count of the training rows whose label is 0.
print('real news count')
train.loc[train.label == 0].count()

real news count


id        6220
title     6220
author    6220
text      6220
label     6220
dtype: int64

In [101]:
# English stop-word list from NLTK. It is not applied anywhere in this cell:
# preprocessing() keeps every token.
stopWords = stopwords.words('english')

def preprocessing(raw_text):
    """Lower-case, strip punctuation from, and tokenize each document.

    Args:
        raw_text: iterable of strings, one string per document.

    Returns:
        A list with one entry per document; each entry is the list of tokens
        returned by ``word_tokenize`` for the cleaned document.
    """
    # Characters deleted from every document: everything in
    # string.punctuation plus the two curly single quotes.
    unwanted = str.maketrans('', '', punctuation + '’‘')

    tokenized = []
    for document in raw_text:
        # Lower-case one character at a time, then delete the unwanted
        # characters in a single pass.
        lowered = ''.join(ch.lower() for ch in document)
        cleaned = lowered.translate(unwanted)
        tokenized.append(list(word_tokenize(cleaned)))
    return tokenized

In [102]:
def getVocab(text, vocab):
    """Add every token found in ``text`` to ``vocab``.

    Args:
        text: iterable of token sequences (one sequence per document).
        vocab: set that collects the tokens; it is modified in place.

    Returns:
        The same ``vocab`` object that was passed in.
    """
    for tokens in text:
        vocab.update(tokens)
    return vocab

In [103]:
def wordVec(text, vocab, start_index=0):
    """Replace every token by its integer position in ``vocab``.

    Args:
        text: iterable of token sequences (one sequence per document).
        vocab: iterable of tokens; a token's index is ``start_index`` plus its
            position in the iteration order. If a token occurs more than once
            in ``vocab``, its last position wins.
        start_index: index given to the first vocabulary entry. The default of
            0 keeps the original numbering. Pass 1 to keep index 0 free, e.g.
            when 0 is also used as the filler value for padded sequences.

    Returns:
        A list with one list of integer indices per document.

    Raises:
        KeyError: if a token in ``text`` does not occur in ``vocab``.
    """
    word_dict = {word: ind for ind, word in enumerate(vocab, start_index)}
    return [[word_dict[word] for word in txt] for txt in text]

In [104]:
def padding(seq, maxlen=1200):
    """Left-pad or truncate every sequence in ``seq`` to exactly ``maxlen`` items.

    Sequences shorter than ``maxlen`` get zero filler in front of their own
    items; sequences of length ``maxlen`` or more keep only their first
    ``maxlen`` items.

    The filler depends on the sequence's first item: the scalar 0 when that
    item is an integer (Python ``int`` or numpy integer), otherwise a list of
    zeros with the same length as the item. An empty sequence has no first
    item, so it borrows the first item of the first non-empty sequence in
    ``seq``, or uses the scalar 0 when every sequence is empty.

    Args:
        seq: iterable of sequences of integers or of equal-length vectors.
        maxlen: length of every returned sequence.

    Returns:
        A list of lists, one per input sequence, each with ``maxlen`` items.
    """
    def _zero_like(element):
        # Build a fresh filler for each pad position so that list fillers are
        # never shared between positions.
        if isinstance(element, (int, np.integer)):
            return 0
        return [0] * len(element)

    # Materialize once: the input is scanned for a fallback item and then
    # iterated again below.
    sequences = list(seq)
    fallback = next((item[0] for item in sequences if len(item) > 0), 0)

    final = []
    for lis in sequences:
        missing = maxlen - len(lis)
        if missing > 0:
            # padding
            template = lis[0] if len(lis) > 0 else fallback
            filler = [_zero_like(template) for _ in range(missing)]
            final.append(filler + list(lis))
        else:
            # truncating
            final.append(list(lis[:maxlen]))
    return final

In [116]:
# Pull the raw article text of each split out as a numpy array.
train_text, test_text, valid_text = (
    np.array(split['text']) for split in (train, test, valid)
)

In [117]:
# Tokenize every split with the same cleaning routine (train, then test, then valid).
train_tokens, test_tokens, valid_tokens = map(
    preprocessing, (train_text, test_text, valid_text)
)

In [118]:
# Number of tokens per training document, summarised to help choose a sequence length.
count = [len(tokens) for tokens in train_tokens]
pd.DataFrame(count).describe()

Unnamed: 0,0
count,10942.0
mean,825.937671
std,912.151624
min,1.0
25%,313.0
50%,628.0
75%,1117.0
max,21024.0


In [119]:
## get vocab

# The vocabulary is the union of the tokens of all three splits, stored as a
# sorted list so that every token has a stable position.
vocab = set()
for split_tokens in (train_tokens, test_tokens, valid_tokens):
    vocab = getVocab(split_tokens, vocab)
vocab = sorted(vocab)
len(vocab)

193874

In [120]:
# words

## convert to vectors

# Map the tokens of each split to their vocabulary indices.
word_seq_train, word_seq_test, word_seq_valid = (
    wordVec(tokens, vocab)
    for tokens in (train_tokens, test_tokens, valid_tokens)
)
print(len(word_seq_valid))

## padding

# Bring every document to the default padded length and store as float32 arrays.
word_seq_train, word_seq_test, word_seq_valid = (
    np.array(padding(indices), dtype='float32')
    for indices in (word_seq_train, word_seq_test, word_seq_valid)
)
print(word_seq_valid.shape)

## saving as csv

for matrix, csv_path in (
    (word_seq_train, '../data/fake-news/seq_data/word_seq_train.csv'),
    (word_seq_test, '../data/fake-news//seq_data/word_seq_test.csv'),
    (word_seq_valid, '../data/fake-news//seq_data/word_seq_valid.csv'),
):
    pd.DataFrame(matrix).to_csv(csv_path, index=False)

3617
(3617, 1200)


In [121]:
# Extract the label column of each split as a numpy array.
train_label, test_label, valid_label = (
    np.array(split.label) for split in (train, test, valid)
)

# Write each label array next to the corresponding sequence file.
for labels, csv_path in (
    (train_label, '../data/fake-news/seq_data/train_label.csv'),
    (test_label, '../data/fake-news//seq_data/test_label.csv'),
    (valid_label, '../data/fake-news//seq_data/valid_label.csv'),
):
    pd.DataFrame(labels).to_csv(csv_path, index=False)