In [1]:
import numpy as np
import pandas as pd
from nltk import word_tokenize, sent_tokenize
from string import punctuation
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import nltk
# Fetch the NLTK resources this notebook relies on: the 'stopwords' corpus
# (read through stopwords.words(...) further down) and 'punkt' (presumably the
# tokenizer models behind word_tokenize -- confirm against the NLTK docs).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Colab-specific: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd drive/MyDrive

/content/drive/MyDrive


In [4]:
# Column names for the raw TSV files; they are applied positionally because the
# files are read with header=None.
columns = [
    'id',
    'label',
    'text',
    'subject',
    'speaker',
    'job title',
    'state info',
    'party',
    'barely true',
    'false',
    'half true',
    'mostly true',
    'pants on fire',
    'context',
]

# Signed score per textual label: negative values sit on the "false" side,
# positive values on the "true" side; no label maps to 0.
label_map = {
    'pants-fire': -3,
    'false': -2,
    'barely-true': -1,
    'half-true': 1,
    'mostly-true': 2,
    'true': 3,
}

In [5]:
# Load the training split and encode its labels as signed scores.
train = pd.read_csv('../data/liar_dataset/raw_data/train.tsv', sep='\t', header=None, names=columns)
train['label'] = train['label'].map(label_map)

# A row is unusable when it has no subject, no speaker, or when its text is
# exactly ' ', '  ' or '\n'.
train_unusable = (
    train.subject.isna()
    | train.speaker.isna()
    | train.text.isin([' ', '  ', '\n'])
)

# reset_index() is called without drop=True, so the previous row labels are
# kept as an extra 'index' column.
train = train.drop(index=train.index[train_unusable], columns=['context']).reset_index()
train.count()

index            10238
id               10238
label            10238
text             10238
subject          10238
speaker          10238
job title         7343
state info        8032
party            10238
barely true      10238
false            10238
half true        10238
mostly true      10238
pants on fire    10238
dtype: int64

In [6]:
# Load the validation split and encode its labels as signed scores.
valid = pd.read_csv('../data/liar_dataset/raw_data/valid.tsv', sep='\t', header=None, names=columns)
print(len(valid))
valid['label'] = valid['label'].map(label_map)

# A row is unusable when it has no subject, no speaker, or when its text is
# exactly ' ', '  ' or '\n'.
valid_unusable = (
    valid.subject.isna()
    | valid.speaker.isna()
    | valid.text.isin([' ', '  ', '\n'])
)

# The row index is left as-is here (no reset_index).
valid = valid.drop(index=valid.index[valid_unusable], columns=['context'])
valid.count()

1284


id               1284
label            1284
text             1284
subject          1284
speaker          1284
job title         939
state info       1005
party            1284
barely true      1284
false            1284
half true        1284
mostly true      1284
pants on fire    1284
dtype: int64

In [7]:
# Load the test split and encode its labels as signed scores.
test = pd.read_csv('../data/liar_dataset/raw_data/test.tsv', sep='\t', header=None, names=columns)
print(len(test))
test['label'] = test['label'].map(label_map)

# A row is unusable when it has no subject, no speaker, or when its text is
# exactly ' ', '  ' or '\n'.
test_unusable = (
    test.subject.isna()
    | test.speaker.isna()
    | test.text.isin([' ', '  ', '\n'])
)

# The row index is left as-is here (no reset_index).
test = test.drop(index=test.index[test_unusable], columns=['context'])
test.count()

1267


id               1267
label            1267
text             1267
subject          1267
speaker          1267
job title         942
state info       1005
party            1267
barely true      1267
false            1267
half true        1267
mostly true      1267
pants on fire    1267
dtype: int64

In [8]:
# Rows whose label score is positive (half-true, mostly-true, true) are counted
# as real news.
print('real news count')
is_real = train['label'] > 0
train[is_real].count()

real news count


index            5752
id               5752
label            5752
text             5752
subject          5752
speaker          5752
job title        4264
state info       4663
party            5752
barely true      5752
false            5752
half true        5752
mostly true      5752
pants on fire    5752
dtype: int64

In [9]:
# English stop-word list from the NLTK corpus. preprocessing() below does not
# filter tokens against it.
stopWords = stopwords.words('english')

def preprocessing(raw_text):
    """Lower-case, strip punctuation from, and tokenize every text.

    Args:
        raw_text: iterable of strings (one statement per entry).

    Returns:
        A list with one entry per input text; each entry is the list of tokens
        produced by ``word_tokenize`` on the cleaned text. Stop words are kept.
    """
    # One deletion table for everything that must go: ASCII punctuation plus
    # the curly single quotes that string.punctuation does not contain.
    strip_table = str.maketrans('', '', punctuation + '’‘')

    tokenized = []
    for sent in raw_text:
        cleaned = sent.lower().translate(strip_table)
        tokenized.append(word_tokenize(cleaned))
    return tokenized

In [10]:
def getVocab(text, vocab):
    """Add every token found in ``text`` to ``vocab`` and return ``vocab``.

    Args:
        text: iterable of token lists.
        vocab: set collecting the tokens; it is updated in place.

    Returns:
        The same ``vocab`` object that was passed in.
    """
    for tokens in text:
        vocab.update(tokens)
    return vocab

In [11]:
def wordVec(text, vocab):
    """Replace every token by its position in ``vocab``.

    Args:
        text: iterable of token lists.
        vocab: iterable of words; a word's id is its iteration position,
            starting at 0.

    Returns:
        A list of lists of integer ids, parallel to ``text``.

    Raises:
        KeyError: if a token is not present in ``vocab``.
    """
    # NOTE(review): id 0 belongs to the first vocabulary word, and padding()
    # also fills with 0 -- confirm that downstream code can live with that.
    position_of = {word: position for position, word in enumerate(vocab)}
    return [[position_of[token] for token in tokens] for tokens in text]

In [12]:
def padding(seq, maxlen):
    """Left-pad with zeros or truncate every sequence to exactly ``maxlen`` items.

    Args:
        seq: iterable of sequences. Items are either scalars (e.g. token ids)
            or vectors (anything that supports ``len``).
        maxlen: length of every returned sequence.

    Returns:
        A list of lists, one per input sequence. Sequences shorter than
        ``maxlen`` get zeros prepended: a scalar ``0`` for scalar items, a list
        of zeros of the same width for vector items. Sequences of length
        ``maxlen`` or more keep only their first ``maxlen`` items.
    """
    def _zero_like(item):
        # Vector items pad with an all-zero vector of equal width; anything
        # without a length (int, float, numpy scalar, ...) pads with 0.
        try:
            return [0] * len(item)
        except TypeError:
            return 0

    seq = list(seq)
    # An empty sequence has no item to copy the padding shape from, so borrow
    # it from the first non-empty sequence (plain 0 if every sequence is empty).
    fallback = next((s[0] for s in seq if len(s) > 0), 0)

    final = []
    for lis in seq:
        missing = maxlen - len(lis)
        if missing > 0:
            template = lis[0] if len(lis) > 0 else fallback
            # Build a fresh pad item per position so vector pads are not shared.
            final.append([_zero_like(template) for _ in range(missing)] + list(lis))
        else:
            # Truncate: keep the leading maxlen items (none when maxlen <= 0).
            final.append([lis[i] for i in range(maxlen)])
    return final

In [13]:
def remove_nan(word_seq, data, threshold=0.8):
    """Fill missing 'job title' / 'state info' from the most similar text, then drop incomplete rows.

    For every row, the most similar other row is found with cosine similarity
    over ``word_seq``. A missing 'job title' or 'state info' is copied from
    that neighbour when the similarity is strictly greater than ``threshold``.
    Rows that still contain a missing value in any column are then removed
    from both the frame and the sequence array, so the two outputs stay
    row-aligned.

    Args:
        word_seq: 2-D array-like with one row per row of ``data``.
        data: DataFrame with 'job title' and 'state info' columns. It is not
            modified; the result is built on a copy.
        threshold: minimum (exclusive) cosine similarity for copying a value.

    Returns:
        Tuple ``(frame, sequences)``: the completed frame with a fresh
        0..n-1 index, and the rows of ``word_seq`` that belong to it.

    Raises:
        ValueError: if ``word_seq`` and ``data`` have different row counts.
    """
    if len(word_seq) != len(data):
        raise ValueError(
            'word_seq has %d rows but data has %d' % (len(word_seq), len(data))
        )

    # Copy with a fresh 0..n-1 index: everything below addresses rows by
    # position, matching the rows of the similarity matrix.
    filled = data.reset_index(drop=True)

    similarity = cosine_similarity(word_seq, word_seq)
    np.fill_diagonal(similarity, -1)  # a row must never be its own neighbour
    nearest = np.argmax(similarity, axis=1)

    for column in ('job title', 'state info'):
        col_pos = filled.columns.get_loc(column)
        missing_rows = np.flatnonzero(filled[column].isna().to_numpy())
        for row in missing_rows:
            donor = nearest[row]
            if similarity[row, donor] > threshold:
                # Reads the current value, so a donor filled earlier in this
                # loop can pass its value on.
                filled.iat[row, col_pos] = filled.iat[donor, col_pos]

    # One mask drives both outputs, which keeps frame and sequences aligned.
    incomplete = filled.isna().any(axis=1).to_numpy()
    filled = filled.loc[~incomplete].reset_index(drop=True)
    return filled, np.asarray(word_seq)[~incomplete]


# text

In [14]:
# Raw statement text of each split as a NumPy array.
train_text, test_text, valid_text = (
    np.array(frame['text']) for frame in (train, test, valid)
)

In [15]:
# Clean and tokenize every split (train, then test, then valid).
train_tokens, test_tokens, valid_tokens = map(
    preprocessing, (train_text, test_text, valid_text)
)

In [16]:
# Summary statistics of the number of tokens per training text.
count = list(map(len, train_tokens))
pd.DataFrame(count).describe()

Unnamed: 0,0
count,10238.0
mean,17.935827
std,9.581478
min,2.0
25%,12.0
50%,17.0
75%,22.0
max,464.0


In [17]:
## get vocab

# The vocabulary is collected over all three splits, so every token in any
# split has an id when wordVec() looks it up. Sorting fixes the id order.
vocab = set()
for split_tokens in (train_tokens, test_tokens, valid_tokens):
    vocab = getVocab(split_tokens, vocab)
vocab = sorted(vocab)
len(vocab)

14957

In [18]:
# words

## map tokens to vocabulary ids, then left-pad / truncate each text to 1200 ids
word_seq_train = np.array(padding(wordVec(train_tokens, vocab), maxlen=1200), dtype='float32')
word_seq_test = np.array(padding(wordVec(test_tokens, vocab), maxlen=1200), dtype='float32')
word_seq_valid = np.array(padding(wordVec(valid_tokens, vocab), maxlen=1200), dtype='float32')

# Missing 'job title' / 'state info' values are taken from the most similar
# text when the similarity exceeds the threshold; remove_nan() also returns
# the sequences with the rows it dropped removed.
threshold = 0.8
train, word_seq_train = remove_nan(word_seq_train, train, threshold=threshold)
test, word_seq_test = remove_nan(word_seq_test, test, threshold=threshold)
valid, word_seq_valid = remove_nan(word_seq_valid, valid, threshold=threshold)

In [19]:
## saving as csv
# One CSV per split, written without the row index.
word_seq_files = {
    '../data/liar_dataset/text_seq_data/word_seq_train.csv': word_seq_train,
    '../data/liar_dataset/text_seq_data/word_seq_test.csv': word_seq_test,
    '../data/liar_dataset/text_seq_data/word_seq_valid.csv': word_seq_valid,
}
for seq_path, seq_values in word_seq_files.items():
    pd.DataFrame(seq_values).to_csv(seq_path, index=False)


# label

In [22]:
# Label scores of each split as NumPy arrays.
train_label, test_label, valid_label = (
    np.array(frame.label) for frame in (train, test, valid)
)

# One single-column ('label') CSV per split, written without the row index.
label_files = {
    '../data/liar_dataset/label_seq_data/train_label.csv': train_label,
    '../data/liar_dataset/label_seq_data/test_label.csv': test_label,
    '../data/liar_dataset/label_seq_data/valid_label.csv': valid_label,
}
for label_path, label_values in label_files.items():
    pd.DataFrame(label_values, columns=['label']).to_csv(label_path, index=False)

# meta data

In [23]:
# Everything except the label and the text is exported as meta data.
# A dedicated name is used for the column selection so the global `columns`
# list (the names handed to read_csv in the load cells) is not overwritten;
# the load cells can then still be re-run after this one.
# Note: `train` also carries the 'index' column added by reset_index() when it
# was loaded, so that column is part of the training export.
meta_columns = train.columns.drop(['label', 'text'])
train_meta = np.array(train[meta_columns])
pd.DataFrame(train_meta, columns=meta_columns).to_csv('../data/liar_dataset/meta_data/train_meta.csv', index=False)

meta_columns = test.columns.drop(['label', 'text'])
test_meta = np.array(test[meta_columns])
pd.DataFrame(test_meta, columns=meta_columns).to_csv('../data/liar_dataset/meta_data/test_meta.csv', index=False)

meta_columns = valid.columns.drop(['label', 'text'])
valid_meta = np.array(valid[meta_columns])
pd.DataFrame(valid_meta, columns=meta_columns).to_csv('../data/liar_dataset/meta_data/valid_meta.csv', index=False)