In [None]:
import pandas as pd
import os.path
import pickle 
import numpy as np
import keras.utils
import time
from keras.callbacks import TensorBoard, CSVLogger
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,Flatten,LSTM,Conv1D,GlobalMaxPool1D,Dropout,Bidirectional
# from keras.layers.embedding import Embedding
from keras import optimizers
from keras.layers import Input
from keras.models import Model
from keras.utils.vis_utils import plot_model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.models import load_model
from nltk.corpus import stopwords
import operator

In [None]:
# Mount Google Drive so the CSV and GloVe files under /content/drive/MyDrive
# (read in later cells) are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pydot==1.2.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydot==1.2.3
  Downloading pydot-1.2.3.tar.gz (20 kB)
Building wheels for collected packages: pydot
  Building wheel for pydot (setup.py) ... [?25l[?25hdone
  Created wheel for pydot: filename=pydot-1.2.3-py3-none-any.whl size=18941 sha256=82813089c72d7b60c83cbec4c8b9f5007ea04199300a41c652d5fc1ae74f20e5
  Stored in directory: /root/.cache/pip/wheels/97/f3/4f/2614983209399831c4b278ae354b87b35cdc070703c5c8611d
Successfully built pydot
Installing collected packages: pydot
  Attempting uninstall: pydot
    Found existing installation: pydot 1.3.0
    Uninstalling pydot-1.3.0:
      Successfully uninstalled pydot-1.3.0
Successfully installed pydot-1.2.3


In [None]:
# Locations of the labelled training CSV and the test CSV on the mounted Drive.
train_csv_path = "/content/drive/MyDrive/F3_FineGrained_Fake_News_Detection_train.csv"
test_csv_path = "/content/drive/MyDrive/F3_FineGrained_Fake_News_Detection_test.csv"

train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [None]:
### Renaming some columns
# Copy the three long-named columns to short names (appended at the end of the
# frame, in the order party, job, state) and then drop the originals.
train_data = train_data.assign(
    party=train_data['party affiliation'],
    job=train_data['speakers job title'],
    state=train_data['state info'],
).drop(columns=['party affiliation', 'speakers job title', 'state info'])

test_data = test_data.assign(
    party=test_data['party affiliation'],
    job=test_data['speakers job title'],
    state=test_data['state info'],
).drop(columns=['party affiliation', 'speakers job title', 'state info'])

In [None]:
# Truthfulness labels mapped to class ids 0..5.
label_enc = {
    "pants-fire": 0,
    "false": 1,
    "barely-true": 2,
    "half-true": 3,
    "mostly-true": 4,
    "true": 5,
}
# A label missing from label_enc raises KeyError rather than being skipped.
train_data['output'] = [label_enc[label] for label in train_data['label']]



In [None]:
# Import label encoder
from sklearn import preprocessing

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()

# Each categorical column gets an integer "<column>_id" companion column.
# The encoder is fitted ONCE per column on the values of train and test
# together and then applied to both frames, so a given speaker/state/job/
# party/subject maps to the same id in train and in test. Refitting on the
# test frame alone would assign ids from a different vocabulary.
categorical_columns = ['speaker', 'state', 'job', 'party', 'subject']
for column in categorical_columns:
    label_encoder.fit(list(train_data[column]) + list(test_data[column]))
    train_data[column + '_id'] = label_encoder.transform(train_data[column])
    test_data[column + '_id'] = label_encoder.transform(test_data[column])



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; random_state pins the split.
# train_data is rebound to the remaining 80%, so re-running this cell alone
# would split the already-reduced frame again.
split_frames = train_test_split(train_data, test_size=0.2, random_state=0)
train_data, val_data = split_frames


In [None]:
# Fetch the NLTK stopword corpus; `stopwords.words('english')` in the statement
# preprocessing below reads from it.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#encoding  statements
def get_vocab_dict(train_data):
  """Return the word -> integer-id vocabulary, cached in 'vocabulary.p'.

  If 'vocabulary.p' exists in the working directory it is unpickled and
  returned as-is (``train_data`` is not consulted). Otherwise a Keras
  ``Tokenizer`` is fitted on ``train_data['statement']``, its ``word_index``
  is pickled to 'vocabulary.p', and that mapping is returned.

  Args:
    train_data: object indexable by 'statement' yielding the training texts.

  Returns:
    dict mapping each word to its tokenizer index.
  """
  cache_path = 'vocabulary.p'

  if os.path.exists(cache_path):
    # NOTE(review): unpickling executes whatever the file encodes; only reuse a
    # cache file this notebook wrote itself. A stale cache from a different
    # training set is also returned unchanged - delete the file to rebuild.
    with open(cache_path, "rb") as cache_file:
      return pickle.load(cache_file)

  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(train_data['statement'])
  vocab_dict = tokenizer.word_index
  # Context manager guarantees the handle is flushed and closed.
  with open(cache_path, "wb") as cache_file:
    pickle.dump(vocab_dict, cache_file)

  return vocab_dict

## removing stopwords
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
  """Return the NLTK English stopwords as a frozenset, loading them only once."""
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOPWORDS


def preprocessing(statement):
  """Convert a statement string into a list of vocabulary ids.

  Steps: split on single spaces, drop tokens that are exactly an English
  stopword, re-join, tokenize with ``text_to_word_sequence``, and keep the
  ``vocab_dict`` id of every resulting token that is in the vocabulary.

  The stopword list is loaded once and held as a set; previously it was
  reloaded and scanned linearly for every single word.

  Args:
    statement: the raw statement text (must support ``.split``).

  Returns:
    list of integer ids; out-of-vocabulary tokens are skipped.
  """
  stop_words = _english_stopwords()
  # NOTE(review): this filter sees the raw space-separated tokens, i.e. before
  # text_to_word_sequence normalises them, so a stopword that is capitalised or
  # has punctuation attached is not removed here - confirm that is intended.
  kept_words = [word for word in statement.split(' ') if word not in stop_words]
  tokens = text_to_word_sequence(' '.join(kept_words))
  return [vocab_dict[token] for token in tokens if token in vocab_dict]


# Build (or load) the vocabulary from the training split, then add a 'word_id'
# column of vocabulary ids to each of the three splits.
vocab_dict = get_vocab_dict(train_data)
for split_frame in (train_data, val_data, test_data):
  split_frame['word_id'] = split_frame['statement'].apply(preprocessing)


In [None]:
import spacy
!python -m spacy download en
nlp = spacy.load("en_core_web_sm")


[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 31.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
### pos tagging
# Human-readable descriptions of part-of-speech tag codes. Reference table only;
# no cell shown in this notebook reads it.
pos_tags = {'ADJ': 'adjective', 'ADP': 'adposition', 'ADV': 'adverb', 
            'AUX': 'auxiliary verb', 'CONJ': 'coordinating conjunction', 
            'DET': 'determiner', 'INTJ': 'interjection', 'NOUN': 'noun', 
            'NUM': 'numeral', 'PART': 'particle', 'PRON': 'pronoun', 
            'PROPN': 'proper noun', 'PUNCT': 'punctuation', 'X': 'other', 
            'SCONJ': 'subord conjunction', 'SYM': 'symbol', 'VERB': 'verb'}

# POS tag -> integer id used as model input. Ids run 0..9 (ten distinct values);
# PRON, X, PART, SYM and INTJ all share id 9. Tags not listed here (e.g. AUX,
# SCONJ) are mapped to the largest id by get_pos.
# NOTE(review): NOUN has id 0; if sequences are padded with 0, padding and NOUN
# become indistinguishable - confirm whether that matters.
pos_dict = {'NOUN' : 0, 'VERB' : 1, 'ADP' : 2, 'PROPN' : 3, 'PUNCT' : 4, 
            'DET' : 5, 'ADJ' : 6, 'NUM' : 7, 'ADV' : 8, 'PRON' : 9, 'X' : 9, 
            'PART' : 9, 'SYM' : 9, 'INTJ' : 9 }

# Dependency-relation label -> integer id (0..44). Not referenced by any cell
# shown in this notebook.
dep_dict = {'ACL' : 0, 'ACOMP' : 1, 'ADVCL' : 2, 'ADVMOD' : 3, 'AGENT' : 4, 
            'AMOD' : 5, 'APPOS' : 6, 'ATTR' : 7, 'AUX' : 8, 'AUXPASS' : 9, 
            'CASE' : 10, 'CC' : 11, 'CCOMP' : 12, 'COMPOUND' : 13, 'CONJ' : 14, 
            'CSUBJ' : 15, 'CSUBJPASS' : 16, 'DATIVE' : 17, 'DEP' : 18, 
            'DET' : 19, 'DOBJ' : 20, 'EXPL' : 21, 'INTJ' : 22, 'MARK' : 23, 
            'META' : 24, 'NEG' : 25, 'NOUNMOD' : 26, 'NPMOD' : 27, 'NSUBJ' : 28, 
            'NSUBJPASS' : 29, 'NUMMOD' : 30, 'OPRD' : 31, 'PARATAXIS' : 32, 
            'PCOMP' : 33, 'POBJ' : 34, 'POSS' : 35, 'PRECONJ' : 36, 'PREDET' : 37, 
            'PREP' : 38, 'PRT' : 39, 'PUNCT' : 40, 'QUANTMOD' : 41, 
            'RELCL' : 42, 'ROOT' : 43, 'XCOMP' : 44}

def get_pos(statement):
  """Return the list of POS ids for the tokens of `statement`.

  The statement is parsed with the module-level spaCy pipeline `nlp`; each
  token's `pos_` tag is looked up in `pos_dict`, and any tag missing from
  `pos_dict` receives the largest id present in it.
  """
  fallback_id = max(pos_dict.values())
  return [pos_dict.get(token.pos_, fallback_id) for token in nlp(statement)]


In [None]:
# Add a 'pos_id' column (list of POS ids per statement) to every split.
for split_frame in (train_data, val_data, test_data):
  split_frame['pos_id'] = split_frame['statement'].apply(get_pos)


In [None]:
## word embeddings
# Parse the GloVe text file: each line is a word followed by its vector values.
embeddings = {}
# The encoding is stated explicitly so decoding does not depend on the platform
# default; the GloVe vocabulary contains non-ASCII tokens.
with open("/content/drive/MyDrive/glove.6B.100d.txt", encoding="utf-8") as file_object:
  for line in file_object:
    embedded_word = line.split()
    word = embedded_word[0]
    embed = np.array(embedded_word[1:], dtype="float32")
    embeddings[word.lower()]= embed

emb_dimension = 100

# Row i holds the GloVe vector of the word whose vocabulary id is i. Row 0 and
# the rows of words absent from GloVe stay all-zero.
num_words = len(vocab_dict) + 1
emb_matrix = np.zeros((num_words, emb_dimension))
for word, i in vocab_dict.items():
    emb_vector = embeddings.get(word)
    if emb_vector is not None:
        emb_matrix[i] = emb_vector

emb_index = None
# One-hot matrix intended as fixed POS embedding weights.
# NOTE(review): pos_dict ids run from 0 to max(pos_dict.values()) inclusive, so
# an identity of size max(...) has no row for the highest id. The size is kept
# as-is here because the model cells build their Embedding layers to match it.
pos_embeddings = np.identity(max(pos_dict.values()), dtype=int)

In [None]:
from keras.utils import pad_sequences

In [None]:
###data preprocessing
# hyperparameters setting
vocab_length = len(vocab_dict)
hidden_size = emb_dimension
lstm_size = 100
num_steps = 15      # every sequence is padded/truncated to this many ids
num_epochs = 30
batch_size = 40
kernel_sizes = [3, 3]
filter_size = 128


def _pad_to_num_steps(id_sequences):
  """Pad or cut each id list at its end so all rows have exactly num_steps entries."""
  return pad_sequences(id_sequences, maxlen=num_steps, padding='post', truncating='post')


# Word-id inputs for the three splits.
X_train = _pad_to_num_steps(train_data['word_id'])
X_val = _pad_to_num_steps(val_data['word_id'])
X_test = _pad_to_num_steps(test_data['word_id'])

# One-hot targets over the 6 label classes (the test split has no 'output' here).
Y_train = keras.utils.to_categorical(train_data['output'], num_classes=6)
Y_val = keras.utils.to_categorical(val_data['output'], num_classes=6)

# POS-id inputs, padded the same way as the word ids.
X_train_pos = _pad_to_num_steps(train_data['pos_id'])
X_val_pos = _pad_to_num_steps(val_data['pos_id'])
X_test_pos = _pad_to_num_steps(test_data['pos_id'])



In [None]:
# Global switch read by the model-building cells below: when True the models
# take a second 'pos_input' alongside 'main_input'. The train/predict helpers
# have their own `use_pos` parameter, which should be given the same value.
use_pos=False
def train(model, name, use_pos=False):
  """Compile `model`, fit it on the training split, then write test predictions.

  The model is compiled with SGD and categorical cross-entropy and trained for
  `num_epochs` epochs on the module-level X_train / Y_train arrays, validating
  on X_val / Y_val. Callbacks: TensorBoard, a CSV log written to
  'training.log', and a checkpoint that saves to '<name>_weights_best.hdf5'
  whenever validation categorical accuracy improves. Afterwards `predict_test`
  is called with the model as it stands at the end of training.

  Args:
    model: Keras model with an input named 'main_input' (plus 'pos_input' when
      `use_pos` is True) and an output named 'main_output'.
    name: prefix for the checkpoint file name.
    use_pos: whether to feed the POS-id arrays as a second input.
  """
  # `learning_rate` is the current keyword; the old `lr` spelling is deprecated.
  sgd = optimizers.SGD(learning_rate=0.25, clipvalue=0.4, nesterov=True)
  model.compile(optimizer=sgd,loss='categorical_crossentropy',metrics=['categorical_accuracy'],)
  tb = TensorBoard()
  csv_logger = keras.callbacks.CSVLogger('training.log')
  filepath= name+"_weights_best.hdf5"
  checkpoint = keras.callbacks.ModelCheckpoint(filepath, monitor='val_categorical_accuracy', 
                                             verbose=1, save_best_only=True, mode='max')

  # Inputs are passed by layer name; the POS arrays are added only on request.
  train_inputs = {'main_input': X_train}
  val_inputs = {'main_input': X_val}
  if use_pos:
    train_inputs['pos_input'] = X_train_pos
    val_inputs['pos_input'] = X_val_pos

  model.fit(
    train_inputs,
    {'main_output': Y_train}, epochs = num_epochs, batch_size = batch_size,
    validation_data = (val_inputs, {'main_output': Y_val}),
    callbacks=[tb,csv_logger,checkpoint])

  predict_test(model,name,use_pos)
      
def predict_test(model, name, use_pos=False):
  """Predict a class for every test row, print the classes, and save them.

  Writes 'output.csv' with two columns: 'id' (0..n-1) and 'label' (the index of
  the highest-scoring class). `name` is accepted but not used.
  """
  model_inputs = [X_test, X_test_pos] if use_pos else [X_test]
  class_scores = model.predict(model_inputs, batch_size=batch_size, verbose=1)

  predictions = np.array(list(map(np.argmax, class_scores)))
  print(predictions)

  submission = pd.DataFrame({'id': range(len(predictions)), 'label': predictions})
  submission.to_csv('output.csv', index=False)


def predict_train(model, name, use_pos=False):
  """Return the predicted class index for every row of the training inputs.

  Uses the module-level X_train (and X_train_pos when `use_pos` is True).
  `name` is accepted but not used.
  """
  model_inputs = [X_train, X_train_pos] if use_pos else [X_train]
  class_scores = model.predict(model_inputs, batch_size=batch_size, verbose=1)
  return np.array(list(map(np.argmax, class_scores)))


In [None]:
from keras.layers import Embedding


In [None]:
# CNN model: parallel Conv1D + global-max-pool branches over the frozen word
# embeddings and, when the global `use_pos` is True, over one-hot POS embeddings.
filter_without_pos = []
filter_with_pos = []

statement_input = Input(shape=(num_steps,), dtype='int32', name='main_input')
x_stmt = Embedding(vocab_length+1,emb_dimension,weights=[emb_matrix],input_length=num_steps,trainable=False)(statement_input) 

# pos embed
# POS ids run 0..max(pos_dict.values()) inclusive, so the embedding table needs
# max+1 rows; with only `max` rows the highest id would index past the table.
num_pos_tags = max(pos_dict.values()) + 1
pos_one_hot = np.identity(num_pos_tags, dtype=int)
pos_input = Input(shape=(num_steps,), dtype='int32', name='pos_input')
x_pos = Embedding(num_pos_tags, num_pos_tags, weights=[pos_one_hot], input_length=num_steps, trainable=False)(pos_input)

for kernel in kernel_sizes:
    x_1 = Conv1D(filters=filter_size,kernel_size=kernel)(x_stmt)
    x_1 = GlobalMaxPool1D()(x_1)
    filter_without_pos.append(x_1)
    
    x_2 = Conv1D(filters=filter_size,kernel_size=kernel)(x_pos)
    x_2 = GlobalMaxPool1D()(x_2)
    filter_with_pos.append(x_2)


conv_in1 = keras.layers.concatenate(filter_without_pos)
conv_in1 = Dropout(0.6)(conv_in1)
conv_in1 = Dense(128, activation='relu')(conv_in1)

conv_in2 = keras.layers.concatenate(filter_with_pos)
conv_in2 = Dropout(0.6)(conv_in2)
conv_in2 = Dense(128, activation='relu')(conv_in2)

# The POS branch only becomes part of the model when use_pos is set.
if use_pos:
    x = keras.layers.concatenate([conv_in1, conv_in2])
else:
    x = conv_in1


main_output = Dense(6, activation='softmax', name='main_output')(x)

if use_pos:
    model_cnn = Model(inputs=[statement_input, pos_input], outputs=[main_output])
else:
    model_cnn = Model(inputs=[statement_input], outputs=[main_output])
    


In [None]:
# Pass the same global flag the model was built with, so the inputs fed during
# training always match the model's declared inputs.
train(model_cnn,'cnn',use_pos=use_pos)

Epoch 1/30


  super(SGD, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Epoch 1: val_categorical_accuracy improved from -inf to 0.21409, saving model to cnn_weights_best.hdf5
Epoch 2/30
Epoch 2: val_categorical_accuracy improved from 0.21409 to 0.21618, saving model to cnn_weights_best.hdf5
Epoch 3/30
Epoch 3: val_categorical_accuracy did not improve from 0.21618
Epoch 4/30
Epoch 4: val_categorical_accuracy improved from 0.21618 to 0.22106, saving model to cnn_weights_best.hdf5
Epoch 5/30
Epoch 5: val_categorical_accuracy did not improve from 0.22106
Epoch 6/30
Epoch 6: val_categorical_accuracy improved from 0.22106 to 0.23780, saving model to cnn_weights_best.hdf5
Epoch 7/30
Epoch 7: val_categorical_accuracy did not improve from 0.23780
Epoch 8/30
Epoch 8: val_categorical_accuracy did not improve from 0.23780
Epoch 9/30
Epoch 9: val_categorical_accuracy did not improve from 0.23780
Epoch 10/30
Epoch 10: val_categorical_accuracy did not improve from 0.23780
Epoch 11/30
Epoch 11: val_categorical_accuracy did not improve from 0.23780
Epoch 12/30
Epoch 12: va

In [None]:
# LSTM model
# (A Sequential bidirectional-LSTM model used to be built here and bound to
# model_lstm, but it was unconditionally replaced by the functional model at
# the end of this cell, so it has been removed.)

# statement embed LSTM
statement_input = Input(shape=(num_steps,), dtype='int32', name='main_input')
x = Embedding(vocab_length+1,emb_dimension,weights=[emb_matrix],input_length=num_steps,trainable=False)(statement_input) 
lstm_in = LSTM(lstm_size,dropout=0.2)(x)


# pos embed LSTM
# POS ids run 0..max(pos_dict.values()) inclusive, so the embedding table needs
# max+1 rows; with only `max` rows the highest id would index past the table.
num_pos_tags = max(pos_dict.values()) + 1
pos_one_hot = np.identity(num_pos_tags, dtype=int)
pos_input = Input(shape=(num_steps,), dtype='int32', name='pos_input')
x2 = Embedding(num_pos_tags, num_pos_tags, weights=[pos_one_hot], input_length=num_steps, trainable=False)(pos_input)
lstm_in2 = LSTM(lstm_size, dropout=0.2)(x2)


# The POS branch only becomes part of the model when use_pos is set.
if use_pos :
  x = keras.layers.concatenate([lstm_in, lstm_in2])
else:
  x = lstm_in

main_output = Dense(6, activation='softmax', name='main_output')(x)

if use_pos:
  model_lstm = Model(inputs=[statement_input, pos_input], outputs=[main_output])
else:
  model_lstm = Model(inputs=[statement_input], outputs=[main_output])

In [None]:
# Pass the same global flag the model was built with, so the inputs fed during
# training always match the model's declared inputs.
train(model_lstm,'lstm',use_pos=use_pos)

Epoch 1/30
Epoch 1: val_categorical_accuracy improved from -inf to 0.18410, saving model to lstm_weights_best.hdf5
Epoch 2/30
Epoch 2: val_categorical_accuracy improved from 0.18410 to 0.22803, saving model to lstm_weights_best.hdf5
Epoch 3/30
Epoch 3: val_categorical_accuracy improved from 0.22803 to 0.25035, saving model to lstm_weights_best.hdf5
Epoch 4/30
Epoch 4: val_categorical_accuracy improved from 0.25035 to 0.25523, saving model to lstm_weights_best.hdf5
Epoch 5/30
Epoch 5: val_categorical_accuracy did not improve from 0.25523
Epoch 6/30
Epoch 6: val_categorical_accuracy did not improve from 0.25523
Epoch 7/30
Epoch 7: val_categorical_accuracy did not improve from 0.25523
Epoch 8/30
Epoch 8: val_categorical_accuracy did not improve from 0.25523
Epoch 9/30
Epoch 9: val_categorical_accuracy did not improve from 0.25523
Epoch 10/30
Epoch 10: val_categorical_accuracy did not improve from 0.25523
Epoch 11/30
Epoch 11: val_categorical_accuracy did not improve from 0.25523
Epoch 12/

In [None]:
# Predicted class per training row; use_pos is forwarded so the inputs match
# the way model_lstm was built.
y_pred = predict_train(model_lstm, 'lstm', use_pos=use_pos)



In [None]:
# Macro-averaged F1 of the LSTM predictions against the TRAINING labels
# (y_pred comes from predict_train). This measures fit to the training split,
# not performance on held-out data.
from sklearn.metrics import f1_score
f1_score(train_data['output'], y_pred, average='macro')

0.5086813276100964