# Description

This notebook contains
- training of a CatBoost (gradient-boosted decision tree) model for binary classification
- training of a CatBoost (gradient-boosted decision tree) model for 6-way classification

Libraries used
- keras for tokenizing and padding the statements
- catboost for the gradient-boosted decision tree models

## Model Description
### Binary - cat_model
- tokenize the statement text and pad it to fixed-length integer sequences
- train a binary CatBoostClassifier on the padded sequences

### 6 way - cat_model_6
- tokenize the statement text and pad it to fixed-length integer sequences
- train a 6-class CatBoostClassifier on the padded sequences

## Result
### Binary - cat_model
- Test accuracy - 56.43%
 
### 6 way - cat_model_6
- Test accuracy - 21.23%

## Model file
### Binary - cat_model
- cat_model

### 6 way - cat_model_6
- cat_model_6


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import pandas as pd

In [None]:
# Directory that holds the dataset, and the path of the TSV file for each
# split (train / test / validation) inside it.
dataset_dir = "dataset"

train_data_file, test_data_file, val_data_file = (
    os.path.join(dataset_dir, split_file)
    for split_file in ("train2.tsv", "test2.tsv", "val2.tsv")
)

In [None]:
# Column names for the TSV files, taken from the README.md of the
# LIAR-PLUS GitHub repo: https://github.com/Tariq60/LIAR-PLUS
col_names = [
    "id",
    "label",
    "statement",
    "subject",
    "speaker",
    "speaker_job",
    "state_info",
    "party",
    "barely_true",
    "false",
    "half_true",
    "mostly_true",
    "pants_on_fire",
    "context",
    "ex_just",
]

In [None]:
# Read the three header-less, tab-separated split files into DataFrames,
# labelling the columns with `col_names`. Files are read in the order
# train, test, validation.
train_data, test_data, val_data = (
    pd.read_csv(split_path, sep='\t', header=None, names=col_names)
    for split_path in (train_data_file, test_data_file, val_data_file)
)

In [None]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [None]:
# max_len_seq     -> `maxlen` given to pad_sequences (length of every padded sequence)
# max_no_of_words -> `num_words` given to the Tokenizer (vocabulary cap)
max_len_seq, max_no_of_words = 1000, 20000

In [None]:
# Tokenize the statements.
# The tokenizer's vocabulary is fitted on the training statements only and
# then reused to encode the validation and test statements.
tokenizer = Tokenizer(num_words=max_no_of_words)
tokenizer.fit_on_texts(list(train_data["statement"]))

# Statement text -> list of integer token ids, one list per statement.
train_sequences, val_sequences, test_sequences = (
    tokenizer.texts_to_sequences(list(split_frame["statement"]))
    for split_frame in (train_data, val_data, test_data)
)

# Pad/truncate every sequence to `max_len_seq` so each split becomes a
# rectangular 2-D array.
train_seq, val_seq, test_seq = (
    np.array(pad_sequences(token_ids, maxlen=max_len_seq))
    for token_ids in (train_sequences, val_sequences, test_sequences)
)

In [None]:
# Binary targets: the six labels are collapsed into two classes
# (pants-fire / false / barely-true -> 0, half-true / mostly-true / true -> 1).
tf_val = {
    "pants-fire": 0,
    "false": 0,
    "barely-true": 0,
    "half-true": 1,
    "mostly-true": 1,
    "true": 1,
}

# For each split: an integer label vector plus its one-hot encoding.
# A label missing from `tf_val` raises KeyError.
train_tf = np.array([tf_val[label] for label in train_data["label"]])
train_cat_tf = to_categorical(train_tf)

val_tf = np.array([tf_val[label] for label in val_data["label"]])
val_cat_tf = to_categorical(val_tf)

test_tf = np.array([tf_val[label] for label in test_data["label"]])
test_cat_tf = to_categorical(test_tf)

In [None]:
from catboost import CatBoostClassifier

# Binary model: fitted on the padded token-id sequences against the 0/1
# labels, with the validation split supplied as the evaluation set.
# NOTE(review): no `cat_features` are passed, so the token ids in
# `train_seq` are consumed as plain numeric features.
cat_model = CatBoostClassifier(loss_function="CrossEntropy")
cat_model.fit(X=train_seq, y=train_tf, eval_set=(val_seq, val_tf))

In [None]:
# Score the binary model on the held-out test split
# (reported as test accuracy in the notebook summary).
cat_model.score(X=test_seq, y=test_tf)

In [None]:
# Predict the binary class of every test statement.
pred = cat_model.predict(test_seq)

# Show the first 20 predictions above the first 20 true labels for a
# quick visual comparison.
print(pred[:20], test_tf[:20], sep="\n")

In [None]:
# Persist the trained binary model to the file "cat_model".
cat_model.save_model(fname="cat_model")

In [None]:
# 6-way targets: each label gets its own class index (0..5), plus the
# reverse lookup from class index back to label name.
six_val = {
    "pants-fire": 0,
    "false": 1,
    "barely-true": 2,
    "half-true": 3,
    "mostly-true": 4,
    "true": 5,
}
rev_six_val = {class_idx: label_name for label_name, class_idx in six_val.items()}

# For each split: an integer label vector plus its one-hot encoding.
# A label missing from `six_val` raises KeyError.
train_6 = np.array([six_val[label] for label in train_data["label"]])
train_cat_6 = to_categorical(train_6)

val_6 = np.array([six_val[label] for label in val_data["label"]])
val_cat_6 = to_categorical(val_6)

test_6 = np.array([six_val[label] for label in test_data["label"]])
test_cat_6 = to_categorical(test_6)

In [None]:
# 6-way model: same padded token-id features as the binary model, fitted
# against the six class indices with the validation split as the
# evaluation set.
cat_model_6 = CatBoostClassifier(loss_function="MultiClass")
cat_model_6.fit(X=train_seq, y=train_6, eval_set=(val_seq, val_6))

In [None]:
# Score the 6-way model on the held-out test split
# (reported as test accuracy in the notebook summary).
cat_model_6.score(X=test_seq, y=test_6)

In [None]:
# Predict the 6-way class of every test statement.
pred_6 = cat_model_6.predict(test_seq)

# Show the first 20 predictions above the first 20 true class indices for
# a quick visual comparison.
print(pred_6[:20], test_6[:20], sep="\n")

In [None]:
# Persist the trained 6-way model to the file "cat_model_6".
cat_model_6.save_model(fname="cat_model_6")