In [None]:
!pip install tensorflow-addons -q
!pip install scikit-plot -q

In [None]:
import numpy as np
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
from sklearn.metrics               import confusion_matrix, accuracy_score

In [None]:
import seaborn                     as sns                           # statistical data visualization
import tensorflow                  as tf                            # build machine learning models
import scikitplot                  as skplt                         # data visualization and machine-learning metrics

from sklearn.model_selection       import train_test_split          # split into training and test sets
from keras.utils                   import to_categorical
from keras.preprocessing.text      import one_hot                   # create tokens
from keras.preprocessing.sequence  import pad_sequences             # create padding
from sklearn.linear_model          import LogisticRegression
from keras.models                  import Sequential
from keras.layers                  import (Embedding,
                                           Dense,
                                           LSTM,
                                           Bidirectional,
                                           Dropout)

# Decision Tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Absolute Google Drive path of the tab-separated dataset; it only resolves
# once Google Drive has been mounted under /content/gdrive.
DATA = "/content/gdrive/My Drive/Projekt_MIO/dataset_PL/converted-exp-PL.tsv"

In [None]:
from google.colab import drive
import os
# Mount Google Drive first: every path under /content/gdrive depends on it.
drive.mount('/content/gdrive')
# Use the Drive root as working directory for any relative paths.
os.chdir('/content/gdrive/My Drive/')

# Data

In [None]:
# Read the raw, tab-delimited statements table and preview its first rows.
dfOry = pd.read_csv(
    DATA,
    delimiter='\t',
)
dfOry.head()

In [None]:
# Number of rows per target label, most frequent label first.
categories = dfOry['statementState'].value_counts()

# One bar per label.
plt.bar(
    categories.index,
    categories.values,
    width=0.4,
    color='maroon',
)
plt.title("Classes in dataset")
plt.xlabel("Classes")
plt.ylabel("Number")
plt.show()

# Count of missing values in every column of the raw frame.
dfOry.isnull().sum()

In [None]:
def clean_text(text):
    """Lower-case ``text``, strip ASCII punctuation and normalise whitespace.

    Args:
        text: Any value; it is converted with ``str()``. ``None`` and float
            NaN (what pandas uses for missing cells) are treated as missing.

    Returns:
        str: The cleaned words joined by single spaces, without leading or
        trailing whitespace. Missing input yields an empty string.
    """
    # Missing cells would otherwise be stringified into the literal tokens
    # "none" / "nan" and end up in the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Remove punctuation before splitting so that punctuation-only tokens
    # (e.g. a lone "-") disappear instead of leaving gaps behind. Only the
    # ASCII characters in string.punctuation are removed.
    without_punctuation = str(text).lower().translate(
        str.maketrans('', '', string.punctuation)
    )

    # split()/join collapses every run of whitespace to a single space and
    # drops leading/trailing whitespace.
    return " ".join(without_punctuation.split())

In [None]:
# Run clean_text over every free-text column of the raw frame.
for text_column in ('statementText', 'name', 'party'):
    dfOry[text_column] = dfOry[text_column].apply(clean_text)

In [None]:
# Keep only the modelling columns. The explicit copy makes `df` independent of
# dfOry, so later column assignments on `df` neither touch the raw frame nor
# trigger pandas' SettingWithCopyWarning.
df = dfOry[['name', 'party', 'statementText', 'statementState']].copy()

In [None]:
# Rows per label in the modelling frame.
df['statementState'].value_counts()

In [None]:
# 75 % of the rows go to training; the held-out 25 % (df_temp) is then halved
# into validation and test. Both splits are stratified on the label.
train, df_temp = train_test_split(df, test_size=0.25, stratify=df['statementState'], shuffle=True, random_state=123)
# Split df_temp, not the full frame: re-splitting `df` would put training rows
# into validation/test and leak them into every evaluation.
validation, test = train_test_split(df_temp, test_size=0.5, stratify=df_temp['statementState'], shuffle=True, random_state=123)
# Integer id assigned to each label string.
statementStateMap = { "FALSE": 0, "TRUE": 1, "UNVERIFIABLE": 2, "MISLEADING": 3 }

In [None]:
def convertToNumbers(x_data_of_strings, y_data_of_strings, vocab_size, sent_length):
    """Encode texts as padded integer sequences and labels as integer ids.

    Args:
        x_data_of_strings: Iterable of texts; each one is encoded with Keras'
            ``one_hot`` into word indices below ``vocab_size``.
        y_data_of_strings: Iterable of label strings; each must be a key of
            the module-level ``statementStateMap``.
        vocab_size: Size of the index space passed to ``one_hot``.
        sent_length: Length every sequence is padded/truncated to.

    Returns:
        tuple: ``(sequences, labels)`` - the padded sequences (padding added
        in front) and a NumPy array with one label id per input label.

    Raises:
        ValueError: If a label is missing from ``statementStateMap``.
    """
    # NOTE(review): one_hot hashes words; its default hash is reportedly not
    # stable across interpreter sessions - confirm before reusing saved
    # weights with freshly encoded text.
    encoded_texts = [one_hot(text, vocab_size) for text in x_data_of_strings]
    padded_sequences = pad_sequences(encoded_texts, padding='pre', maxlen=sent_length)

    labels = list(y_data_of_strings)
    # dict.get would silently turn an unknown label into None and produce an
    # object array that only fails later, far from the cause.
    unknown_labels = {label for label in labels if label not in statementStateMap}
    if unknown_labels:
        raise ValueError(f"Labels missing from statementStateMap: {sorted(map(str, unknown_labels))}")
    label_ids = np.array([statementStateMap[label] for label in labels])

    return padded_sequences, label_ids

## Decision Tree Classifier

In [None]:
from tqdm import tqdm
import nltk
# Fetch the NLTK resources at runtime. Of these, only the stop-word corpus is
# used by the code visible in this notebook.
# NOTE(review): 'punkt' looks unused here - confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
def preprocess_text(text_data, stop_words=None):
    """Strip punctuation, lower-case and remove stop words from each text.

    Args:
        text_data: Iterable of texts; each item is converted with ``str()``.
        stop_words: Optional iterable of lower-case words to drop. When
            omitted, NLTK's English stop-word list is used (the previous
            behaviour).

    Returns:
        list[str]: One cleaned, space-joined string per input item, in order.
    """
    # NOTE(review): the dataset path suggests Polish text, for which the
    # English default probably removes very little - confirm and pass a
    # suitable list via `stop_words`.
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once: the original rebuilt the list for every token
    # and then scanned it linearly.
    stop_word_set = frozenset(stop_words)

    preprocessed_text = []
    for sentence in tqdm(text_data):
        # Convert before the regex so that non-string items (e.g. NaN) do not
        # raise inside re.sub.
        sentence = re.sub(r'[^\w\s]', '', str(sentence))
        # Lower-case before the stop-word test so capitalised stop words
        # ("The") are filtered too.
        tokens = (token.lower() for token in sentence.split())
        preprocessed_text.append(' '.join(token for token in tokens
                                          if token not in stop_word_set))

    return preprocessed_text

In [None]:
# Replace the statement column with its stop-word-filtered version.
# assign() returns a new frame instead of writing into `df`, which is a
# selection of dfOry; this avoids SettingWithCopyWarning and keeps the cell
# safe to re-run.
# Note: `train` / `validation` / `test` were split off earlier and therefore
# still hold the unfiltered text.
preprocessed_review = preprocess_text(df['statementText'].values)
df = df.assign(statementText=preprocessed_review)
df['statementText']

In [None]:
# Encode the string labels as integer ids via statementStateMap.
statementState = df['statementState'].map(statementStateMap)

# Features and target for the classical (TF-IDF based) models.
X = df['statementText']
y = statementState
# Stratify on the label so train and test keep the same class proportions,
# consistent with the stratified split used for the neural models.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=123)

In [None]:
# TF-IDF features: the vocabulary is learned on the training texts only and
# then applied unchanged to the test texts.
# NOTE(review): strip_accents='ascii' may drop characters that have no ASCII
# decomposition (e.g. Polish 'ł'); 'unicode' may be the better fit - confirm.
vectorization = TfidfVectorizer(strip_accents='ascii')
train_statement_vectorized = vectorization.fit_transform(x_train)
test_statement_vectorized = vectorization.transform(x_test)

# Train the model. A fixed random_state (the same seed as the data splits)
# makes the fitted tree and its accuracy reproducible between runs.
decisionTreeClassifierModel = DecisionTreeClassifier(random_state=123)
decisionTreeClassifierModel.fit(train_statement_vectorized, y_train)

# Evaluate on the held-out texts
predictions = decisionTreeClassifierModel.predict(test_statement_vectorized)

accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

In [None]:
import graphviz
from sklearn import tree

# Names of the TF-IDF columns, in feature order.
feature_names = vectorization.get_feature_names_out()

# Distinct encoded labels seen in training, ascending, as strings.
class_names = list(map(str, sorted(set(y_train))))

# Export the fitted tree as DOT source and wrap it for rendering.
dot_decision_tree = tree.export_graphviz(
    decisionTreeClassifierModel,
    out_file=None,
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
)
decision_tree_plot = graphviz.Source(dot_decision_tree, format='png')
decision_tree_plot.render("decision_tree")  # write the rendered plot to disk
decision_tree_plot  # last expression of the cell: shown inline


# Logistic Regression

In [None]:
# Fresh TF-IDF vectorizer (default settings) for the logistic-regression
# baseline; note that it rebinds the name `vectorization`.
vectorization = TfidfVectorizer()
# Learn vocabulary and IDF weights from the training texts only ...
vectorization.fit(x_train)
# ... then project both splits with that fitted vocabulary.
x_train_fit = vectorization.transform(x_train)
x_test_fit = vectorization.transform(x_test)

In [None]:
# Logistic-regression baseline on the TF-IDF features.
logisticRegressionModel = LogisticRegression()
logisticRegressionModel.fit(x_train_fit, y_train)

# Accuracy on the training split first, then on the held-out split.
for split_features, split_labels in ((x_train_fit, y_train), (x_test_fit, y_test)):
    split_predictions = logisticRegressionModel.predict(split_features)
    print(accuracy_score(split_labels, split_predictions))

# Bidirectional LSTM model

In [None]:
# Hyper-parameters shared by both recurrent models.
sent_length = 500                 # length every encoded statement is padded/truncated to
embedding_vector_features = 40    # output dimension of the Embedding layer
voc_size = 10000                  # index space for one_hot and the Embedding input

In [None]:
# Label encoding for the neural models. It is kept identical to the mapping
# defined with the data split, so a class id means the same thing for every
# model and for the confusion-matrix axis labels.
statementStateMap = {'FALSE': 0, 'TRUE': 1, 'UNVERIFIABLE': 2, 'MISLEADING': 3}


def convertToNumbers2(x_data_of_strings, y_data_of_strings, vocab_size, sent_length):
    """Backward-compatible alias of ``convertToNumbers``.

    Args:
        x_data_of_strings: Iterable of texts to encode.
        y_data_of_strings: Iterable of label strings (keys of ``statementStateMap``).
        vocab_size: Size of the index space used for the word encoding.
        sent_length: Length every sequence is padded/truncated to.

    Returns:
        tuple: ``(padded_sequences, label_ids)`` exactly as returned by
        ``convertToNumbers``.
    """
    # Both functions had identical bodies; delegating keeps a single
    # implementation to maintain.
    return convertToNumbers(x_data_of_strings, y_data_of_strings, vocab_size, sent_length)


# Encode the training and validation splits.
# NOTE(review): X_test / Y_test hold the *validation* split; the `test` split
# is not used by any code visible in this notebook.
X_train, Y_train = convertToNumbers2(train['statementText'], train['statementState'], voc_size, sent_length)
X_test, Y_test = convertToNumbers2(validation['statementText'], validation['statementState'], voc_size, sent_length)

# One-hot targets, one column per class, for the softmax classifier.
Y_train = to_categorical(Y_train, num_classes=len(statementStateMap))
Y_test = to_categorical(Y_test, num_classes=len(statementStateMap))

In [None]:
# Bidirectional LSTM classifier:
# embedding -> BiLSTM(100 units) -> dropout(0.3) -> 4-way softmax.
# The model must be a Sequential instance; the previous `()` created an empty
# tuple, so the first .add() call raised AttributeError.
sequentialModel1 = Sequential()
sequentialModel1.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
sequentialModel1.add(Bidirectional(LSTM(100)))  # Bidirectional LSTM layer
sequentialModel1.add(Dropout(0.3))
sequentialModel1.add(Dense(4, activation='softmax'))  # Use softmax for multi-class classification
sequentialModel1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # Use categorical crossentropy
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
sequentialModel1.summary()

# Train the model; X_test / Y_test are only used to monitor each epoch.
sequentialModel1.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=10, batch_size=64)

In [None]:
# Per-class probabilities predicted by the bidirectional model.
pred = sequentialModel1.predict(X_test)

# Most probable class per sample, and the true class recovered from the
# one-hot targets.
pred_class = pred.argmax(axis=1)
Y_test_class = Y_test.argmax(axis=1)

# Share of samples whose predicted class equals the true class.
accuracy = accuracy_score(Y_test_class, pred_class)
print(f'Accuracy: {accuracy * 100:.2f}%')

In [None]:
# Axis labels ordered by encoded class id, derived from the mapping that
# produced the targets. A hard-coded list silently mislabels the axes as soon
# as the mapping's order differs from it.
class_labels = sorted(statementStateMap, key=statementStateMap.get)

# Pin the label set so the matrix always has one row/column per class, even
# if a class is absent from both the true and the predicted labels.
conf_matrix = confusion_matrix(Y_test_class, pred_class, labels=list(range(len(class_labels))))

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.title('Confusion Matrix')
plt.show()

# LSTM

In [None]:
# Unidirectional LSTM baseline: embedding -> LSTM(100 units) -> 4-way softmax.
sequentialModel = Sequential()
sequentialModel.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
sequentialModel.add(LSTM(100))  # 100 = number of units in the LSTM layer
# The targets are one-hot vectors over four mutually exclusive classes, so the
# head has to be softmax + categorical cross-entropy. Sigmoid with
# binary_crossentropy treats every class as an independent yes/no problem and
# makes the reported 'accuracy' a per-output binary accuracy.
sequentialModel.add(Dense(4, activation='softmax'))
sequentialModel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; no print() needed.
sequentialModel.summary()

sequentialModel.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=10, batch_size=64)

In [None]:
# Per-class scores of the plain LSTM for the evaluation sequences.
lstm_scores = sequentialModel.predict(X_test)

# 0/1 mask of the scores above 0.5 (name and meaning kept from before).
y_pred = np.where(lstm_scores > 0.5, 1, 0)
# Average number of classes per sample whose score exceeds 0.5 - this is a
# confidence indicator, not an accuracy.
print(np.sum(y_pred) / y_pred.shape[0])

# Actual accuracy, computed the same way as for the bidirectional model:
# most probable class versus the class encoded in the one-hot target.
lstm_accuracy = accuracy_score(np.argmax(Y_test, axis=1), np.argmax(lstm_scores, axis=1))
print(f'Accuracy: {lstm_accuracy * 100:.2f}%')

# Save weights

In [None]:
# Persist the weights of the bidirectional model (sequentialModel1) to Drive;
# the plain LSTM (sequentialModel) is not saved here.
weights_path = '/content/gdrive/My Drive/Projekt_MIO/model_weights.h5'
sequentialModel1.save_weights(weights_path)
print("Model weights saved to model_weights.h5")