# Import libraries

In [1]:
import nltk
import stanza
import ast
from afinn import Afinn
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.corpus import verbnet as vn
from nltk.corpus import opinion_lexicon
from nltk.wsd import lesk
from nltk.corpus import wordnet
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import openpyxl

# Preprocessed Data Loading

In [2]:
# Load the data
column_names = ["Sentence", "Label", "tokens_pos", "entities", "senses", "dependencies", "swn_scores", "afinn_score", "subj_scores", "final_sentiment", "negations", "final_sentiment_adj"]

df_train_preprocessed = pd.read_csv('C:/Users/Anastasiia Belkina/MANNHEIM/MASTER_THESIS_CODE/Rule-Based Classifier/df_train_shuffled.txt', sep='\t', names=column_names)
df_valid_preprocessed = pd.read_csv('C:/Users/Anastasiia Belkina/MANNHEIM/MASTER_THESIS_CODE/Rule-Based Classifier/df_valid_shuffled.txt', sep='\t', names=column_names)

# Remove leading and trailing spaces in the "Sentence" column
df_train_preprocessed['Sentence'] = df_train_preprocessed['Sentence'].str.strip()
df_valid_preprocessed['Sentence'] = df_valid_preprocessed['Sentence'].str.strip()

# Delete columns "subj_scores", "final_sentiment", "negations", "final_sentiment_adj"
df_train_ready = df_train_preprocessed.drop(columns = ["swn_scores", "afinn_score", "subj_scores", "final_sentiment", "negations", "final_sentiment_adj"])
df_valid_ready = df_valid_preprocessed.drop(columns = ["swn_scores", "afinn_score", "subj_scores", "final_sentiment", "negations", "final_sentiment_adj"])

# Shuffle the data
#df_train_ready = df_train_preprocessed.sample(frac=1).reset_index(drop=True)
#df_valid_ready = df_valid_preprocessed.sample(frac=1).reset_index(drop=True)

# Merging Labels

In [3]:
# Mapping dictionary
label_mapping = {2: 1, 3: 2, 4: 2}

# 0 - neutral, 1 - positive, 2 - negative

df_train_ready_merged = df_train_ready
df_valid_ready_merged = df_valid_ready

# Apply the mapping to the 'Label' column
df_train_ready_merged['Label'] = df_train_ready_merged['Label'].replace(label_mapping)
df_valid_ready_merged['Label'] = df_valid_ready_merged['Label'].replace(label_mapping)

# Turning strings back to lists and tuples

In [4]:
def convert_to_list(dependencies_str):
    # Check if it's a string and if it appears to be in the list of tuples format
    if isinstance(dependencies_str, str) and dependencies_str.startswith("[") and dependencies_str.endswith("]"):
        try:
            # Convert string representation of list back to actual list of tuples
            return ast.literal_eval(dependencies_str)
        except (ValueError, SyntaxError) as e:
            print(f"Error parsing: {dependencies_str}")
            raise e
    elif isinstance(dependencies_str, list):
        # If it's already a list, return as is
        return dependencies_str
    else:
        # If it's another unexpected type, return as is or handle appropriately
        return dependencies_str

def get_sense(tokens):
    #print(tokens)
    senses = []
    for item in tokens:
        #print(item)
        if isinstance(item, tuple) and len(item) == 2:
            token, pos = item
            sense = lesk([token], token)
            senses.append((token, sense))
        else:
            # Handle cases where the token doesn't meet the expected structure
            print(f"Unexpected format: {item}")
            senses.append((item, None))
    return senses

In [5]:
# Apply the function to your datasets
df_train_ready_merged['dependencies'] = df_train_ready_merged['dependencies'].apply(convert_to_list)
df_valid_ready_merged['dependencies'] = df_valid_ready_merged['dependencies'].apply(convert_to_list)
df_train_ready_merged['tokens_pos'] = df_train_ready_merged['tokens_pos'].apply(convert_to_list)
df_valid_ready_merged['tokens_pos'] = df_valid_ready_merged['tokens_pos'].apply(convert_to_list)
df_train_ready_merged['entities'] = df_train_ready_merged['entities'].apply(convert_to_list)
df_valid_ready_merged['entities'] = df_valid_ready_merged['entities'].apply(convert_to_list)
df_train_ready_merged['senses'] = df_train_ready_merged['tokens_pos'].apply(get_sense)
df_valid_ready_merged['senses'] = df_valid_ready_merged['tokens_pos'].apply(get_sense)

# Making a small set for tests

In [6]:
df_train_ready_merged_small = df_train_ready_merged.head(10)
df_valid_ready_merged_small = df_valid_ready_merged.head(10)

# Following the Modified Algorithm of Blame/Praise Identification

## Older versions

In [7]:
"""
    # Step 2: Process root verbs
    for i, root in enumerate(roots):
        root_verb, root_index = root
        #print(root_verb, root_index)
        # Check if the root verb is valid (foreseeability and not coercion)
        for token, pos in tokens_pos:
            #print(token, pos)
            if token == root_verb and pos == 'VERB':
                #print("Found VERB")
                if is_foreseeability_verb(root_verb) and not is_coercion_verb(root_verb):
                    #print("F and not C") 
                    root_verbs.append((root_verb, i+1))
        # Check for any conj attached to this root verb and add it if valid
        for j, conj in enumerate(dependencies):
            conj_word, conj_head, conj_rel = conj
            if conj_head == root_index and conj_rel == 'conj':
                for token_conj, pos_conj in tokens_pos:
                    if token_conj == conj_word and 'VERB' in pos_conj:
                        if is_foreseeability_verb(conj_word) and not is_coercion_verb(conj_word):
                            root_verbs.append((conj_word, j+1))
    
    #print(root_verbs)
"""


'\n    # Step 2: Process root verbs\n    for i, root in enumerate(roots):\n        root_verb, root_index = root\n        #print(root_verb, root_index)\n        # Check if the root verb is valid (foreseeability and not coercion)\n        for token, pos in tokens_pos:\n            #print(token, pos)\n            if token == root_verb and pos == \'VERB\':\n                #print("Found VERB")\n                if is_foreseeability_verb(root_verb) and not is_coercion_verb(root_verb):\n                    #print("F and not C") \n                    root_verbs.append((root_verb, i+1))\n        # Check for any conj attached to this root verb and add it if valid\n        for j, conj in enumerate(dependencies):\n            conj_word, conj_head, conj_rel = conj\n            if conj_head == root_index and conj_rel == \'conj\':\n                for token_conj, pos_conj in tokens_pos:\n                    if token_conj == conj_word and \'VERB\' in pos_conj:\n                        if is_foreseeabi

In [8]:
'''
# The main function to process each sentence
def find_valid_verbs(row):
    print("NEW ROW")
    
    dependencies = row['dependencies']
    tokens_pos = row['tokens_pos']

    counter_i = 0
    
    # Lists to store categorized verbs
    root_verbs = []
    xcomp_verbs = []
    ccomp_verbs = []
    parataxis_verbs = []
    advcl_verbs = []
    
    # Step 1: Identify all root verbs and their indices
    #roots = []
    for i, dep in enumerate(dependencies):
        if len(dep) == 3:
            word, head, deprel = dep
            if deprel == 'root':
                #roots.append((word, i + 1))  # Save the root verb and its index (i+1)
                for token, pos in tokens_pos:
                    if token == word and pos == 'VERB':
                        #print("Found VERB")
                        if is_foreseeability_verb(word) and not is_coercion_verb(word):
                            #print("F and not C") 
                            root_verbs.append((word, i+1))
                # Check for any conj attached to this root verb and add it if valid
                for j, conj in enumerate(dependencies):
                    conj_word, conj_head, conj_rel = conj
                    if conj_head == i+1 and conj_rel == 'conj':
                        for token_conj, pos_conj in tokens_pos:
                            if token_conj == conj_word and 'VERB' in pos_conj:
                                if is_foreseeability_verb(conj_word) and not is_coercion_verb(conj_word):
                                    root_verbs.append((conj_word, j+1))
    #print(root_verbs)

    
    # TILL HERE WORKS 
    
    # ВОПРОС С БУМАЖКИ
    
    # Step 3: Process each root verb and find related tags (xcomp, ccomp, conj, parataxis, advcl)
    for root_verb, root_index in root_verbs:
        for i, dep in enumerate(dependencies):
            if len(dep) == 3:
                dep_word, dep_head, dep_rel = dep
                
                # Handle each type of relation (xcomp, ccomp, parataxis, advcl)
                #print(dep_head, root_index, dep_rel)
                if dep_head == root_index and dep_rel in ['xcomp', 'ccomp', 'parataxis', 'advcl']:
                    #print("found one of xcomp, ccomp, parataxis, advcl")
                    #print(dep_word, dep_head, dep_rel)
                    for token, pos in tokens_pos:
                        if token == dep_word and 'VERB' in pos:
                            #print(token, pos)
                            if is_foreseeability_verb(dep_word) and not is_coercion_verb(dep_word):
                                # Add to the appropriate list based on dep_rel
                                if dep_rel == 'xcomp':
                                    xcomp_verbs.append(dep_word)
                                elif dep_rel == 'ccomp':
                                    ccomp_verbs.append(dep_word)
                                elif dep_rel == 'parataxis':
                                    parataxis_verbs.append(dep_word)
                                elif dep_rel == 'advcl':
                                    advcl_verbs.append(dep_word)
                                
                                # Check for any conj attached to this verb and add it
                                for conj_word, conj_head, conj_rel in dependencies:
                                    if conj_head == i + 1 and conj_rel == 'conj':  # Look for conj attached to the current word
                                        # Check if the conj word is a verb and passes the checks
                                        for token, pos in tokens_pos:
                                            if token == conj_word and 'VERB' in pos:
                                                if is_foreseeability_verb(conj_word) and not is_coercion_verb(conj_word):
                                                    if dep_rel == 'xcomp':
                                                        xcomp_verbs.append(conj_word)
                                                    elif dep_rel == 'ccomp':
                                                        ccomp_verbs.append(conj_word)
                                                    elif dep_rel == 'parataxis':
                                                        parataxis_verbs.append(conj_word)
                                                    elif dep_rel == 'advcl':
                                                        advcl_verbs.append(conj_word)
                                #break
    
    # Step 4: If any of the verb lists are not empty, assign to related category
    if root_verbs or xcomp_verbs or ccomp_verbs or parataxis_verbs or advcl_verbs:
        print(root_verbs)
        print(xcomp_verbs)
        print(ccomp_verbs)
        print(parataxis_verbs)
        print(advcl_verbs)
        return 'related'
    else:
        return 'others'
'''

'\n# The main function to process each sentence\ndef find_valid_verbs(row):\n    print("NEW ROW")\n    \n    dependencies = row[\'dependencies\']\n    tokens_pos = row[\'tokens_pos\']\n\n    counter_i = 0\n    \n    # Lists to store categorized verbs\n    root_verbs = []\n    xcomp_verbs = []\n    ccomp_verbs = []\n    parataxis_verbs = []\n    advcl_verbs = []\n    \n    # Step 1: Identify all root verbs and their indices\n    #roots = []\n    for i, dep in enumerate(dependencies):\n        if len(dep) == 3:\n            word, head, deprel = dep\n            if deprel == \'root\':\n                #roots.append((word, i + 1))  # Save the root verb and its index (i+1)\n                for token, pos in tokens_pos:\n                    if token == word and pos == \'VERB\':\n                        #print("Found VERB")\n                        if is_foreseeability_verb(word) and not is_coercion_verb(word):\n                            #print("F and not C") \n                            r

In [9]:
'''
# TRYING WITH COUNTER

# The main function to process each sentence
def find_valid_verbs(row):
    print("NEW ROW")
    
    dependencies = row['dependencies']
    tokens_pos = row['tokens_pos']

    counter_i = 0
    counter_j = 0
    
    # Lists to store categorized verbs
    roots = []
    root_verbs = []
    xcomp_verbs = []
    ccomp_verbs = []
    parataxis_verbs = []
    advcl_verbs = []
    
    # Step 1: Identify all root verbs and their indices
    #roots = []
    for dep in dependencies:
        if len(dep) == 3:
            word, head, deprel = dep
            counter_i = counter_i + 1
            #print(word, counter_i)
            if deprel == 'punct' and (word == "." or word == ":") and head == roots[0][1]:
                #print("Update Counter I")
                counter_i = 0
                
            if deprel == 'root':
                #roots.append((word, i + 1))  # Save the root verb and its index (i+1)
                roots.append((word, counter_i))
                for token, pos in tokens_pos:
                    if token == word and pos == 'VERB':
                        #print("Found VERB")
                        if is_foreseeability_verb(word) and not is_coercion_verb(word):
                            #print("F and not C") 
                            root_verbs.append((word, counter_i))

                counter_j = 0
                # Check for any conj attached to this root verb and add it if valid
                for conj in dependencies:
                    if len(conj) == 3:
                        conj_word, conj_head, conj_rel = conj
                        counter_j = counter_j +1
                        #print(conj_word, counter_j)
                        if conj_rel == 'punct' and (conj_word == "." or conj_word == ":") and conj_head == roots[0][1]:
                            #print("Update Counter J")
                            counter_j = 0
                        if conj_head == counter_i and conj_rel == 'conj':
                            for token_conj, pos_conj in tokens_pos:
                                if token_conj == conj_word and 'VERB' in pos_conj:
                                    if is_foreseeability_verb(conj_word) and not is_coercion_verb(conj_word):
                                        root_verbs.append((conj_word, counter_j))
                    
                    
        
    #print(root_verbs)

    
    # TILL HERE WORKS 
    
    # ВОПРОС С БУМАЖКИ
    
    # Step 3: Process each root verb and find related tags (xcomp, ccomp, conj, parataxis, advcl)
    for root_verb, root_index in root_verbs:
        for i, dep in enumerate(dependencies):
            if len(dep) == 3:
                dep_word, dep_head, dep_rel = dep
                
                # Handle each type of relation (xcomp, ccomp, parataxis, advcl)
                #print(dep_head, root_index, dep_rel)
                if dep_head == root_index and dep_rel in ['xcomp', 'ccomp', 'parataxis', 'advcl']:
                    #print("found one of xcomp, ccomp, parataxis, advcl")
                    #print(dep_word, dep_head, dep_rel)
                    for token, pos in tokens_pos:
                        if token == dep_word and 'VERB' in pos:
                            #print(token, pos)
                            if is_foreseeability_verb(dep_word) and not is_coercion_verb(dep_word):
                                # Add to the appropriate list based on dep_rel
                                if dep_rel == 'xcomp':
                                    xcomp_verbs.append(dep_word)
                                elif dep_rel == 'ccomp':
                                    ccomp_verbs.append(dep_word)
                                elif dep_rel == 'parataxis':
                                    parataxis_verbs.append(dep_word)
                                elif dep_rel == 'advcl':
                                    advcl_verbs.append(dep_word)
                                
                                # Check for any conj attached to this verb and add it
                                for conj_word, conj_head, conj_rel in dependencies:
                                    if conj_head == i + 1 and conj_rel == 'conj':  # Look for conj attached to the current word
                                        # Check if the conj word is a verb and passes the checks
                                        for token, pos in tokens_pos:
                                            if token == conj_word and 'VERB' in pos:
                                                if is_foreseeability_verb(conj_word) and not is_coercion_verb(conj_word):
                                                    if dep_rel == 'xcomp':
                                                        xcomp_verbs.append(conj_word)
                                                    elif dep_rel == 'ccomp':
                                                        ccomp_verbs.append(conj_word)
                                                    elif dep_rel == 'parataxis':
                                                        parataxis_verbs.append(conj_word)
                                                    elif dep_rel == 'advcl':
                                                        advcl_verbs.append(conj_word)
                                #break
    
    # Step 4: If any of the verb lists are not empty, assign to related category
    if root_verbs or xcomp_verbs or ccomp_verbs or parataxis_verbs or advcl_verbs:
        print(roots)
        print(root_verbs)
        print(xcomp_verbs)
        print(ccomp_verbs)
        print(parataxis_verbs)
        print(advcl_verbs)
        return 'related'
    else:
        return 'others'
'''

'\n# TRYING WITH COUNTER\n\n# The main function to process each sentence\ndef find_valid_verbs(row):\n    print("NEW ROW")\n    \n    dependencies = row[\'dependencies\']\n    tokens_pos = row[\'tokens_pos\']\n\n    counter_i = 0\n    counter_j = 0\n    \n    # Lists to store categorized verbs\n    roots = []\n    root_verbs = []\n    xcomp_verbs = []\n    ccomp_verbs = []\n    parataxis_verbs = []\n    advcl_verbs = []\n    \n    # Step 1: Identify all root verbs and their indices\n    #roots = []\n    for dep in dependencies:\n        if len(dep) == 3:\n            word, head, deprel = dep\n            counter_i = counter_i + 1\n            #print(word, counter_i)\n            if deprel == \'punct\' and (word == "." or word == ":") and head == roots[0][1]:\n                #print("Update Counter I")\n                counter_i = 0\n                \n            if deprel == \'root\':\n                #roots.append((word, i + 1))  # Save the root verb and its index (i+1)\n             

## Current Version

In [10]:
# Define functions to check if a verb belongs to Foreseeability or Coercion groups

def is_foreseeability_verb(verb):
    #print("is_foreseeable_verb")
    # Synset categories indicating foreseeability
    foreseeability_classes = {'communication', 'creation', 'consumption', 'competition', 'possession', 'motion'}
    # Get the synsets for the verb
    synsets = wn.synsets(verb, pos=wn.VERB)
    for synset in synsets:
        # Check if the verb belongs to any of the foreseeability classes
        lexname = synset.lexname().split('.')[1]
        #if verb == "unleashed": print(synset.lexname())
        if lexname in foreseeability_classes:
            #print(synset)
            #print(lexname)
            return True
    return False

def is_coercion_verb(verb):
    #print("is_coercion_verb")
    coercion_classes = {'urge-58.1', 'force-59', 'forbid-67'}
    # Get the VerbNet classes for the verb
    synsets = wn.synsets(verb, pos=wn.VERB)
    for synset in synsets:
        lemma = synset.lemmas()[0]
        #print(synset)
        #print(lemma)
        vn_classes = lemma.key().split('%')[0]
        vn_class_ids = vn.classids(vn_classes)
        #if verb == "unleashed": print(verb, vn_classes, vn_class_ids)
        # Check if any VerbNet class matches the coercion classes
        if any(vn_class in coercion_classes for vn_class in vn_class_ids):
            return True
    return False

In [11]:
# TRYING WITH all tags in one for cycle

# The main function to process each sentence
def find_valid_verbs(row):
       
    dependencies = row['dependencies']
    tokens_pos = row['tokens_pos']

    counter_i = 0
    counter_j = 0
    counter_x = 0
    
    # Lists to store categorized verbs
    roots = []
    root_verbs = []
    xcomp_verbs = []
    ccomp_verbs = []
    parataxis_verbs = []
    advcl_verbs = []
    
    # Step 1: Identify all root verbs and their indices
    for dep in dependencies:
        if len(dep) == 3:
            word, head, deprel = dep
            counter_i = counter_i + 1
            #print(word, counter_i)
            
            if roots:
                if deprel == 'punct' and (word == "." or word == ":") and head == roots[0][1]:
                    #print("Update Counter I")
                    counter_i = 0
                
            if deprel == 'root':
                #roots.append((word, i + 1))  # Save the root verb and its index (i+1)
                roots.append((word, counter_i))
                for token, pos in tokens_pos:
                    if token == word and pos == 'VERB':
                        #print("Found VERB")
                        if is_foreseeability_verb(word) and not is_coercion_verb(word):
                            #print("F and not C") 
                            root_verbs.append((word, counter_i))

                counter_j = 0
                # Check for any relater words attached to this root verb and add it if valid
                for related in dependencies:
                    if len(related) == 3:
                        related_word, related_head, related_rel = related
                        counter_j = counter_j +1
                        #print(related_word, counter_j)
                        if related_rel == 'punct' and (related_word == "." or related_word == ":") and related_head == roots[0][1]:
                            #print("Update Counter J")
                            counter_j = 0
                        if related_head == counter_i and related_rel in ['xcomp', 'ccomp', 'parataxis', 'advcl', 'conj']:
                            #print("FOUND RELATED: ", related_rel, " - ", related_word)
                            for token_related, pos_related in tokens_pos:
                                if token_related == related_word and 'VERB' in pos_related:
                                    if is_foreseeability_verb(related_word) and not is_coercion_verb(related_word):
                                        #print("RELATED WORD ", related_word, " PASSED ALL CHECKS")
                                        
                                        if related_rel == 'conj':
                                            root_verbs.append((related_word, counter_j))
                                            
                                        elif related_rel == 'xcomp':
                                            xcomp_verbs.append((related_word, counter_j))
                                            # найти conj для этого
                                            counter_x = 0
                                            for conj in dependencies:
                                                if len(conj) == 3:
                                                    conj_word, conj_head, conj_rel = conj
                                                    counter_x = counter_x +1
                                                    if conj_rel == 'punct' and (conj_word == "." or conj_word == ":") and conj_head == roots[0][1]:
                                                        #print("Update Counter J")
                                                        counter_x = 0
                                                    if conj_head == counter_j and conj_rel == 'conj':
                                                        for token_related, pos_related in tokens_pos:
                                                            if token_related == related_word and 'VERB' in pos_related:
                                                                if is_foreseeability_verb(related_word) and not is_coercion_verb(related_word):
                                                                    xcomp_verbs.append((conj_word, counter_x))
                                        
                                        elif related_rel == 'ccomp':
                                            ccomp_verbs.append((related_word, counter_j))
                                            # найти conj для этого
                                            counter_x = 0
                                            for conj in dependencies:
                                                if len(conj) == 3:
                                                    conj_word, conj_head, conj_rel = conj
                                                    counter_x = counter_x +1
                                                    if conj_rel == 'punct' and (conj_word == "." or conj_word == ":") and conj_head == roots[0][1]:
                                                        #print("Update Counter J")
                                                        counter_x = 0
                                                    if conj_head == counter_j and conj_rel == 'conj':
                                                        for token_related, pos_related in tokens_pos:
                                                            if token_related == related_word and 'VERB' in pos_related:
                                                                if is_foreseeability_verb(related_word) and not is_coercion_verb(related_word):
                                                                    ccomp_verbs.append((conj_word, counter_x))
                                        
                                        elif related_rel == 'parataxis':
                                            parataxis_verbs.append((related_word, counter_j))
                                            # найти conj для этого
                                            counter_x = 0
                                            for conj in dependencies:
                                                if len(conj) == 3:
                                                    conj_word, conj_head, conj_rel = conj
                                                    counter_x = counter_x +1
                                                    if conj_rel == 'punct' and (conj_word == "." or conj_word == ":") and conj_head == roots[0][1]:
                                                        #print("Update Counter J")
                                                        counter_x = 0
                                                    if conj_head == counter_j and conj_rel == 'conj':
                                                        for token_related, pos_related in tokens_pos:
                                                            if token_related == related_word and 'VERB' in pos_related:
                                                                if is_foreseeability_verb(related_word) and not is_coercion_verb(related_word):
                                                                    parataxis_verbs.append((conj_word, counter_x))
                                        
                                        elif related_rel == 'advcl':
                                            advcl_verbs.append((related_word, counter_j))
                                            # найти conj для этого
                                            counter_x = 0
                                            for conj in dependencies:
                                                if len(conj) == 3:
                                                    conj_word, conj_head, conj_rel = conj
                                                    counter_x = counter_x +1
                                                    if conj_rel == 'punct' and (conj_word == "." or conj_word == ":") and conj_head == roots[0][1]:
                                                        #print("Update Counter J")
                                                        counter_x = 0
                                                    if conj_head == counter_j and conj_rel == 'conj':
                                                        for token_related, pos_related in tokens_pos:
                                                            if token_related == related_word and 'VERB' in pos_related:
                                                                if is_foreseeability_verb(related_word) and not is_coercion_verb(related_word):
                                                                    advcl_verbs.append((conj_word, counter_x))
    

    #print("NEW ROW")
    # Step 4: If any of the verb lists are not empty, assign to related category
    if root_verbs or xcomp_verbs or ccomp_verbs or parataxis_verbs or advcl_verbs:
        #print(roots, " - roots")
        #print(root_verbs, " - root_verbs")
        #print(xcomp_verbs, " - xcomp_verbs")
        #print(ccomp_verbs, " - ccomp_verbs")
        #print(parataxis_verbs, " - parataxis_verbs")
        #print(advcl_verbs, " - advcl_verbs")
        #print()
        return 'related'
    else:
        return 0

In [12]:
df_train_ready_merged_small['Final_Result'] = df_train_ready_merged_small.apply(find_valid_verbs, axis=1)
df_train_ready_merged_small = df_train_ready_merged_small[['Sentence', 'Label', 'Final_Result'] + [col for col in df_train_ready_merged_small.columns if col not in ['Sentence', 'Label', 'Final_Result']]]
df_valid_ready_merged_small['Final_Result'] = df_valid_ready_merged_small.apply(find_valid_verbs, axis=1)
df_valid_ready_merged_small = df_train_ready_merged_small[['Sentence', 'Label', 'Final_Result'] + [col for col in df_train_ready_merged_small.columns if col not in ['Sentence', 'Label', 'Final_Result']]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_ready_merged_small['Final_Result'] = df_train_ready_merged_small.apply(find_valid_verbs, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_valid_ready_merged_small['Final_Result'] = df_valid_ready_merged_small.apply(find_valid_verbs, axis=1)


In [13]:
df_train_ready_merged_small

Unnamed: 0,Sentence,Label,Final_Result,tokens_pos,entities,senses,dependencies
0,George is not supporting Clinton.,2,related,"[(George, PROPN), (is, AUX), (not, PART), (sup...","[(George, PERSON), (Clinton, PERSON)]","[(George, Synset('george.n.05')), (is, Synset(...","[(George, 4, nsubj), (is, 4, aux), (not, 4, ad..."
1,Ryan has endorsed Trump and told reporters thi...,1,related,"[(Ryan, PROPN), (has, AUX), (endorsed, VERB), ...","[(Ryan, PERSON), (Trump, PERSON), (this past w...","[(Ryan, None), (has, Synset('take.v.35')), (en...","[(Ryan, 3, nsubj), (has, 3, aux), (endorsed, 0..."
2,"John McGraw, 78, was charged with assault and ...",2,related,"[(John, PROPN), (McGraw, PROPN), (,, PUNCT), (...","[(John McGraw, PERSON), (78, DATE), (Thursday,...","[(John, Synset('john.n.02')), (McGraw, Synset(...","[(John, 7, nsubj:pass), (McGraw, 1, flat), (,,..."
3,The Filipino fighter unleashed a dazzling comb...,1,related,"[(The, DET), (Filipino, ADJ), (fighter, NOUN),...","[(Filipino, NORP), (Margarito, PERSON), (Maywe...","[(The, None), (Filipino, Synset('philippine.n....","[(The, 3, det), (Filipino, 3, amod), (fighter,..."
4,But the Marlins have failed to make the postse...,0,related,"[(But, CCONJ), (the, DET), (Marlins, PROPN), (...","[(Marlins, ORG), (Loria, PERSON)]","[(But, Synset('merely.r.01')), (the, None), (M...","[(But, 5, cc), (the, 3, det), (Marlins, 5, nsu..."
5,"So shortly after 2 a.m., campaign chairman Joh...",0,related,"[(So, ADV), (shortly, ADV), (after, ADP), (2, ...","[(2 a.m., TIME), (John Podesta, PERSON), (Clin...","[(So, Synset('thus.r.02')), (shortly, Synset('...","[(So, 11, advmod), (shortly, 4, advmod), (afte..."
6,The Church has not answered the allegations ot...,0,related,"[(The, DET), (Church, PROPN), (has, AUX), (not...","[(Church, ORG), (Navajo, GPE)]","[(The, None), (Church, Synset('church_service....","[(The, 2, det), (Church, 5, nsubj), (has, 5, a..."
7,"Bruno Beschizza, the conservative mayor of has...",0,related,"[(Bruno, PROPN), (Beschizza, PROPN), (,, PUNCT...","[(Bruno Beschizza, PERSON), (Théo, PERSON)]","[(Bruno, Synset('leo_ix.n.01')), (Beschizza, N...","[(Bruno, 9, nsubj), (Beschizza, 1, flat), (,, ..."
8,"On criminal justice issues, Harris faults Carp...",2,related,"[(On, ADP), (criminal, ADJ), (justice, NOUN), ...","[(Harris, PERSON), (Carper, PERSON), (the 1990...","[(On, Synset('on.r.03')), (criminal, Synset('c...","[(On, 4, case), (criminal, 3, amod), (justice,..."
9,Even the hologame that Finn starts up on the s...,0,related,"[(Even, ADV), (the, DET), (hologame, NOUN), (t...","[(Finn, PERSON), (Chewbacca, PERSON), (A New H...","[(Even, Synset('tied.s.05')), (the, None), (ho...","[(Even, 3, advmod), (the, 3, det), (hologame, ..."


In [14]:
# Apply the function to the dataset
df_train_ready_merged['Final_Result'] = df_train_ready_merged.apply(find_valid_verbs, axis=1)
df_train_ready_merged = df_train_ready_merged[['Sentence', 'Label', 'Final_Result'] + [col for col in df_train_ready_merged_small.columns if col not in ['Sentence', 'Label', 'Final_Result']]]
df_valid_ready_merged['Final_Result'] = df_valid_ready_merged.apply(find_valid_verbs, axis=1)
df_valid_ready_merged = df_valid_ready_merged[['Sentence', 'Label', 'Final_Result'] + [col for col in df_train_ready_merged_small.columns if col not in ['Sentence', 'Label', 'Final_Result']]]

In [18]:
df_train_ready_merged['Final_Result'].value_counts()

Final_Result
related    4288
0           744
Name: count, dtype: int64

In [19]:
df_valid_ready_merged['Final_Result'].value_counts()

Final_Result
related    465
0           85
Name: count, dtype: int64

# ТЕПЕРЬ НАДО РЕАЛИЗОВАТЬ СВЯЗИ РАЗНЫХ ТИПОВ И ИСКАТЬ ОБЪЕКТЫ И АГЕНТОВ

In [6]:
afinn = Afinn()

def get_event_polarity(verb, obj):
    # Create a simple context for WSD
    context = f"{verb} {obj}"
    
    # Word Sense Disambiguation for the verb and object
    verb_sense = lesk(context.split(), verb, 'v')
    obj_sense = lesk(context.split(), obj, 'n')
    
    # Calculate polarity using SentiWordNet
    pos_score = 0
    neg_score = 0
    
    if verb_sense:
        swn_verb = swn.senti_synset(verb_sense.name())
        pos_score += swn_verb.pos_score()
        neg_score += swn_verb.neg_score()
    
    if obj_sense:
        swn_obj = swn.senti_synset(obj_sense.name())
        pos_score += swn_obj.pos_score()
        neg_score += swn_obj.neg_score()

    # AFINN score
    afinn_score = afinn.score(context)
    if afinn_score > 0:
        pos_score += afinn_score
    else:
        neg_score += abs(afinn_score)

    # Subjectivity Lexicon score
    tokens = context.split()
    subj_pos = sum([1 for token in tokens if token in opinion_lexicon.positive()])
    subj_neg = sum([1 for token in tokens if token in opinion_lexicon.negative()])
    
    pos_score += subj_pos
    neg_score += subj_neg

    # Determine final polarity
    if pos_score > neg_score:
        return '1'  # Positive/Praise
    elif neg_score > pos_score:
        return '2'  # Negative/Blame
    else:
        return '0'  # Neutral

In [7]:
def extract_events_and_agents(dependencies):
    
    events = []
    verbs = []
    
    # Step 1: Identify all verbs that are 'root'
    for i, dep in enumerate(dependencies):
        if len(dep) == 3:
            word, head, deprel = dep
            if deprel == 'root':
                verbs.append((word, i + 1))  # Save the verb and its index (i+1)
    
    # Step 2: For each identified verb, find associated subjects and objects
    for verb, verb_index in verbs:
        subject = None
        obj = None

        for word, head, deprel in dependencies:
            if head == verb_index:  # Compare with the index of the verb, not its head
                if deprel in ['nsubj', 'nsubj:pass']:  # Subject of the verb
                    subject = word
                if deprel in ['obj', 'dobj']:  # Object of the verb
                    obj = word
        
        if subject and obj:
            # Calculate polarity specifically for this verb-object pair
            polarity = get_event_polarity(verb, obj)
            events.append({
                'verb': verb,
                'object': obj,
                'agent': subject,
                'polarity': polarity
            })
    
    return events


In [8]:
# Apply the function to your datasets
df_train_ready_merged['events'] = df_train_ready_merged['dependencies'].apply(extract_events_and_agents)
df_valid_ready_merged['events'] = df_valid_ready_merged['dependencies'].apply(extract_events_and_agents)

## Label rows without events "others" in the column Final_Result

In [9]:
def add_final_result_column(df):
    # Insert 'Final_Result' column after 'Label' with default value None
    label_index = df.columns.get_loc('Label')
    df.insert(label_index + 1, 'Final_Result', None)
    
    # Update 'Final_Result' to 0 where 'events' column is empty
    df.loc[df['events'].apply(lambda x: not x), 'Final_Result'] = "others"

# Apply the function to both dataframes
add_final_result_column(df_train_ready_merged)
add_final_result_column(df_valid_ready_merged)

## Step 2. Agent Causality

"Here, one must establish that a moral agent caused an event. We first make use of a popular explicit intra-sentential pattern for causation expression which is “NP verb NP” where NP is a noun phrase (Girju, 2003) and then we identify the agent within the noun phrase. If the intrasentential pattern is not found we consider verbs in the text that belong to the CAUSE class and the CAUSETO semantic relation which are defined in the WordNet. In order for “Agent Causality” taking the value “True”, the agent must be a person entity (including pronouns)."

In [10]:
def check_causative_verb(verb):
    # Check if the verb is in the CAUSE class or has CAUSETO relation in WordNet
    for synset in wn.synsets(verb, pos=wn.VERB):
        if 'cause' in synset.lemma_names():
            return True
        for lemma in synset.lemmas():
            for frame in lemma.frame_strings():
                if 'CAUSE' in frame or 'CAUSETO' in frame:
                    return True
    return False

In [55]:
def identify_agent_causality(row):
    dependencies = row['dependencies']
    entities = row['entities']
    agent_causality = False
    verbs = []

    # Step 1: Identify all verbs that are 'root'
    for i, dep in enumerate(dependencies):
        if len(dep) == 3:
            word, head, deprel = dep
            if deprel == 'root':
                verbs.append((word, i + 1))  # Save the verb and its index (i+1)
    
    # Step 2: For each identified verb, find associated subjects and objects
    for verb, verb_index in verbs:
        subject = None
        obj = None

        for word, head, deprel in dependencies:
            if head == verb_index:  # Compare with the index of the verb, not its head
                if deprel in ['nsubj', 'nsubjpass']:  # Subject of the verb
                    subject = word
                if deprel in ['obj', 'dobj']:  # Object of the verb
                    obj = word
        
        # Step 3: Validate the agent (subject)
        agent_is_valid = False
        if subject:
            for entity, label in entities:
                if entity == subject and label in ['PERSON', 'ORG', 'GPE']:
                    agent_is_valid = True
                    break
            
            if not agent_is_valid and 'PRP' in [pos for token, pos in row['tokens_pos'] if token == subject]:
                agent_is_valid = True  # It's a pronoun

        # Step 4: If both subject and object are found and agent is valid, we have causality
        if subject and obj and agent_is_valid:
            agent_causality = True
            break
    
    # Step 5: If no "NP verb NP" pattern is found, check for causative verbs
    if not agent_causality:
        for verb, verb_index in verbs:
            if check_causative_verb(verb):
                agent_causality = True
                break
    
    return agent_causality

In [12]:
df_train_ready_merged['agent_causality'] = df_train_ready_merged.apply(identify_agent_causality, axis=1)
df_valid_ready_merged['agent_causality'] = df_valid_ready_merged.apply(identify_agent_causality, axis=1)

## Label rows without agent causality "others" in the column Final_Result

In [13]:
# Update Final_Result to 0 where agent_causality is False in df_train_ready_merged
df_train_ready_merged.loc[df_train_ready_merged['agent_causality'] == False, 'Final_Result'] = "others"

# Update Final_Result to 0 where agent_causality is False in df_valid_ready_merged
df_valid_ready_merged.loc[df_valid_ready_merged['agent_causality'] == False, 'Final_Result'] = "others"

## Step 3. Foreseeability

"Foreseeability. We rely on a set of verbs which indicate foreseeability. These include verbs of communication as suggested in (Mao et al, 2011) and other verb classes which include verbs of creation, verbs of consumption, verbs of competition, verbs of possession and verbs of motion. These classes of verbs are defined in the WordNet7 and can be identified by looking at the WordNet sensekey of the verbs. Example: When I did not speak the truth. In the example above, the communication verb “speak” indicates that the subject “I” had foreknowledge of the event of “speaking the truth”."

In [14]:
def determine_foreseeability(row):
    dependencies = row['dependencies']
    foreseeable = False
    
    # Identify all verbs in the sentence
    verbs = [word for word, head, deprel in dependencies if deprel == 'root']

    # Check if any verb indicates foreseeability
    for verb in verbs:
        if is_foreseeable_verb(verb):
            foreseeable = True
            break
    
    return foreseeable

In [15]:
# Define a function to check if a verb belongs to any foreseeability-related classes
def is_foreseeable_verb(verb):
    #print("NEW")
    #print(verb)
    
    # Synset categories indicating foreseeability
    foreseeability_classes = {
        'communication', 'creation', 'consumption', 'competition', 'possession', 'motion'
    }

    # Get the synsets for the verb
    synsets = wn.synsets(verb, pos=wn.VERB)
    for synset in synsets:
        # Check if the verb belongs to any of the foreseeability classes
        lexname = synset.lexname().split('.')[1]
        if lexname in foreseeability_classes:
            #print(synset.lexname())
            #print(synset)
            #print(lexname)
            return True
    return False

In [16]:
df_train_ready_merged['foreseeability'] = df_train_ready_merged.apply(determine_foreseeability, axis=1)
df_valid_ready_merged['foreseeability'] = df_valid_ready_merged.apply(determine_foreseeability, axis=1)

In [17]:
df_train_ready_merged["foreseeability"].value_counts()

foreseeability
True     4248
False     784
Name: count, dtype: int64

In [18]:
df_valid_ready_merged["foreseeability"].value_counts()

foreseeability
True     456
False     94
Name: count, dtype: int64

## Label rows without foreseeability "others" in the column Final_Result

In [19]:
# Update Final_Result to 0 where foreseeability is False in df_train_ready_merged
df_train_ready_merged.loc[df_train_ready_merged['foreseeability'] == False, 'Final_Result'] = "others"

# Update Final_Result to 0 where foreseeability is False in df_valid_ready_merged
df_valid_ready_merged.loc[df_valid_ready_merged['foreseeability'] == False, 'Final_Result'] = "others"

## Step 4. Coercion

"To identify coercion, we look at the extension verb classes presented in (Kipper et al, 2006) focusing on verbs in the URGE (13 members), FORCE (46 members) and FORBID (17 members) classes. Example: I was forced to quite the job in the city. In the example above , using word sense disambiguation, the verb “forced” is of sense “to cause to do through pressure or necessity, by physical, moral or intellectual means”. The agent “I” in this case did not willingly quite the job and the sentence does not mention who forced the agent. Thus, the sentence is classified as “Others” (i.e., no blame or praise)."

In [20]:
def determine_coercion(row):
    dependencies = row['dependencies']
    coercion = False
    
    # Identify all verbs in the sentence
    verbs = [word for word, head, deprel in dependencies if deprel == 'root']
    
    # Check if any verb indicates coercion
    for verb in verbs:
        if is_coercion_verb(verb):
            coercion = True
            break
    
    return coercion

In [21]:
# Define a function to check if a verb belongs to any coercion-related VerbNet classes
def is_coercion_verb(verb):
    coercion_classes = {
        'urge-58.1', 'force-59', 'forbid-67'
    }
    
    # Get the VerbNet classes for the verb
    synsets = wn.synsets(verb, pos=wn.VERB)
    for synset in synsets:
        #print(synset)
        lemma = synset.lemmas()[0]
        #print(lemma)
        vn_classes = lemma.key().split('%')[0]
        vn_class_ids = vn.classids(vn_classes)
        #print(vn_classes)
        #print(vn_class_ids)
        
        # Check if any VerbNet class matches the coercion classes
        if any(vn_class in coercion_classes for vn_class in vn_class_ids):
            return True
    
    return False

In [22]:
# Apply the function to both datasets
df_train_ready_merged['coercion'] = df_train_ready_merged.apply(determine_coercion, axis=1)
df_valid_ready_merged['coercion'] = df_valid_ready_merged.apply(determine_coercion, axis=1)

In [23]:
df_train_ready_merged["coercion"].value_counts()

coercion
False    4679
True      353
Name: count, dtype: int64

In [24]:
df_valid_ready_merged["coercion"].value_counts()

coercion
False    509
True      41
Name: count, dtype: int64

## Label rows with coercion "others" in the column Final_Result

In [25]:
# Update Final_Result to 0 where coercion is True in df_train_ready_merged
df_train_ready_merged.loc[df_train_ready_merged['coercion'] == True, 'Final_Result'] = "others"

# Update Final_Result to 0 where coercion is True in df_valid_ready_merged
df_valid_ready_merged.loc[df_valid_ready_merged['coercion'] == True, 'Final_Result'] = "others"

## Negation sucks - redo

In [26]:
def handle_negation(dependencies):
    negations = [word for word, head, deprel in dependencies if deprel == 'advmod' and (word == 'not' or word == 'n’t')]
    return negations

df_train_ready_merged['negations'] = df_train_ready_merged['dependencies'].apply(handle_negation)
df_valid_ready_merged['negations'] = df_valid_ready_merged['dependencies'].apply(handle_negation)

def adjust_sentiment_for_negation(row):
    if row['negations']:
        if row['final_sentiment'] == "positive":
            return "negative"
        elif row['final_sentiment'] == "negative":
            return "positive"
    return row['final_sentiment']

df_train_ready_merged['final_sentiment_adj'] = df_train_ready_merged.apply(adjust_sentiment_for_negation, axis=1)
df_valid_ready_merged['final_sentiment_adj'] = df_valid_ready_merged.apply(adjust_sentiment_for_negation, axis=1)

## Step 5. Final Classification (Blame/Praise/Others)

0 - neutral, 1 - positive, 2 - negative - Label column

In [27]:
df_train_ready_merged.head(1)

Unnamed: 0,Sentence,Label,Final_Result,tokens_pos,entities,senses,dependencies,swn_scores,afinn_score,subj_scores,final_sentiment,negations,final_sentiment_adj,events,agent_causality,foreseeability,coercion
0,George is not supporting Clinton.,2,,"[(George, PROPN), (is, AUX), (not, PART), (sup...","[(George, PERSON), (Clinton, PERSON)]","[(George, Synset('george.n.05')), (is, Synset(...","[(George, 4, nsubj), (is, 4, aux), (not, 4, ad...","(0.0, 0.75)",1.0,"(1, 0)",positive,[not],negative,"[{'verb': 'supporting', 'object': 'Clinton', '...",True,True,False


In [32]:
def classify_sentence(row):
    event_present = row['events'] is not None and len(row['events']) > 0
    agent_causality = row['agent_causality']
    foreseeability = row['foreseeability']
    coercion = row['coercion']
    final_sentiment = row['final_sentiment_adj']  # Assuming this is precomputed as positive, negative, or neutral
    final_result = row['Final_Result']
    
    if final_result == None:
        if agent_causality and foreseeability and not coercion:
            #print(row['events'][0].get('agent'))
            if final_sentiment == 'negative':
                if row['events'][0].get('agent') == "I":
                    #print(row['events'][0].get('agent'))
                    return "self-blame"
                else:
                    return "blame-others"
            elif final_sentiment == 'positive':
                if row['events'][0].get('agent') == "I":
                    #print(row['events'][0].get('agent'))
                    return "self-praise"
                else:
                    return "praise-others"
            if final_sentiment == 'neutral':
                return "others"
        else: return "others"
    else:
        return "others"
    

In [33]:
# Apply the classification to each row in the DataFrame
df_train_ready_merged['Final_Result'] = df_train_ready_merged.apply(classify_sentence, axis=1)
df_valid_ready_merged['Final_Result'] = df_valid_ready_merged.apply(classify_sentence, axis=1)

In [34]:
df_train_ready_merged.head(1)

Unnamed: 0,Sentence,Label,Final_Result,tokens_pos,entities,senses,dependencies,swn_scores,afinn_score,subj_scores,final_sentiment,negations,final_sentiment_adj,events,agent_causality,foreseeability,coercion
0,George is not supporting Clinton.,2,blame-others,"[(George, PROPN), (is, AUX), (not, PART), (sup...","[(George, PERSON), (Clinton, PERSON)]","[(George, Synset('george.n.05')), (is, Synset(...","[(George, 4, nsubj), (is, 4, aux), (not, 4, ad...","(0.0, 0.75)",1.0,"(1, 0)",positive,[not],negative,"[{'verb': 'supporting', 'object': 'Clinton', '...",True,True,False


In [35]:
df_train_ready_merged[['Sentence', 'Label', 'Final_Result']].head()

Unnamed: 0,Sentence,Label,Final_Result
0,George is not supporting Clinton.,2,blame-others
1,Ryan has endorsed Trump and told reporters thi...,1,praise-others
2,"John McGraw, 78, was charged with assault and ...",2,others
3,The Filipino fighter unleashed a dazzling comb...,1,others
4,But the Marlins have failed to make the postse...,0,others


In [36]:
df_train_ready_merged["Final_Result"].value_counts()

Final_Result
others           4512
blame-others      282
praise-others     238
Name: count, dtype: int64

In [37]:
df_valid_ready_merged[['Sentence', 'Label', 'Final_Result']].head()

Unnamed: 0,Sentence,Label,Final_Result
0,"Actress Patty Duke died on Tuesday at age 69, ...",0,others
1,"Who showrunner, Moffat, will give a masterclas...",0,others
2,The Patriots keeping Brady on the bench after ...,0,praise-others
3,Russia accounted for about 9 percent of Totals...,2,others
4,An Post survey suggests a majority of voters t...,0,others


In [38]:
df_valid_ready_merged["Final_Result"].value_counts()

Final_Result
others           480
blame-others      45
praise-others     25
Name: count, dtype: int64

## Step 6. Evaluation

### Map values in Final_Result column to numbers

In [39]:
# Mapping dictionary
label_mapping = {"others": 0, "blame-others": 2, "praise-others": 1}

# 0 - neutral, 1 - praise, 2 - blame

# Apply the mapping to the 'Final_Result' column
df_train_ready_merged['Final_Result'] = df_train_ready_merged['Final_Result'].replace(label_mapping)
df_valid_ready_merged['Final_Result'] = df_valid_ready_merged['Final_Result'].replace(label_mapping)

  df_train_ready_merged['Final_Result'] = df_train_ready_merged['Final_Result'].replace(label_mapping)
  df_valid_ready_merged['Final_Result'] = df_valid_ready_merged['Final_Result'].replace(label_mapping)


In [40]:
df_train_eval = df_train_ready_merged[['Sentence', 'Label', 'Final_Result']]
df_valid_eval = df_valid_ready_merged[['Sentence', 'Label', 'Final_Result']]

In [53]:
# Take 20 random rows from each DataFrame
df_train_sample = df_train_ready_merged.sample(n=20, random_state=1)
df_valid_sample = df_valid_ready_merged.sample(n=20, random_state=1)

# Save them to an Excel file with different sheets
with pd.ExcelWriter('sampled_data.xlsx') as writer:
    df_train_sample.to_excel(writer, sheet_name='Train_Sample', index=False)
    df_valid_sample.to_excel(writer, sheet_name='Valid_Sample', index=False)

### train

In [45]:
# Extract true labels and predicted labels
y_true_train = df_train_eval['Label']
y_pred_train = df_train_eval['Final_Result']

In [46]:
# Assuming you have a DataFrame with 'Label' as true labels and 'Final_Result' as predicted labels

# Calculate F1 Scores
f1_micro = f1_score(y_true_train, y_pred_train, average='micro')
f1_macro = f1_score(y_true_train, y_pred_train, average='macro')
f1_weighted = f1_score(y_true_train, y_pred_train, average='weighted')

# Calculate Precision and Recall for completeness (optional)
precision_micro = precision_score(y_true_train, y_pred_train, average='micro')
precision_macro = precision_score(y_true_train, y_pred_train, average='macro')
recall_micro = recall_score(y_true_train, y_pred_train, average='micro')
recall_macro = recall_score(y_true_train, y_pred_train, average='macro')

# Create a DataFrame to display the results
results_df = pd.DataFrame({
    'Metric': ['F1 Score', 'Precision', 'Recall'],
    'Micro-average': [f1_micro, precision_micro, recall_micro],
    'Macro-average': [f1_macro, precision_macro, recall_macro],
    'Weighted-average': [f1_weighted, None, None]  # Weighted average only applicable to F1 score here
})

# Display the table
print(results_df)

# You can also use classification report to see more detailed metrics
print(classification_report(y_true_train, y_pred_train))

      Metric  Micro-average  Macro-average  Weighted-average
0   F1 Score       0.565382       0.363339          0.475443
1  Precision       0.565382       0.537041               NaN
2     Recall       0.565382       0.391648               NaN
              precision    recall  f1-score   support

           0       0.57      0.94      0.71      2733
           1       0.36      0.11      0.17       798
           2       0.68      0.13      0.22      1501

    accuracy                           0.57      5032
   macro avg       0.54      0.39      0.36      5032
weighted avg       0.57      0.57      0.48      5032



### valid

In [47]:
# Extract true labels and predicted labels
y_true_valid = df_valid_eval['Label']
y_pred_valid = df_valid_eval['Final_Result']

In [48]:
# Assuming you have a DataFrame with 'Label' as true labels and 'Final_Result' as predicted labels

# Calculate F1 Scores
f1_micro = f1_score(y_true_valid, y_pred_valid, average='micro')
f1_macro = f1_score(y_true_valid, y_pred_valid, average='macro')
f1_weighted = f1_score(y_true_valid, y_pred_valid, average='weighted')

# Calculate Precision and Recall for completeness (optional)
precision_micro = precision_score(y_true_valid, y_pred_valid, average='micro')
precision_macro = precision_score(y_true_valid, y_pred_valid, average='macro')
recall_micro = recall_score(y_true_valid, y_pred_valid, average='micro')
recall_macro = recall_score(y_true_valid, y_pred_valid, average='macro')

# Create a DataFrame to display the results
results_df = pd.DataFrame({
    'Metric': ['F1 Score', 'Precision', 'Recall'],
    'Micro-average': [f1_micro, precision_micro, recall_micro],
    'Macro-average': [f1_macro, precision_macro, recall_macro],
    'Weighted-average': [f1_weighted, None, None]  # Weighted average only applicable to F1 score here
})

# Display the table
print(results_df)

# You can also use classification report to see more detailed metrics
print(classification_report(y_true_valid, y_pred_valid))

      Metric  Micro-average  Macro-average  Weighted-average
0   F1 Score       0.589091       0.400254          0.514723
1  Precision       0.589091       0.552778               NaN
2     Recall       0.589091       0.412998               NaN
              precision    recall  f1-score   support

           0       0.59      0.93      0.72       305
           1       0.40      0.13      0.19        78
           2       0.67      0.18      0.28       167

    accuracy                           0.59       550
   macro avg       0.55      0.41      0.40       550
weighted avg       0.59      0.59      0.51       550



# NEED TO CHANGE LABELS OF TEST DATA FILE AND PREPROCESS IT