In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from tqdm import tqdm

from gensim.models import KeyedVectors

import multiprocessing as mp
from spacy.tokens import Doc
from typing import List
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, log_loss
import joblib

import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Execute the helper_functions script in this notebook's namespace.
# add_features (called further down) is not defined in this notebook, so it is
# presumably provided by that script — TODO confirm.
%run helper_functions

In [3]:
# Single place to configure where the input parquet files live; every read
# below goes through it so that changing the directory needs one edit only.
data_path = "./data/"

# Annotation table: carries a label per (meeting_id, agenda_item_id) pair.
annotation_data = pd.read_parquet(data_path + 'data_annotation.parquet')

# The tokenized speeches are stored in two parts; read both and stack them
# under a fresh 0..n-1 index.
df_tok1 = pd.read_parquet(data_path + 'data_speech1_tok.parquet')
df_tok2 = pd.read_parquet(data_path + 'data_speech2_tok.parquet')
df = pd.concat([df_tok1, df_tok2]).reset_index(drop=True)

### Load word lists and add features

In [4]:
# C word list: base list plus its augmented extension, stacked into one frame.
word_list_C = pd.read_parquet('./data/word_lists/word_list_C.parquet')
word_list_C_augmented = pd.read_parquet('./data/word_lists/word_list_C_augmented.parquet')
word_list_C_complete = pd.concat((word_list_C, word_list_C_augmented))

# NC word list: same layout as the C list.
word_list_NC = pd.read_parquet('./data/word_lists/word_list_NC.parquet')
word_list_NC_augmented = pd.read_parquet('./data/word_lists/word_list_NC_augmented.parquet')
word_list_NC_complete = pd.concat((word_list_NC, word_list_NC_augmented))

# The generic list is a text file read as CSV; it is expected to expose a
# 'word' column just like the parquet lists (the lookup below relies on it).
word_list_generic_augmented = pd.read_csv('./data/word_lists/word_list_generic_augmented.txt')

# Collapse each list into a set of its 'word' values.
word_C_set, word_NC_set, word_G_set = (
    set(word_frame['word'])
    for word_frame in (word_list_C_complete, word_list_NC_complete, word_list_generic_augmented)
)

In [5]:
# Location of the pre-trained word2vec binary.
# NOTE(review): this points into a user's Desktop folder, so the cell only runs
# on a machine that has the file at exactly this place.
MODEL_FILE = '~/Desktop/dsl_skipgram_2020_m5_f500_epoch2_w5.model.w2v.bin'
# Load the vectors from the binary word2vec format into a KeyedVectors object.
word_vec_model = KeyedVectors.load_word2vec_format(MODEL_FILE, binary=True)

In [6]:
# add features
df = add_features(df, word_C_set,word_NC_set, word_G_set, word_vec_model)

In [8]:
# Keep only the speeches whose (meeting_id, agenda_item_id) pair does NOT occur
# in the annotation table; the annotated agenda items are used for evaluation
# further down and must stay out of the training pool.
annotated_pairs = annotation_data[["meeting_id", "agenda_item_id"]].apply(tuple, axis=1)
is_annotated = df[["meeting_id", "agenda_item_id"]].apply(tuple, axis=1).isin(annotated_pairs)

# .copy() gives res_df its own data, so the column added below is an ordinary
# assignment instead of a write to a filtered slice of df (which pandas reports
# as SettingWithCopyWarning — silenced in this notebook by the warnings filter).
res_df = df[~is_annotated].copy()

# Ratio of C_counts to NC_counts per row. pandas does not raise on division by
# zero: NC_counts == 0 yields inf, or NaN when C_counts is 0 as well. NaN rows
# satisfy neither `> 0.5` nor `<= 0.5` in the selection rules that follow.
res_df['count_ratio'] = res_df['C_counts'] / res_df['NC_counts']

### Create two training sets

In [10]:
train_cols = ["average_vec", "meeting_id", "agenda_item_id", "speech_item_id"]

# C rule. Python evaluates `&` before `|`, so the rule reads
#     C_counts > 2   OR   (C_percent > 5 AND count_ratio > 0.5)
# and the grouping is spelled out explicitly here.
# NOTE(review): rows with C_counts > 2 are not filtered on count_ratio, so a row
# can satisfy this rule and the NC rule below at the same time and end up in the
# training set with both labels — confirm whether
# (C_counts > 2 OR C_percent > 5) AND count_ratio > 0.5 was intended.
many_C_hits = res_df.C_counts > 2
high_C_share = (res_df.C_percent > 5) & (res_df.count_ratio > 0.5)
df_C = res_df.loc[many_C_hits | high_C_share, train_cols]
df_C["label"] = "C"

# NC rule: at least one NC hit and a count ratio of at most 0.5.
nc_rule = (res_df.NC_counts > 0) & (res_df.count_ratio <= 0.5)
df_NC = res_df.loc[nc_rule, train_cols]
df_NC["label"] = "NC"

print("Ratio:", len(df_NC) / len(df_C))
df_train1 = pd.concat([df_C, df_NC])

# Spread each average_vec over one column per vector component, then put the
# label and the id columns next to the components and renumber the rows.
df_train1_vec = df_train1['average_vec'].apply(pd.Series)
df_train1 = pd.concat(
    [df_train1_vec, df_train1[["label", "meeting_id", "agenda_item_id", "speech_item_id"]]],
    axis=1,
).reset_index(drop=True)

Ratio: 9.91866055147262


In [11]:
# Alternative, looser NC rule: at most one C hit OR a C percentage below 10.
# NOTE(review): this rule does not exclude rows already selected for df_C
# (e.g. C_counts > 2 with C_percent < 10), so some rows may carry both labels —
# confirm this is intended.
loose_nc_rule = (res_df.C_counts <= 1) | (res_df.C_percent < 10)
df_NC = (
    res_df.loc[loose_nc_rule, ["average_vec", "meeting_id", "agenda_item_id", "speech_item_id"]]
    .sample(frac=0.55, random_state=42)  # approximate the ratio from above
)
df_NC["label"] = "NC"

# df_C is reused from the previous cell: both training sets share the C rows.
print("Ratio:", len(df_NC) / len(df_C))
df_train2 = pd.concat([df_C, df_NC])

# Same reshaping as for the first training set: one column per vector
# component, followed by the label and the id columns, rows renumbered.
df_train2_vec = df_train2['average_vec'].apply(pd.Series)
df_train2 = pd.concat(
    [df_train2_vec, df_train2[["label", 'meeting_id', 'agenda_item_id', 'speech_item_id']]],
    axis=1,
).reset_index(drop=True)

Ratio: 9.920373542576117


### Train a Classification Model

In [45]:
def train_classifier(training_data:pd.DataFrame, type:str):
    """Cross-validate a LightGBM classifier and persist one model per fold.

    The features are all columns of ``training_data`` except ``label`` and the
    three id columns. A 5-fold stratified split (shuffled, seed 42) is used; in
    every fold a default ``LGBMClassifier`` is fitted on the training part and
    scored with log loss on the held-out part.

    Side effects:
        * writes ``./models/model_fold_<k>_<type>.joblib`` for k = 1..5
        * writes ``./figures/result_<type>.svg`` with the per-fold log loss
        * prints the log loss of every fold
        * leaves the matplotlib figure open

    Args:
        training_data: Frame with the feature columns plus ``label``,
            ``meeting_id``, ``agenda_item_id`` and ``speech_item_id``.
        type: Suffix that identifies the training-set variant in the names of
            the written files (the name shadows the builtin but is kept for
            compatibility with existing calls).

    Returns:
        None.
    """
    import os  # local import: only needed to create the output directories

    X = training_data.drop(["label", 'meeting_id', 'agenda_item_id', 'speech_item_id'], axis=1)
    y = training_data['label']

    # joblib.dump and savefig do not create missing directories; make sure both
    # targets exist so a run on a fresh checkout does not fail after fitting.
    os.makedirs('./models', exist_ok=True)
    os.makedirs('./figures', exist_ok=True)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    loss_values = []
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = lgb.LGBMClassifier()
        model.fit(X_train, y_train)

        # Class probabilities for the held-out fold, one column per class in
        # the order of model.classes_.
        y_pred = model.predict_proba(X_test)

        # Passing the label order explicitly ties the probability columns to
        # the classes instead of relying on log_loss inferring it from y_test.
        loss = log_loss(y_test, y_pred, labels=model.classes_)
        loss_values.append(loss)
        print(f'Loss for fold {i+1}: {loss}')

        joblib.dump(model, './models/'+f'model_fold_{i+1}_{type}.joblib')

    # Per-fold log loss; the figure stays open so the notebook can render it.
    plt.figure(figsize=(10, 5))
    plt.plot(range(1, len(loss_values) + 1), loss_values, marker='o')
    plt.title('Loss per fold')
    plt.xlabel('Fold number')
    plt.ylabel('Log Loss')
    plt.savefig(f'./figures/result_{type}.svg', format='svg')

In [None]:
# Fit and persist the fold models for both training-set variants; the variant
# id becomes the suffix of the model and figure files that are written.
for variant_id, training_frame in (('1', df_train1), ('2', df_train2)):
    train_classifier(training_frame, variant_id)

### Create test datasets

In [104]:
speech_keys = ["meeting_id", "agenda_item_id", "speech_item_id"]

# Annotated evaluation set: every speech that belongs to an annotated agenda
# item, with the agenda-level label attached to each of its rows.
annotated_pairs = annotation_data[["meeting_id", "agenda_item_id"]].apply(tuple, axis=1)
test_annotation = df[df[["meeting_id", "agenda_item_id"]].apply(tuple, axis=1).isin(annotated_pairs)]
test_annotation = pd.merge(test_annotation, annotation_data[['meeting_id', "agenda_item_id","label"]], on=['meeting_id', "agenda_item_id"])

# Building a tuple per row is a Python-level loop over the whole frame, so the
# keys of res_df are built once and reused for every membership test.
res_keys = res_df[speech_keys].apply(tuple, axis=1)
in_train1 = res_keys.isin(df_train1[speech_keys].apply(tuple, axis=1))
in_train2 = res_keys.isin(df_train2[speech_keys].apply(tuple, axis=1))

# Unannotated speeches that were not used to train model 1 / model 2.
test_res1 = res_df[~in_train1]
test_res2 = res_df[~in_train2]

# Speeches unseen by both models (the intersection of the two sets above).
# make_predictions adds columns to the frame it receives, hence the explicit
# copy instead of a filtered slice of res_df.
test_res = res_df[~in_train1 & ~in_train2].copy()

### Predictions

In [7]:
def make_predictions(test_data, type, n_folds=5):
    """Add fold-averaged class probabilities of one model variant to a frame.

    Loads ``./models/model_fold_<k>_<type>.joblib`` for k = 1..n_folds, predicts
    class probabilities from the expanded ``average_vec`` column with each of
    them, and stores the mean over the folds in one column per class named
    ``pred_<class><type>`` (``pred_C1`` / ``pred_NC1`` for classes C / NC and
    type '1').

    ``test_data`` is modified IN PLACE and the same object is returned; later
    cells rely on this by reading the new columns from the frame they passed in.

    Args:
        test_data: Frame with an ``average_vec`` column holding one vector per
            row.
        type: Suffix of the model variant, as used when the models were saved.
        n_folds: Number of fold models to load and average (default 5, the
            number written by train_classifier).

    Returns:
        ``test_data`` with the probability columns added.

    Raises:
        ValueError: If ``n_folds`` is smaller than 1 or the fold models were not
            trained on the same classes.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be at least 1")

    # One column per vector component, matching the layout used for training.
    test_data_vec = test_data['average_vec'].apply(lambda x: pd.Series(x))

    predictions = []
    class_names = None
    for i in tqdm(range(n_folds)):
        # joblib.load unpickles the file: only load model files produced by
        # this project.
        current_model = joblib.load(f'./models/model_fold_{i+1}_{type}.joblib')

        # The columns of predict_proba follow classes_; take the column names
        # from there instead of assuming the order ['C', 'NC'].
        fold_classes = list(current_model.classes_)
        if class_names is None:
            class_names = fold_classes
        elif fold_classes != class_names:
            raise ValueError(
                f"fold {i+1} was trained on classes {fold_classes}, expected {class_names}"
            )

        predictions.append(current_model.predict_proba(test_data_vec))

    # Mean over the fold axis -> one probability per row and class.
    test_data[[f'pred_{name}{type}' for name in class_names]] = np.mean(predictions, axis=0)
    return test_data

In [105]:
# make_predictions adds its columns to the frame it is given and returns that
# same frame, so the two model variants can be applied back to back. As a
# consequence test_res, test_annotation and df themselves carry the pred_*
# columns afterwards.

# Unannotated speeches that neither model was trained on.
test_df_preds = make_predictions(make_predictions(test_res, '1'), '2')

# Speeches of the annotated agenda items.
test_df_preds_anno = make_predictions(make_predictions(test_annotation, '1'), '2')

# Every speech in df.
test_df_full_preds = make_predictions(make_predictions(df, '1'), '2')

## Model evaluation

### Diagnostics

1. Overall C/NC ratio.
2. Compare agenda annotation labels to model predictions (majority vote over the speeches of each agenda item, with a 0.5 threshold)
3. Checking overlap between predictions from model 1 and 2
4. Reading speech items that model 1 predicted C and model 2 predicted NC


1. Overall C/NC ratio

In [14]:
# Share of all rows that model 1 and model 2 assign to C (pred_C > 0.5).
for prob_col in ("pred_C1", "pred_C2"):
    n_predicted_C = int((test_df_full_preds[prob_col] > 0.5).sum())
    print(n_predicted_C / test_df_full_preds.shape[0])

0.07530790075778473
0.037813512475592406


In [26]:
# Same share, restricted to the unannotated rows that neither model saw during
# training.
for prob_col in ("pred_C1", "pred_C2"):
    n_predicted_C = int((test_df_preds[prob_col] > 0.5).sum())
    print(n_predicted_C / test_df_preds.shape[0])

0.06576071774816372
0.01123946300343428


In [27]:
# Same share, restricted to the rows of the annotated agenda items.
for prob_col in ("pred_C1", "pred_C2"):
    n_predicted_C = int((test_df_preds_anno[prob_col] > 0.5).sum())
    print(n_predicted_C / test_df_preds_anno.shape[0])

0.07258449620629277
0.033453600042447074


2. Compare agenda annotation labels with majority voting

In [87]:
vote_keys = ['meeting_id', 'agenda_item_id']
class_mapping = {'NC':0, 'C':1}

# Encode the annotated label as 0/1. Values that are already encoded pass
# through unchanged, so re-running this cell no longer fails with a KeyError.
test_annotation['label'] = test_annotation['label'].map(lambda x: class_mapping.get(x, x))

for model_id in ('2', '1'):
    # Row-level decision of the model: 1 if the C probability exceeds 0.5.
    test_annotation[f'pred_C{model_id}_binary'] = (test_annotation[f'pred_C{model_id}'] > 0.5).astype(int)

    # Share of C decisions within each agenda item, broadcast back to every row
    # of that item; the item is voted C (1) when more than half of its rows are.
    share_C = test_annotation.groupby(vote_keys)[f'pred_C{model_id}_binary'].transform('mean')
    test_annotation[f'majority_vote_C{model_id}'] = (share_C > 0.5).astype(int)

# Agreement between the annotated label and the majority vote.
# NOTE(review): this is computed over the rows of test_annotation, so an agenda
# item counts once per row it contributes, whereas the confusion counts below
# count every agenda item once.
preds_comparison_C1 = test_annotation['label'].to_numpy() - test_annotation['majority_vote_C1'].to_numpy()
preds_comparison_C2 = test_annotation['label'].to_numpy() - test_annotation['majority_vote_C2'].to_numpy()
print('C1',(test_annotation.shape[0]-np.count_nonzero(preds_comparison_C1))/test_annotation.shape[0])
print('C2',(test_annotation.shape[0]-np.count_nonzero(preds_comparison_C2))/test_annotation.shape[0])

# Confusion counts in number of distinct agenda items: (name, vote, label).
confusion_cells = (('TP', 1, 1), ('FP', 1, 0), ('FN', 0, 1), ('TN', 0, 0))
for model_id in ('2', '1'):
    vote = test_annotation[f'majority_vote_C{model_id}']
    for cell_name, vote_value, label_value in confusion_cells:
        in_cell = (vote == vote_value) & (test_annotation['label'] == label_value)
        n_agenda_items = test_annotation.loc[in_cell, vote_keys].drop_duplicates().shape[0]
        print(f'{cell_name} C{model_id}', n_agenda_items)
    if model_id == '2':
        print('\n')

3. Checking overlap between predictions from model 1 and 2

In [177]:
# Hard label per model: C when the C probability exceeds 0.5, NC otherwise.
for model_id in ('1', '2'):
    test_df_full_preds[f'label{model_id}'] = np.where(
        test_df_full_preds[f'pred_C{model_id}'] > 0.5, 'C', 'NC'
    )

# Count the rows on which the two models disagree, split by direction.
only_model2_C = (test_df_full_preds['label2'] == 'C') & (test_df_full_preds['label1'] == 'NC')
only_model1_C = (test_df_full_preds['label1'] == 'C') & (test_df_full_preds['label2'] == 'NC')
print('Model 2 predicts C where Model 1 predicts NC:', int(only_model2_C.sum()))
print('Model 1 predicts C where Model 2 predicts NC:', int(only_model1_C.sum()))

Model 2 predicts C where Model 1 predicts NC: 49
Model 1 predicts C where Model 2 predicts NC: 13913


4. Reading speech items that model 1 predicted C and model 2 predicted NC

In [207]:
# Widen the column display so the tokenized speeches are readable.
pd.set_option('display.max_colwidth', 200)

# First 50 speeches that model 1 labels C and model 2 labels NC, limited to
# speeches with fewer tokens than the average speech.
only_model1_C = (test_df_full_preds['label1'] == 'C') & (test_df_full_preds['label2'] == 'NC')
shorter_than_average = test_df_full_preds['num_tokens'] < test_df_full_preds['num_tokens'].mean()
test_df_full_preds.loc[only_model1_C & shorter_than_average, 'speech_item_tokenized'].iloc[:50]

In [208]:
# Set the column display width to 50 again after the wide display used for
# reading the speeches above.
pd.set_option('display.max_colwidth', 50)

## Conclusion: We choose Model 1!

### Make predictions on entire dataset and save

In [8]:
# Predict on every speech in df with both model variants. make_predictions
# writes its columns into df and returns it, so test_df_full_preds is df itself.
# Only the model 1 columns are used when the labels are attached below.
test_df_full_preds = make_predictions(df, '1')
test_df_full_preds = make_predictions(test_df_full_preds, '2')

100%|██████████| 5/5 [00:11<00:00,  2.37s/it]
100%|██████████| 5/5 [00:11<00:00,  2.21s/it]


In [9]:
# The original speech data is stored in three parts; read each part and stack
# them row-wise (the row index of every part is kept as it is).
data_speech1, data_speech2, data_speech3 = (
    pd.read_parquet(f'./data/odata_speech{part}.parquet') for part in (1, 2, 3)
)
dspeech = pd.concat([data_speech1, data_speech2, data_speech3], axis=0)

In [11]:
# Attach the model 1 prediction to the original speeches. The join keys are
# named explicitly so the merge cannot silently pick up another column that
# both frames happen to share. It is an inner join: speeches without a
# prediction are dropped.
speech_keys = ['meeting_id', 'agenda_item_id', 'speech_item_id']
dspeech = pd.merge(
    dspeech,
    test_df_full_preds[speech_keys + ['pred_C1']],
    on=speech_keys,
    how='inner',
)

# Final label: C when the model 1 probability for C exceeds 0.5, NC otherwise.
dspeech['label'] = np.where(dspeech['pred_C1']>0.5, 'C', 'NC')

# The probability was only needed to derive the label.
dspeech = dspeech.drop(columns=['pred_C1'])

In [15]:
# Cut the labelled speeches into three consecutive parts: the first two hold
# len(dspeech) // 3 rows each, the last one takes the remaining rows.
third = len(dspeech) // 3
split_point_1, split_point_2 = third, 2 * third

part_bounds = (
    (None, split_point_1),
    (split_point_1, split_point_2),
    (split_point_2, None),
)
dspeech_1, dspeech_2, dspeech_3 = (
    pa.Table.from_pandas(dspeech.iloc[start:stop]) for start, stop in part_bounds
)

# Writing is disabled; uncomment to overwrite the labelled parquet files.
# pq.write_table(dspeech_1, "./data/data_speech1.parquet")
# pq.write_table(dspeech_2, "./data/data_speech2.parquet")
# pq.write_table(dspeech_3, "./data/data_speech3.parquet")