In [None]:
# Re-import edited modules automatically before each cell executes, so changes
# to the project code under libs/ are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
import os
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt

from libs.usage_examples import get_acceptance_indexes
from libs.noise_filter import NoiseFilter

In [None]:
# Name of the reference label column (values 0 / 1) that the thresholded
# indexer outputs are balanced on and compared against further down.
LABEL_COL = '2_way_label'

# Import Dataset

In [None]:
# Location of the input data, relative to the notebook's working directory.
data_dir = "../data"
# Earlier tab-separated variant of the dataset, kept for reference:
# data_path = os.path.join(data_dir, 'merged_cleaned_data_v26_NoImage.tsv')
# df = pd.read_csv(data_path, sep='\t')
# Current input: the v30 "news" CSV, read with pandas' default comma separator.
data_path = os.path.join(data_dir, 'merged_cleaned_data_v30_news.csv')
df = pd.read_csv(data_path)

In [None]:
# Summary statistics of the raw dataset (pandas describes numeric columns by default).
df.describe()

In [None]:
# Preview the first rows of the raw dataset.
df.head()

In [None]:
# Try using first comments only
# df['comments_orig'] = df['comments_orig'].apply(lambda x: x.split('|__|')[0] if isinstance(x, str) else x)
# df['comments'] = df['comments'].apply(lambda x: x.split('|__|')[0] if isinstance(x, str) else x)

# Apply Noise Filter

In [None]:
# Apply Noise Filter
# NOTE(review): the meaning of each option is defined by libs.noise_filter.NoiseFilter,
# which is not visible here. Presumably 'MinLengths' is a minimum text length per
# column, 'ExcludeImages' drops image posts and 'TextFilters' restricts columns to
# the listed values - confirm against the NoiseFilter implementation.

config = {
    'MinLengths': {
        'title': 10,
        # 'comments': 10
        'comments_orig': 10
    },
    'ExcludeImages': True,
    'TextFilters': {
        # 'subreddit': ['news', 'nottheonion'],
    },
}
# Build the filter from the raw frame and keep its output under a new name,
# so the raw `df` stays available.
noise_filter = NoiseFilter(df, config)
df_filtered = noise_filter.apply()
df_filtered

# Get Acceptance Indexes

In [None]:
def calculate_acceptance_indexes(title, comments, tag=None):
    """Compute the acceptance indexes of a title's comments.

    Args:
        title: Reference text, handed to ``get_acceptance_indexes`` unchanged.
        comments: All comments of the post in a single string, separated
            by ``|__|``.
        tag: Not used by this function.

    Returns:
        Whatever ``get_acceptance_indexes`` returns for the title and the
        list of individual comments.
    """
    return get_acceptance_indexes(title, comments.split("|__|"))

def apply_indexers(df, ref_col, subject_col):
    """Return a copy of ``df`` extended with one column per acceptance index.

    Args:
        df: Input DataFrame; it is copied, not modified.
        ref_col: Column holding the reference text (e.g. the title).
        subject_col: Column holding the ``|__|``-separated comments.

    Returns:
        A new DataFrame containing the original columns followed by the
        columns obtained by expanding each row's
        ``calculate_acceptance_indexes`` result.
    """
    working = df.copy()

    def score_row(row):
        return calculate_acceptance_indexes(row[ref_col], row[subject_col])

    # One result object per row, then spread each result over its own columns.
    per_row_results = working.apply(score_row, axis=1)
    index_columns = per_row_results.apply(pd.Series)

    return pd.concat([working, index_columns], axis=1)


# Score every filtered row: the title is the reference text and the
# 'comments_orig' column supplies the comments.
df_orig = apply_indexers(df_filtered, 'title', 'comments_orig')
# df_clean = apply_indexers(df_filtered, 'clean_title', 'comments')
df_orig

# Output Results

## Save Datasets for next stage

In [None]:
# Save the data out for the use in the ML Models
# (tab-separated, without the index, written to the current working directory).
df_orig.to_csv('data_with_indexers_applied_to_original_data.tsv', index=False, sep='\t')
# df_clean.to_csv('data_with_indexers_applied_to_clean_data.tsv', index=False, sep='\t')

## Evaluation
The remainder of the notebook evaluates the outputs of the acceptance indexers.

In [None]:
# Earlier selections that also kept the text columns / used the cleaned data:
# dfo_2 = df_orig[['title', 'comments_orig', LABEL_COL, 'TextBlobAcceptanceIndexer', 'NRCAcceptanceIndexer', 'VADERAcceptanceIndexer']]
# dfc_2 = df_clean[['clean_title', 'comments', LABEL_COL, 'TextBlobAcceptanceIndexer', 'NRCAcceptanceIndexer', 'VADERAcceptanceIndexer']]
# Keep only the label and the three indexer score columns for the evaluation below.
dfo_2 = df_orig[[LABEL_COL, 'TextBlobAcceptanceIndexer', 'NRCAcceptanceIndexer', 'VADERAcceptanceIndexer']]
# dfc_2 = df_clean[[LABEL_COL, 'TextBlobAcceptanceIndexer', 'NRCAcceptanceIndexer', 'VADERAcceptanceIndexer']]

In [None]:
# Label and the three indexer scores as line plots on one shared axis.
dfo_2.plot(kind='line')
plt.show()

In [None]:
# Same columns again, one subplot per column.
dfo_2.plot(kind='line', subplots=True)
plt.show()

In [None]:
# Summary statistics of the label and the indexer score columns.
dfo_2.describe()

In [None]:
# Pairwise scatter plots of the label and the indexer scores
# (pandas draws a histogram of each column on the diagonal by default).
from pandas.plotting import scatter_matrix
scatter_matrix(dfo_2)

In [None]:
# NOTE(review): repeats the dfo_2.describe() cell shown before the scatter matrix;
# dfo_2 is not modified in between, so the output is the same.
dfo_2.describe()

In [None]:
# Frames to evaluate, keyed by a display name that the loops below print.
# Only the original (uncleaned) text variant is currently enabled.
data_sources = {
    'Orig': dfo_2,
    # 'Clean': dfc_2,
}

In [None]:
def apply_threshold(df, col, threshold):
    """Binarise ``df[col]`` in place against ``threshold``.

    Values strictly greater than ``threshold`` become 1, all others
    (including NaN) become 0.

    Args:
        df: DataFrame to modify; the column is overwritten on this object.
        col: Label of the column to binarise.
        threshold: Cut-off value.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Write back under the original label. Formatting the label through an
    # f-string would turn a non-string label (e.g. 0) into "0" and create a
    # second column instead of overwriting the one that was read.
    df[col] = (df[col] > threshold).astype(int)

    return df

# Indexer score columns that the threshold, tuning and normalisation loops iterate over.
indexer_cols = [
    'TextBlobAcceptanceIndexer',
    'NRCAcceptanceIndexer',
    'VADERAcceptanceIndexer',
    # 'CombinedAcceptanceIndexer'
]

# Binarise every indexer at the midpoint of its observed range and plot the result.
# The loop variable is not called `df`, so the raw dataset loaded in the
# "Import Dataset" section is not overwritten and earlier cells stay re-runnable.
for name, source_df in data_sources.items():
    print(f"Applying thresholds for {name}...")
    df3 = source_df.copy()
    for col in indexer_cols:
        # df3 = apply_threshold(df3, col, df3[col].mean())
        midpoint = df3[col].min() + (df3[col].max() - df3[col].min()) / 2
        df3 = apply_threshold(df3, col, midpoint)

    df3.plot(kind='line', subplots=True)
    plt.show()

Class balancer: downsamples the larger class so that there is an equal number of True and False samples (used only for threshold determination).

In [None]:
def balance_classes(df_in, ref_col):
    """Downsample the larger class so ``ref_col`` holds equally many 0s and 1s.

    Args:
        df_in: Source DataFrame; it is not modified.
        ref_col: Column containing the binary labels (0 / 1).

    Returns:
        A new DataFrame with the rows labelled 0 followed by the rows
        labelled 1, each class reduced to the size of the smaller one.
        If one of the classes is absent, the result is empty.
    """
    # Count once; .get() makes an absent class count as 0 instead of raising KeyError.
    label_counts = df_in[ref_col].value_counts()
    num_zeros = label_counts.get(0, 0)
    num_ones = label_counts.get(1, 0)

    # Number of rows each class is allowed to keep
    min_rows = min(num_zeros, num_ones)

    zeros = df_in.loc[df_in[ref_col] == 0]
    ones = df_in.loc[df_in[ref_col] == 1]

    # Only one class is sampled; the fixed seed keeps the selection reproducible.
    if num_zeros > num_ones:
        zeros = zeros.sample(min_rows, random_state=42)
    else:
        ones = ones.sample(min_rows, random_state=42)

    df = pd.concat([zeros, ones])

    print(f"Equal split leaves {len(df[df[ref_col] == 1])} True values and {len(df[df[ref_col] == 0])} False values")

    return df

In [None]:
def validate(df, subject, reference):
    """Print and return the percentage of rows where two columns agree.

    Args:
        df: DataFrame holding both columns.
        subject: Column with the predicted values.
        reference: Column with the values to compare against.

    Returns:
        Share of rows with equal values in both columns, as a percentage.
    """
    match_rate = (df[subject] == df[reference]).mean() * 100
    print(f"Match Rate between {subject} and {reference}: {match_rate:.2f}%")
    return match_rate


# Validate the midpoint threshold of every indexer against the label on a
# class-balanced sample.
# The loop variable is not called `df`, so the raw dataset loaded in the
# "Import Dataset" section is not overwritten and earlier cells stay re-runnable.
for name, source_df in data_sources.items():
    print(f"Validating results for {name}...")
    df3 = source_df.copy()
    df3 = balance_classes(df3, LABEL_COL)

    for col in indexer_cols:
        # Midpoint between the smallest and largest score in the balanced sample
        threshold = (df3[col].min() + (df3[col].max() - df3[col].min()) / 2)
        print(f"Applying threshold of {threshold} for {col}...")
        df3 = apply_threshold(df3, col, threshold)
        validate(df3, col, LABEL_COL)

### Threshold Tuning

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

def tune_thresholds(df, subject, reference, step=0.01):
    """Grid-search the cut-off on ``df[subject]`` that best matches ``df[reference]``.

    Each candidate threshold ``t`` turns the score column into a binary
    prediction (``score >= t`` -> 1, otherwise 0), which is scored against
    the reference labels with ``accuracy_score``.

    Args:
        df: DataFrame holding both columns. It is not modified.
        subject: Name of the numeric score column to threshold.
        reference: Name of the column holding the 0/1 labels.
        step: Spacing between candidate thresholds. Candidates run from the
            column minimum up to, but not including, the column maximum.

    Returns:
        Tuple ``(best_threshold, best_accuracy)``. On ties the lowest
        threshold wins. ``best_threshold`` is None only when no candidate
        reaches an accuracy above 0.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be positive")

    scores = df[subject]
    truth = df[reference]
    lowest = scores.min()

    thresholds = np.arange(lowest, scores.max(), step)
    if thresholds.size == 0:
        # Constant column: min == max leaves the grid empty, so still
        # evaluate the single value the column takes.
        thresholds = np.array([lowest])

    best_threshold = None
    best_accuracy = 0

    for threshold in thresholds:
        # Kept in a local Series so no 'predicted' column leaks into the caller's frame
        predicted = (scores >= threshold).astype(int)
        accuracy = accuracy_score(truth, predicted)
        if accuracy > best_accuracy:
            best_threshold = threshold
            best_accuracy = accuracy

    return best_threshold, best_accuracy

# Search, per indexer, for the threshold that best reproduces the label on a
# class-balanced sample.
# The loop variable is not called `df`, so the raw dataset loaded in the
# "Import Dataset" section is not overwritten and earlier cells stay re-runnable.
for name, source_df in data_sources.items():
    print(f"Validating results for {name}...")
    df4 = source_df.copy()
    df4 = balance_classes(df4, LABEL_COL)
    for col in indexer_cols:
        th, acc = tune_thresholds(df4, col, LABEL_COL)
        print(f"Best Match Rate between {col} and reference: {acc:.2f} using a threshold of {th}")

### Normalisation of Acceptance Indexes

In [None]:
# Apply normalisation to the Acceptance Indexes

# Normalise column between 0 and 1
def normalise_column(df, column):
    """Add a min-max scaled version of ``column`` to ``df`` as ``<column>_norm``.

    The smallest value maps to 0 and the largest to 1. A column whose values
    are all equal maps to 0.0 instead of the NaN that 0/0 would produce.
    Missing values stay missing.

    Args:
        df: DataFrame to extend; the new column is added to this object.
        column: Name of the numeric column to scale.

    Returns:
        The same DataFrame object that was passed in.
    """
    min_value = df[column].min()
    value_range = df[column].max() - min_value
    shifted = df[column] - min_value

    if value_range == 0:
        # Constant column: every shifted value is already 0, so skip the division.
        df[f"{column}_norm"] = shifted.astype(float)
    else:
        df[f"{column}_norm"] = shifted / value_range
    return df

# Normalised copy of every data source, keyed by the same display name.
# The loop variable is not called `df`, so the raw dataset loaded in the
# "Import Dataset" section is not overwritten and earlier cells stay re-runnable.
scaled_sources = {}
for name, source_df in data_sources.items():
    print(f"Normalising {name}...")
    df_scaled = source_df.copy()
    for col in indexer_cols:
        df_scaled = normalise_column(df_scaled, col)

    scaled_sources[name] = df_scaled

In [None]:
# TODO: Compare indexers using cleaned and uncleaned data
df_scaled.plot(subplots=True)
plt.show()

In [None]:

# Repeat the threshold search on the normalised (`*_norm`) score columns.
# The loop variable is not called `df`, so the raw dataset loaded in the
# "Import Dataset" section is not overwritten and earlier cells stay re-runnable.
for name, scaled_source in scaled_sources.items():
    print(f"Validating results for {name}...")
    df5 = scaled_source.copy()
    df5 = balance_classes(df5, LABEL_COL)
    for col in indexer_cols:
        th, acc = tune_thresholds(df5, f"{col}_norm", LABEL_COL)
        print(f"Best Match Rate between {col} and reference: {acc:.2f} using a threshold of {th:.2f}")