In [119]:
import pandas as pd
import numpy as np
import os
from json import dumps, loads
from sklearn.metrics import cohen_kappa_score
import math

In [21]:
def concat_lists(dict):
    slice = []
    for website, scores in dict.items():
        for cat in scores.keys():
            slice += dict[website][cat]
    
    return slice

In [74]:
def generate_freq_table(labels_1, labels_2):
    N = 5
    matrix = [[0] * N for i in range(N)]
    
    for i in range(N):
        idx_1 = [idx for idx in range(len(labels_1)) if labels_1[idx] == i]
        for j in range(N):
            idx_2 = [idx for idx in range(len(labels_2)) if labels_2[idx] == j]
            matrix[i][j] = len(set(idx_1) & set(idx_2))
    
    return matrix

def generate_expected_frequency(data, length):
    N = len(data)
    row_sums = [sum(row) for row in data]
    column_sums = [sum(data[i][j] for i in range(N)) for j in range(N)]

    matrix = [[0] * N for _ in range(N)]

    for i in range(N):
        for j in range(N):
            matrix[i][j] = (row_sums[i] * column_sums[j]) / (length) 

    return matrix

def get_weightings(weight, N):
    matrix = [[0] * N for i in range(N)]

    for i in range(N):
        for j in range(N):
            if weight == 0:
                if i != j:
                    matrix[i][j] = 1
            if weight == 1:
                matrix[i][j] = abs(i - j) / (N - 1)
            else:
                matrix[i][j] = (i - j)**2 / (N - 1)**2
    
    return matrix

def matrix_dot_product(mat1, mat2, N):
    dot_product_sum = 0
    for i in range(N):
        for j in range(N):
            dot_product_sum += mat1[i][j] * mat2[i][j]
    return dot_product_sum

def weighted_cohen_kappa(ratings_1, ratings_2, weight=0):
    freq_table = generate_freq_table(ratings_1, ratings_2)
    exp_freq_table = generate_expected_frequency(freq_table, len(ratings_2))
    N = len(freq_table)
    weight_mat = get_weightings(weight, N)

    numerator = matrix_dot_product(weight_mat, freq_table, N)
    denominator = matrix_dot_product(weight_mat, exp_freq_table, N)

    return 1 - (numerator / denominator)

**Hafsa & Hammad Labelling**

In [4]:
def create_similarity_matrix(file):
    df = pd.read_csv(file)
    n = df.shape[0]
    similarity_matrix = []
    for i in range(n):
        similarity_matrix.append(list(df.iloc[i]))
    return similarity_matrix

def get_scores_list(similarity_matrix):
    scores_list = []
    for i in range(len(similarity_matrix)):
        for j in range(i+1,len(similarity_matrix[i])):
            scores_list.append(similarity_matrix[i][j])
    
    return scores_list

In [97]:
websites_1 = ['CNN', 'NBC']
ratings_Hafsa = {}
ratings_Hammad = {}

for website in websites_1:
    categories = [cat[:-4] for cat in os.listdir(f"labels/{website}/Hafsa")]
    p1 = {}
    p2 = {}
    for cat in categories:
        p1[cat] = get_scores_list(create_similarity_matrix(f"labels/{website}/Hafsa/{cat}.csv"))
        p2[cat] = get_scores_list(create_similarity_matrix(f"labels/{website}/Hammad/{cat}.csv"))
    ratings_Hafsa[website] = p1
    ratings_Hammad[website] = p2



In [98]:
cohen_kappa_score(concat_lists(ratings_Hafsa), concat_lists(ratings_Hammad))

0.7786302117069512

In [102]:
weighted_cohen_kappa(concat_lists(ratings_Hafsa), concat_lists(ratings_Hammad), 2)

0.9487732134082196

**Ayain & Danish Labelling**

In [90]:
websites_2 = ['Fox News', 'People']
ratings_Ayain = {}
ratings_Danish = {}

for website in websites_2:
    df_Ayain = pd.read_csv(f"labels/{website}/Ayain/labels.csv")
    categories = list(df_Ayain.columns)
    df_Danish = pd.read_csv(f"labels/{website}/Danish/labels.csv")
    p1 = {}
    p2 = {}
    for cat in categories:
        p1[cat] = list(df_Ayain[f'{cat}'].dropna().astype(int))
        p2[cat] = list(df_Danish[f'{cat}'].dropna().astype(int))
    ratings_Ayain[website] = p1
    ratings_Danish[website] = p2



In [91]:
cohen_kappa_score(concat_lists(ratings_Ayain), concat_lists(ratings_Danish))

0.6401558215814523

In [92]:
weighted_cohen_kappa(concat_lists(ratings_Ayain), concat_lists(ratings_Danish), 2)

0.7527076617080544

**Analysis**

In [120]:
def find_N(R):
    discriminant = 1 + 8 * R
    
    sqrt_discriminant = math.sqrt(discriminant)

    n = (1 + sqrt_discriminant) / 2
    
    if n.is_integer():
        return int(n)
    else:
        raise ValueError("No integer solution for n exists for the given R")

In [152]:
def reform_similarity_matrix(ratings):
    websites = {}

    for website, labels in ratings.items():
        category_matrix = {}
        for cat, scores in labels.items():
            N = find_N(len(scores))
            matrix = [[0] * N for i in range(N)]
            count = 0
            for i in range(N):
                for j in range(i+1,N):
                    if i == j:
                        matrix[i][j] = 4
                    else:
                        matrix[i][j] = scores[count]
                        matrix[j][i] = scores[count]
                        count += 1
            category_matrix[cat] = matrix
        websites[website] = category_matrix
    
    return websites            

In [153]:
def compute_ratio(matrix, threshold):
    count = 0
    for row in matrix:
        for val in row:
            if val > threshold:
                count += 1
    return count / (len(matrix)**2)

In [156]:
web_matrix = reform_similarity_matrix(ratings_Hafsa)

for website, categories in web_matrix.items():
    for cat, matrix in categories.items():
        for i in range(4):
            print(f"{website}: {cat} > {i}: {compute_ratio(matrix, i)}")

CNN: Market Nightcap > 0: 0.18
CNN: Market Nightcap > 1: 0.18
CNN: Market Nightcap > 2: 0.18
CNN: Market Nightcap > 3: 0.18
CNN: Climate Solutions > 0: 0.095703125
CNN: Climate Solutions > 1: 0.068359375
CNN: Climate Solutions > 2: 0.0625
CNN: Climate Solutions > 3: 0.060546875
CNN: Politics Congress > 0: 0.2653061224489796
CNN: Politics Congress > 1: 0.2653061224489796
CNN: Politics Congress > 2: 0.1836734693877551
CNN: Politics Congress > 3: 0.1836734693877551
CNN: World Middleeast Israel > 0: 0.12152777777777778
CNN: World Middleeast Israel > 1: 0.1076388888888889
CNN: World Middleeast Israel > 2: 0.09722222222222222
CNN: World Middleeast Israel > 3: 0.09027777777777778
CNN: Health Sleep > 0: 0.11634349030470914
CNN: Health Sleep > 1: 0.11080332409972299
CNN: Health Sleep > 2: 0.10526315789473684
CNN: Health Sleep > 3: 0.10526315789473684
NBC: Business > 0: 0.2361111111111111
NBC: Business > 1: 0.19444444444444445
NBC: Business > 2: 0.19444444444444445
NBC: Business > 3: 0.166666666

In [176]:
import re

df = pd.read_csv('gh.csv')
x = [i.split('/')[-1] for i in list(df['headline']) if i.startswith('https')]
images = list(set(x))
scores = []
count = 0


for i in range(len(images)):
    article_1 = int(re.search(r'\d+(?=_|$)',images[i]).group())
    for j in range(i+1, len(images)):
        article_2 = int(re.search(r'\d+(?=_|$)',images[j]).group())
        if article_1 == article_2:
            scores.append(0)
        else:
            scores.append(ratings_Danish['Fox News']['Golf'][count])
            count +=1

In [189]:
def update_dict(websites):
    for website, categories in websites.items():
        for cat, scores in categories.items():
            try:
                df = pd.read_csv(f'pairs/{website}/{cat}_pairs.csv')
                images = list(set([i.split('/')[-1] for i in list(df['headline']) if i.startswith('https')]))
                count = 0
                temp = []
                for i in range(len(images)):
                    article_1 = int(re.search(r'\d+(?=_|$)',images[i]).group())
                    for j in range(i+1, len(images)):
                        article_2 = int(re.search(r'\d+(?=_|$)',images[j]).group())
                        if article_1 == article_2:
                            temp.append(0)
                        else:
                            temp.append(ratings_Danish[website][cat][count])
                            count +=1
                websites[website][cat] = temp
            except:
                print(cat)
                continue

    return websites

In [190]:
update_dict(ratings_Ayain)

Executive


{'Fox News': {'Golf': [4,
   3,
   3,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   2,
   1,
   2,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   4,
   3,
   4,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
  