In [1]:
import torch
import joblib
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score
from model import TweetsDataset, TweetClassifer
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix, confusion_matrix

# Never truncate columns when a frame is displayed.
pd.set_option('display.max_columns', None)
# Run on the MPS backend when torch reports it as available, otherwise on the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# Embeddings table, indexed by 'code' so rows can be fetched by label.
openai_embs = pd.read_parquet('./embeddings/ALL_EMBEDDINGS.parquet').set_index('code')



def merge_pro_anti(df, pro_col, anti_col):
    """Collapse a pro/anti pair of indicator columns into one code per row.

    Encoding: 1 when only the "pro" flag is set, 0 when only the "anti" flag
    is set, 2 when neither is set. Every other combination is reported on
    stdout and is also encoded as 2.

    Args:
        df: frame holding both indicator columns.
        pro_col: name of the "pro" indicator column.
        anti_col: name of the "anti" indicator column.

    Returns:
        A list with one code per row, in row order.
    """
    pro_flags = df[pro_col].values.tolist()
    anti_flags = df[anti_col].values.tolist()
    stances = []
    for row, (is_pro, is_anti) in enumerate(zip(pro_flags, anti_flags)):
        if is_pro == 1 and is_anti == 0:
            stance = 1
        elif is_pro == 0 and is_anti == 1:
            stance = 0
        else:
            # Anything that is not the clean "neither" case gets reported,
            # but it is still folded into code 2.
            if not (is_pro == 0 and is_anti == 0):
                print(f' row {row} has both pro and anti')
            stance = 2
        stances.append(stance)
    return stances

def score_model(model, dataloader, multi_calss = 'raise'):
    """Evaluate `model` on `dataloader` with a 0.5 threshold and return metrics.

    The model is called as `model(x, logits=True)`. Its outputs are passed
    unchanged to `roc_auc_score`, then binarised with `> 0.5` for the
    confusion-matrix based metrics.

    NOTE(review): a later cell of this notebook defines another `score_model`;
    once that cell runs, it replaces this definition.
    NOTE(review): the 0.5 cut-off is applied to whatever `logits=True` returns.
    If those are unnormalised logits, the natural cut-off would be 0 -- confirm
    against the model.

    Args:
        model: module called as `model(x, logits=True)`.
        dataloader: iterable of `(x, y)` batches.
        multi_calss: forwarded to `roc_auc_score(multi_class=...)`.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall', 'f1', 'rocauc'.
        When the last batch output has more than one column the first four
        are per-column arrays, otherwise scalars.
    """
    model.eval()
    target_batches, output_batches = [], []
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            y_pred = model(x, logits=True)
            target_batches.append(y)
            output_batches.append(y_pred)
    targets = torch.concat(target_batches).detach().cpu()
    outputs = torch.concat(output_batches).detach().cpu()

    rocauc = roc_auc_score(targets, outputs, multi_class=multi_calss)
    hard_preds = torch.where(outputs > 0.5, 1, 0).type(torch.float32)

    # The width of the last batch's output decides between per-column and
    # single-output metrics.
    if y_pred.shape[1]>1:
        per_column = multilabel_confusion_matrix(targets, hard_preds)
        true_negatives = per_column[:,0,0]
        false_negatives = per_column[:,1,0]
        false_positives = per_column[:,0,1]
        true_positives = per_column[:,1,1]
    else:
        single = confusion_matrix(targets, hard_preds)
        true_negatives = single[0,0]
        false_negatives = single[1,0]
        false_positives = single[0,1]
        true_positives = single[1,1]

    total = true_positives + true_negatives + false_positives + false_negatives
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1 = 2 * (precision * recall) / (precision + recall)
    return {
        'accuracy': (true_positives + true_negatives) / total,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'rocauc': rocauc
    }

def get_tweets_embeddings(tweets, codes):
    """Fetch the stored embedding for each tweet code and stack them into a tensor.

    Args:
        tweets: not used by the lookup; kept so existing call sites keep working.
        codes: sequence of tweet codes. Each one is looked up as a label in
            the index of the module-level `openai_embs` frame.

    Returns:
        float32 tensor on `device` with one row per entry of `codes`, in the
        same order.
    """
    vectors = [openai_embs.loc[code]['embedding'] for code in codes]
    # Stack into a single ndarray before handing it to torch. Building a
    # tensor straight from a Python list of per-row arrays is the slow path,
    # and the saved output of the embeddings cell shows a warning pointing at
    # that exact call.
    stacked = np.asarray(vectors, dtype=np.float32)
    return torch.tensor(stacked, device=device, dtype=torch.float32)

def calculate_pos_weights(data):
    """Compute one positive-class weight per column as negatives / positives.

    Args:
        data: frame of 0/1 indicator columns, one column per label.

    Returns:
        1-D float tensor on `device` with `neg_count / (pos_count + 1e-5)`
        for every column, in column order.
    """
    # Work in float from the start. With integer label columns the counts are
    # integers, and writing the ratios into an integer array (as
    # `np.ones_like(class_counts)` would produce) truncates them.
    pos_counts = data.sum(axis=0).to_numpy(dtype=np.float64)
    neg_counts = len(data) - pos_counts
    # The small epsilon keeps a column with no positives from dividing by zero.
    pos_weights = neg_counts / (pos_counts + 1e-5)
    return torch.as_tensor(pos_weights, dtype=torch.float, device=device)

def read_data(all=False):
    """Load one of the two tweet tables and add a 'code' column.

    The code is the last '/'-separated segment of each row's 'permalink'.

    Args:
        all: when truthy, read './data/combined_reports.csv' as-is. Otherwise
            read './data/data.xlsx' and zero out a handful of hand-picked
            cells before deriving the code.

    Returns:
        The loaded frame with the extra 'code' column.
    """
    if all:
        frame = pd.read_csv('./data/combined_reports.csv')
    else:
        frame = pd.read_excel('./data/data.xlsx')
        # Manual corrections of odd values: (row label, column) -> 0.
        manual_zeroes = (
            (596, 'Not about Sudan'),
            (680, 'pro RSF'),
            (774, 'Likely bot'),
            (774, 'Likely not a bot'),
            (687, 'anti SAF'),
        )
        for row_label, column in manual_zeroes:
            frame.at[row_label, column] = 0
    frame['code'] = [link.split('/')[-1] for link in frame['permalink'].values]
    return frame

def get_xy(df, tweets_col = 'post', labels_cols=None):
    """Split a frame into text, label and code parts with fresh 0..n-1 indexes.

    Args:
        df: source frame; must contain `tweets_col` and a 'code' column.
        tweets_col: name of the text column.
        labels_cols: label columns to keep. When empty or None, every column
            except `tweets_col` is returned as labels (this includes 'code').

    Returns:
        (X, Y, codes): the text Series, the label DataFrame and the 'code'
        Series, each re-indexed from 0.
    """
    X = df[tweets_col].reset_index(drop=True)
    if labels_cols:
        Y = df[labels_cols].reset_index(drop=True)
    else:
        # Drop the column that was actually used as text. The previous
        # hard-coded 'post' was only correct for the default `tweets_col`.
        Y = df.drop(columns=[tweets_col]).reset_index(drop=True)
    codes = df['code'].reset_index(drop=True)
    return X, Y, codes



In [2]:
# Hold-out fraction and seed used by the train/val/test split further down.
test_size, random_state = 0.3, 42
# Raw annotation columns as they are spelled in the spreadsheet.
labels = [
    'pro RSF', 'anti RSF', 'anti SAF', 'pro SAF', 'Pro peace,', 'anti peace', 'Pro War',
    'anti war', 'pro civilian', 'anti civilians', 'no polarisation', 'Geopolticis',
    'Sudanese', 'Not Sudanese',
]
# Keep only fully populated rows and renumber them from 0.
df = (
    read_data()
    .dropna()
    .reset_index(drop=True)
)
print(df.shape)


(867, 39)


In [3]:
# Each merged target is built from its (pro, anti) pair of indicator columns.
stance_pairs = {
    'RSF': ('pro RSF', 'anti RSF'),
    'SAF': ('pro SAF', 'anti SAF'),
    'peace': ('Pro peace,', 'anti peace'),
    'war': ('Pro War', 'anti war'),
    'civilians': ('pro civilian', 'anti civilians'),
}
for merged_name, (pro_name, anti_name) in stance_pairs.items():
    df[merged_name] = merge_pro_anti(df, pro_name, anti_name)
# From here on the merged columns replace the individual pro/anti ones as targets.
labels = list(stance_pairs) + ['no polarisation', 'Geopolticis', 'Sudanese', 'Not Sudanese']

In [4]:
X, Y, codes = get_xy(df, labels_cols=labels)
# First cut: `test_size` of the rows are held out from training.
(X_train, X_holdout,
 Y_train, Y_holdout,
 codes_train, codes_holdout) = train_test_split(
    X, Y, codes, test_size=test_size, random_state=random_state
)
# Second cut, with the same fraction and seed: `test_size` of the hold-out
# becomes the test set and the remainder the validation set.
(X_val, X_test,
 Y_val, Y_test,
 codes_val, codes_test) = train_test_split(
    X_holdout, Y_holdout, codes_holdout, test_size=test_size, random_state=random_state
)
print(f'Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}')


Train: 606, Val: 182, Test: 79


In [5]:
# One embedding matrix per split, looked up by tweet code.
train_embeddings, val_embeddings, test_embeddings = (
    get_tweets_embeddings(texts.to_list(), codes=split_codes.to_list())
    for texts, split_codes in (
        (X_train, codes_train),
        (X_val, codes_val),
        (X_test, codes_test),
    )
)
# Matching label matrices as float32 tensors.
train_labels, val_labels, test_labels = (
    torch.tensor(split_labels.to_numpy(), dtype=torch.float32)
    for split_labels in (Y_train, Y_val, Y_test)
)
num_classes = len(labels)
embeddings_dim = train_embeddings.shape[1]
print(f'Num Classes = {num_classes}')
print(f'Emb Dim = {embeddings_dim}')

Num Classes = 9
Emb Dim = 3072


  tweets_embeddings = torch.tensor(tweets_embeddings, device=device, dtype=torch.float32)


In [None]:
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

def multi_class_weights(data, label):
    """Return "balanced" class weights for one label column as a tensor.

    Args:
        data: frame containing the column `label`.
        label: name of the column whose values are the class ids.

    Returns:
        float32 tensor on `device` with one weight per distinct value of the
        column, ordered like `np.unique` of that column.
    """
    targets = data[label].values
    present_classes = np.unique(targets)
    balanced = compute_class_weight(class_weight="balanced", classes=present_classes, y=targets)
    return torch.tensor(balanced, dtype=torch.float32).to(device=device)

def score_model(model, dataloader, multi_calss = 'raise'):
    """Score a single-label classifier: accuracy of the argmax class plus ROC AUC.

    This definition shadows the earlier `score_model` in this notebook once
    its cell has run.

    Args:
        model: module called as `model(x)`; expected to return one score per
            class for every row.
        dataloader: iterable of `(x, y)` batches, where `y` holds one class id
            per row.
        multi_calss: forwarded to `roc_auc_score(multi_class=...)`. 'raise' is
            what the callers in this notebook pass for two-class problems.

    Returns:
        dict with keys 'rocauc' and 'accuracy'.
    """
    model.eval()
    target_batches, output_batches = [], []
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            target_batches.append(y)
            output_batches.append(model(x))
    ground_truth = torch.concat(target_batches).detach().cpu()
    predictions = torch.concat(output_batches).detach().cpu()

    # Hard class decision per row, computed once and reused below.
    predicted_classes = [row.argmax().item() for row in predictions]
    acc = accuracy_score([target.item() for target in ground_truth], predicted_classes)

    if multi_calss == 'raise':
        if predictions.ndim == 2 and predictions.shape[1] == 2:
            # Binary case: rank rows by the score of the second class. Feeding
            # argmax labels to roc_auc_score collapses the ROC curve to a
            # single operating point and understates ranking quality.
            auc_scores = predictions[:, 1]
        else:
            # Any other output shape keeps the previous hard-label behaviour.
            auc_scores = torch.tensor(predicted_classes)
    else:
        auc_scores = predictions
    rocauc = roc_auc_score(ground_truth, auc_scores, multi_class=multi_calss)

    return {
        'rocauc': rocauc,
        'accuracy': acc
    }


def train(model, train_dataloader, val_dataloader, num_epochs, optimizer, loss_fn, model_name):
    """Train `model`, evaluating and checkpointing on every 50th epoch.

    On evaluation epochs (0, 50, 100, ...) the validation loss, ROC AUC and
    accuracy are computed and printed. The weights are written to
    './models/best_{model_name}.pth' whenever the validation ROC AUC improves.
    The current weights are written to './models/lattest_{model_name}.pth'
    after every epoch.

    The printed "Train Loss" is the loss of the last training batch of the
    epoch; "Val Loss" is the sum over all validation batches.

    Args:
        model: module called as `model(x, logits=True)`.
        train_dataloader: iterable of `(x, y)` training batches.
        val_dataloader: iterable of `(x, y)` validation batches.
        num_epochs: number of passes over `train_dataloader`.
        optimizer: optimizer bound to the parameters of `model`.
        loss_fn: callable `(outputs, targets) -> loss`.
        model_name: suffix used in the checkpoint file names.

    Returns:
        The trained model. The per-evaluation history is collected in a local
        `logs` dict but is not returned.
    """
    logs = {
        'train_loss': [],
        'val_loss': [],
        'train_rocauc':[],
        'val_rocauc':[]
    }
    val_rocauc_max = 0
    for epoch in tqdm(range(num_epochs)):
        model.train()
        for x, y in train_dataloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            y_pred = model(x, logits=True)
            loss = loss_fn(y_pred, y)
            loss.backward()
            optimizer.step()
        if epoch%50 == 0:
            model.eval()
            with torch.no_grad():
                val_loss = 0
                for x, y in val_dataloader:
                    x = x.to(device)
                    y = y.to(device)
                    # Ask for the same kind of output as the training step so
                    # that `loss_fn` sees identical inputs in both places and
                    # the two logged losses are comparable.
                    # Assumes `logits=True` selects the raw outputs, as the
                    # training step above already relies on -- TODO confirm.
                    y_pred = model(x, logits=True)
                    val_loss += loss_fn(y_pred, y)
            # A two-column output is scored as a binary problem, anything
            # else one-vs-one.
            multi_class = 'raise' if y_pred.shape[1] == 2 else 'ovo'
            # One scoring pass per loader yields both metrics.
            train_scores = score_model(model, train_dataloader, multi_calss=multi_class)
            val_scores = score_model(model, val_dataloader, multi_calss=multi_class)
            train_rocauc, train_acc = train_scores['rocauc'], train_scores['accuracy']
            val_rocauc, val_acc = val_scores['rocauc'], val_scores['accuracy']
            if val_rocauc > val_rocauc_max:
                val_rocauc_max = val_rocauc
                torch.save(model.state_dict(), f'./models/best_{model_name}.pth')
            print(f'Epoch: {epoch} -- Train Loss: {loss.item() :.4f} RocAuc = {train_rocauc*100 :.4f} Acc = {train_acc*100 :.4f}|| Val Loss: {val_loss.item() :.4f} RocAuc = {val_rocauc*100 :.4f} Acc = {val_acc*100 :.4f}')
            logs['train_loss'].append(loss.item())
            logs['val_loss'].append(val_loss.item())
            logs['train_rocauc'].append(train_rocauc)
            logs['val_rocauc'].append(val_rocauc)
        torch.save(model.state_dict(), f'./models/lattest_{model_name}.pth')
    return model



In [None]:
# Hyper-parameters shared by every per-label model. `batch_size` and
# `hidden_dim` are read again by the evaluation cell at the end of the notebook.
num_epochs = 300
batch_size = 512
learning_rate = 1e-4
hidden_dim = 2048

# NOTE(review): `TweetClassiferMultiClass` is not among the names the first
# cell imports from `model` (only `TweetsDataset` and `TweetClassifer`) --
# confirm it is defined before this cell runs on a fresh kernel.

# Train one single-label, multi-class classifier per target column.
for i,label in enumerate(labels):
    print(i,label, 'WITH CLASS WEIHGTS')
    train_ds = TweetsDataset(train_embeddings, train_labels[:,i])
    val_ds = TweetsDataset(val_embeddings, val_labels[:,i])
    test_ds = TweetsDataset(test_embeddings, test_labels[:,i])
    train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
    test_dataloader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)
    # The output width follows the classes present in the training split.
    num_classes = len(train_labels[:,i].unique())
    model = TweetClassiferMultiClass(embeddings_dim, num_classes, hidden_dim=hidden_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # Class weights come from the training split only. That keeps validation
    # and test rows out of the loss, and gives exactly one weight per class
    # counted in `num_classes` above.
    loss_fn = torch.nn.CrossEntropyLoss(weight=multi_class_weights(Y_train, label))
    model_name = f'multiclass_singlelabel_{label}'
    model = train(model, train_dataloader, val_dataloader, num_epochs, optimizer, loss_fn, model_name)
    print('==========================================')


In [None]:
def predict_multi_class(dl, ds, labels, embeddings_dim, hidden_dim, device):
    """Score the saved best checkpoint of every per-label model.

    For label number `i`, the model is rebuilt with as many outputs as there
    are distinct values in `ds[:, i]`. The weights are loaded from
    './models/best_multiclass_singlelabel_{label}.pth' and the model is scored
    with `score_model`.

    NOTE(review): the output width is derived from `ds`. If `ds` holds fewer
    distinct classes for a label than the data the checkpoint was trained on,
    loading the state dict will fail -- confirm which tensor callers pass.

    Args:
        dl: iterable of `(x, y)` batches. When `y` carries one column per
            label, column `i` is selected for label `i`.
        ds: 2-D label tensor, indexed as `ds[:, i]`.
        labels: label names, in column order.
        embeddings_dim: input width of the models.
        hidden_dim: hidden width of the models.
        device: device the models are created on and checkpoints mapped to.

    Returns:
        dict mapping each label name to the dict returned by `score_model`.
    """
    def _single_label_batches(column):
        # `score_model` reads one class id per row, so hand it only the
        # column of the label being scored.
        for features, targets in dl:
            if targets.ndim == 2 and targets.shape[1] > column:
                targets = targets[:, column]
            yield features, targets

    all_scores = {}
    for i,label in enumerate(labels):
        num_classes = len(ds[:,i].unique())
        print(label)
        model_name = f'multiclass_singlelabel_{label}'
        model = TweetClassiferMultiClass(embeddings_dim, num_classes, hidden_dim=hidden_dim).to(device)
        # map_location lets a checkpoint saved on one device load on another.
        state = torch.load(f'./models/best_{model_name}.pth', map_location=device)
        model.load_state_dict(state)
        model.eval()
        multi_class = 'raise' if num_classes == 2 else 'ovo'
        all_scores[label] = score_model(model, _single_label_batches(i), multi_calss=multi_class)
    return all_scores


# Evaluate every per-label model on the held-out test split.
test_ds = TweetsDataset(test_embeddings, test_labels)
test_dataloader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)
# `ds` is indexed as `ds[:, i]` inside predict_multi_class, so it has to be
# the 2-D label tensor. The list of label names goes in the `labels` slot only.
scores = predict_multi_class(test_dataloader, test_labels, labels, embeddings_dim, hidden_dim, device)
    