In [37]:
# Imports

import sys


import numpy as np
import pandas as pd
import random
import sklearn
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import cluster
from sklearn import metrics
from sklearn.manifold import TSNE
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, SGDRegressor, Perceptron, LogisticRegression

import nltk
import tqdm

In [17]:
# Load dataset

url='https://drive.google.com/file/d/1N7z7Nn4IuQjZSv1eqLh32HKrKKaLrsoQ/view?usp=sharing'
file_id=url.split('/')[-2]
dwn_url='https://drive.google.com/uc?id=' + file_id
df=pd.read_csv(dwn_url)

In [111]:
# Create training and testing splits
data = df.drop('Sex', axis = 1)
data

Unnamed: 0,Prompt,Risk
0,"Using the following characteristics, assess wh...",good
1,"Using the following characteristics, assess wh...",bad
2,"Using the following characteristics, assess wh...",good
3,"Using the following characteristics, assess wh...",good
4,"Using the following characteristics, assess wh...",bad
...,...,...
995,"Using the following characteristics, assess wh...",good
996,"Using the following characteristics, assess wh...",good
997,"Using the following characteristics, assess wh...",good
998,"Using the following characteristics, assess wh...",bad


In [113]:
# Train/Test Splits
X = data['Prompt']
y = data['Risk']
y_gender = df['Sex']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = False, stratify = None)


In [115]:
# Tokenisation

def nltk_tokenization(text):
    tokens = nltk.word_tokenize(text)
    return tokens

def built_in_tokenization(text):
    tokens = text.split()
    return tokens

In [117]:
# Fit a standard ML model to the data

clf = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=built_in_tokenization)),
    ('selection', SelectKBest(chi2, k=894)),
#     ('classifier', LogisticRegression())
    ('classifier', SGDClassifier(warm_start=True, loss='log_loss', n_jobs=64, max_iter=75, random_state=0))
])

clf.fit(X_train, y_train)
   



In [119]:
clf.score(X_test, y_test)

0.705

In [121]:
# Debiasing 
X_train_one_hot = clf.named_steps['selection'].transform(clf.named_steps['vectorizer'].transform(X_train))
X_test_one_hot = clf.named_steps['selection'].transform(clf.named_steps['vectorizer'].transform(X_test))



In [123]:
# Arrays for sensitive features

X_train_gender, X_test_gender, y_train_gender, y_test_gender = train_test_split(X, y_gender, test_size = 0.2, shuffle = False, stratify = None)

In [159]:
def get_projection_matrix(num_clfs, X_train, y_train, X_test, y_test, y_train_main, y_test_main, dim=300):

    is_autoregressive = True
    reg = "l2"
    min_acc = 0.
    noise = False
    random_subset = False
    regression = False
    
    clf = SGDClassifier
    params = {'warm_start': True, 'loss': 'log_loss', 'n_jobs': 64, 'max_iter': 100, 'random_state': 0}

    P = old_debias.get_debiasing_projection(clf, params, num_clfs, dim, is_autoregressive,
                                           min_acc, X_train, Y_train, X_test, Y_test,
                                           by_class=True, Y_train_main=Y_train_main, Y_testv_main=Y_test_main)
    return P



num_clfs = 40
y_test_gender = np.array(y_test_gender)
y_train_gender = np.array(y_train_gender)
y_test_prof = np.array(y_test)
y_train_prof = np.array(y_train)

n_examples = 1000

In [153]:
# an abstract class for linear classifiers

class Classifier(object):

    def __init__(self):

        pass

    def train(self, X_train: np.ndarray, y_train: np.ndarray, X_test: np.ndarray, y_test: np.ndarray) -> float:
        """

        :param X_train:
        :param y_train:
        :param X_test:
        :param y_test:
        :return: accuracy score on the dev set
        """
        raise NotImplementedError

    def get_weights(self) -> np.ndarray:
        """
        :return: final weights of the model, as np array
        """

        raise NotImplementedError




class SKlearnClassifier(Classifier):

    def __init__(self, m):

        self.model = m

    def train_network(self, X_train: np.ndarray, y_train: np.ndarray, X_test: np.ndarray, y_test: np.ndarray) -> float:

        """
        :param X_train:
        :param y_train:
        :param X_test:
        :param y_test:
        :return: accuracy score on the dev set / Person's R in the case of regression
        """

        self.model.fit(X_train, y_train)
        score = self.model.score(X_test, y_test)
        return score

    def get_weights(self) -> np.ndarray:
        """
        :return: final weights of the model, as np array
        """

        w = self.model.coef_
        if len(w.shape) == 1:
                w = np.expand_dims(w, 0)

        return w


In [157]:
from typing import Dict

def get_debiasing_projection(classifier_class, cls_params: Dict, num_classifiers: int, input_dim: int,
                             is_autoregressive: bool,
                             min_accuracy: float, X_train: np.ndarray, y_train: np.ndarray, X_test: np.ndarray,
                             y_test: np.ndarray, noise=False, random_subset=1., by_class=True, y_train_main=None,
                             y_test_main=None) -> np.ndarray:
    """
    :param classifier_class:
    :param num_classifiers:
    :param input_dim:
    :param is_autoregressive:
    :param min_accuracy:
    :param X_train:
    :param y_train:
    :param X_test:
    :param y_test:
    :return: the debiasing projection
    """

    if by_class and ((y_train_main is None) or (y_test_main is None)): raise Exception()

    P = np.eye(input_dim)
    X_train_cp = X_train.copy()
    X_train_cp = X_train.copy()
    labels_set = list(set(y_train.tolist()))
    main_task_labels = list(set(y_train_main.tolist()))

    if noise:
        print("Adding noise.")
        mean = np.mean(np.abs(X_train))
        mask_train = 0.0075 * (np.random.rand(*X_train.shape) - 0.5)

        X_train_cp += mask_train

    pbar = tqdm(range(num_classifiers))
    for i in pbar:

        x_t, y_t = X_train_cp.copy(), y_train.copy()

        clf = classifier.SKlearnClassifier(classifier_class(**cls_params))

        idx = np.random.rand(x_t.shape[0]) < random_subset
        x_t = x_t[idx]
        y_t = y_t[idx]

        acc = clf.train_network(x_t, y_t, X_test_cp, y_test)
        pbar.set_description("iteration: {}, accuracy: {}".format(i, acc))
        if acc < min_accuracy: continue

        W = clf.get_weights()
        P_i = get_nullspace_projection(W)
        P = P.dot(P_i)

        if is_autoregressive:
            X_train_cp = X_train_cp.dot(P_i)
            X_test_cp = X_test_cp.dot(P_i)

    return P


In [165]:
P = get_debiasing_projection(40, X_train_one_hot[:n_examples],
                          y_train_gender[:n_examples], X_test_one_hot[:n_examples], y_test_gender[:n_examples],
                             y_train[:n_examples], y_test[:n_examples], dim = 10000)

TypeError: get_debiasing_projection() got an unexpected keyword argument 'dim'