Source: http://blog.conceptnet.io/posts/2017/how-to-make-a-racist-ai-without-really-trying/

In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier

In [2]:
# Tokenizer pattern used by text_to_sentiment: each match begins at a word
# character (\w) and extends lazily (.*?) to the nearest following word
# boundary (\b).
TOKEN_RE = re.compile(r"\w.*?\b")

In [3]:
def load_lexicon(filename):
    """
    Read one word list from Bing Liu's sentiment lexicon
    (https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html).

    The file is decoded as Latin-1. Each line has its trailing whitespace
    removed; lines that are then empty, or that start with ';' (the
    lexicon's comment marker), are left out.

    Returns the remaining lines, in file order, as a list of strings.
    """
    with open(filename, encoding='latin-1') as infile:
        # Lazily strip each line; the generator is consumed before the
        # file is closed because the list is built inside the `with`.
        stripped = (raw.rstrip() for raw in infile)
        return [
            entry
            for entry in stripped
            if entry and not entry.startswith(';')
        ]

def load_embeddings(filename):
    """
    Load word vectors from a space-separated UTF-8 text file.

    Each data line is ``<label> <v1> <v2> ...``. Blank lines are skipped,
    and so are lines with exactly two fields (presumably a word2vec-style
    ``<count> <dim>`` header -- confirm against the data file).

    Every vector is scaled to unit L2 length so that later geometric
    operations work on directions only; an all-zero vector is left as
    zeros rather than turned into NaN.

    A progress counter is printed every 100000 input lines.

    Returns a float32 DataFrame indexed by label, one row per vector.
    An input with no data lines yields an empty DataFrame.
    """
    labels = []
    rows = []
    with open(filename, encoding='utf-8') as infile:
        for i, line in enumerate(infile):
            if i % 100000 == 0:
                print(i)
            stripped = line.rstrip()
            if not stripped:
                # A blank line carries no label and no vector.
                continue
            items = stripped.split(' ')
            if len(items) == 2:
                continue
            labels.append(items[0])
            rows.append(np.array([float(x) for x in items[1:]], 'f'))

    if not rows:
        # np.vstack refuses an empty list; return an empty frame instead.
        return pd.DataFrame(index=labels, dtype='f')

    arr = np.vstack(rows)

    # Normalize all rows in one vectorized step instead of once per line.
    norms = np.linalg.norm(arr, axis=1, keepdims=True)
    norms[norms == 0] = 1  # keep zero vectors as zeros (no division by 0)
    arr /= norms

    return pd.DataFrame(arr, index=labels, dtype='f')

def vecs_to_sentiment(vecs):
    """
    Turn a batch of vectors into one sentiment score per row.

    The module-level `model` supplies per-class log probabilities through
    predict_log_proba. The score is column 1 minus column 0 of that
    result, which the notebook treats as log P(positive) minus
    log P(negative): above zero leans positive, below zero leans negative.
    """
    log_probs = model.predict_log_proba(vecs)
    negative = log_probs[:, 0]
    positive = log_probs[:, 1]
    return positive - negative


def words_to_sentiment(words):
    """
    Score each word in `words` that has a vector in `embeddings`.

    Words without a usable vector are silently left out. Returns a
    DataFrame with a single 'sentiment' column, indexed by the words that
    were found (duplicates in `words` are kept).

    NOTE(review): this reads the module-level `embeddings` frame, whereas
    the classifier in this notebook is fitted on rows taken from
    `embeddings_debiased` -- confirm that this is intended.
    """
    # reindex() produces all-NaN rows for labels missing from the index,
    # which dropna() then removes. `.loc[list]` would instead raise
    # KeyError on the first unknown word in current pandas releases.
    vecs = embeddings.reindex(index=words).dropna()
    if vecs.empty:
        # Nothing to score; avoid handing the classifier zero samples.
        return pd.DataFrame({'sentiment': np.empty(0)}, index=vecs.index)
    log_odds = vecs_to_sentiment(vecs)
    return pd.DataFrame({'sentiment': log_odds}, index=vecs.index)

def text_to_sentiment(text):
    """
    Return the mean sentiment score of the tokens found in `text`.

    The text is split with TOKEN_RE, every token is case-folded, the
    tokens are scored by words_to_sentiment, and the 'sentiment' column
    of that result is averaged.
    """
    folded = []
    for token in TOKEN_RE.findall(text):
        folded.append(token.casefold())
    scored = words_to_sentiment(folded)
    return scored['sentiment'].mean()

def neutralize(u, v):
    """
    Remove from vector `u` its component along vector `v`.

    Computes ``u - v * (u . v) / (v . v)``, i.e. the part of `u` that is
    orthogonal to `v`. `v` does not have to be unit length. A zero `v`
    divides by zero and yields NaNs.
    """
    return u - v * u.dot(v) / v.dot(v)

def _unit_rows(matrix):
    """Scale every row of a 2-D array to unit L2 length (zero rows become NaN)."""
    return matrix / np.linalg.norm(matrix, axis=1)[:, np.newaxis]

def debias(embedding, bias_direction, equalize):
    """
    Neutralize-and-equalize debiasing of a word-embedding table
    (cf. Bolukbasi et al., 2016).

    Steps:
      1. Every row has its component along `bias_direction` removed.
      2. Rows are rescaled to unit length.
      3. For each pair ``(a, b)`` in `equalize`, both words are replaced by
         their neutralized midpoint plus/minus a multiple of
         `bias_direction`, so they differ only along that direction.
      4. Rows are rescaled to unit length once more.

    Parameters:
      embedding      -- DataFrame of vectors, one row per word (index = words).
      bias_direction -- 1-D array with as many entries as `embedding` has
                        columns; must not be all zeros.
      equalize       -- iterable of ``(word_a, word_b)`` pairs to equalize.

    Returns a new float32 DataFrame with the same index as `embedding`.

    Side effect: as in the original implementation, the rows of the
    `embedding` argument itself are overwritten in place with their
    neutralized (not re-normalized, not equalized) values. Code in this
    notebook that keeps reading the frame passed in here sees those
    neutralized rows.

    Raises:
      ValueError -- `bias_direction` is the zero vector.
      IndexError -- a word from `equalize` is not in the index. This is
                    checked before anything is modified.
    """
    bias_direction = np.asarray(bias_direction)
    if not np.any(bias_direction):
        raise ValueError("bias_direction must be a non-zero vector")

    original = embedding.to_numpy()

    # Resolve every pair up front and record on which side of the bias
    # direction each pair originally lies. This has to happen before
    # neutralization: afterwards both vectors are orthogonal to the bias
    # direction and a sign test would only see rounding noise.
    pairs = []
    for (a, b) in equalize:
        rows_a = embedding.index == a
        rows_b = embedding.index == b
        if not rows_a.any() or not rows_b.any():
            raise IndexError(
                "equalize pair (%r, %r) is not fully present in the embedding index" % (a, b)
            )
        # `a` keeps the +z side when (a - b) points along bias_direction.
        flip = (original[rows_a][0] - original[rows_b][0]).dot(bias_direction) < 0
        pairs.append((rows_a, rows_b, flip))

    print("Neutralizing")
    # One matrix projection instead of a Python loop over every row.
    along_bias = original.dot(bias_direction)[:, np.newaxis] * bias_direction
    neutralized = original - along_bias / bias_direction.dot(bias_direction)
    embedding.iloc[:, :] = neutralized  # documented in-place side effect

    # Normalize
    vectors = _unit_rows(embedding.to_numpy()).astype('f')

    for rows_a, rows_b, flip in pairs:
        va = vectors[rows_a][0]
        vb = vectors[rows_b][0]
        y = neutralize((va + vb) / 2, bias_direction)
        # Rounding can push ||y||**2 marginally above 1; clamp so the square
        # root cannot produce NaN and wipe out both rows.
        z = np.sqrt(max(0.0, 1 - np.linalg.norm(y) ** 2))
        if flip:
            z = -z
        vectors[rows_a] = z * bias_direction + y
        vectors[rows_b] = -z * bias_direction + y

    # Normalize one more time
    return pd.DataFrame(_unit_rows(vectors), index=embedding.index, dtype='f')

In [4]:
# Read the word vectors stored in data/glove.6B.100d.txt into a DataFrame
# indexed by word. load_embeddings prints a counter every 100000 lines.
embeddings = load_embeddings('data/glove.6B.100d.txt')

0
100000
200000
300000


In [5]:
# The bias direction is the 'american' row minus the 'mexican' row, and the
# same two words form the only pair to equalize. Note that debias also
# overwrites the rows of `embeddings` in place while neutralizing.
embeddings_debiased = debias(
    embeddings,
    np.array(embeddings[embeddings.index == 'american'])[0]
    - np.array(embeddings[embeddings.index == 'mexican'])[0],
    [('american', 'mexican')],
)

Neutralizing
0
100000
200000
300000


In [6]:
# Load the positive and negative word lists (comment and blank lines are
# already filtered out by load_lexicon).
pos_words = load_lexicon('data/positive-words.txt')
neg_words = load_lexicon('data/negative-words.txt')

In [7]:
# Look up the debiased vector of every lexicon word. reindex() yields NaN
# rows for words that are not in the embedding index; dropna() removes them
# (along with any row that contains NaN).
pos_vectors = embeddings_debiased.reindex(index=pos_words).dropna()
neg_vectors = embeddings_debiased.reindex(index=neg_words).dropna()

In [8]:
# Assemble the labelled data set: positive rows first, then negative rows.
# Targets are +1 / -1 in that same order, and `labels` holds the matching words.
vectors = pd.concat([pos_vectors, neg_vectors])
targets = np.array([1] * len(pos_vectors.index) + [-1] * len(neg_vectors.index))
labels = [*pos_vectors.index, *neg_vectors.index]

In [9]:
# Hold out 10% of the examples for testing. Vectors, targets and word labels
# are split in parallel, and random_state=0 fixes the shuffle.
train_vectors, test_vectors, train_targets, test_targets, train_labels, test_labels = \
    train_test_split(vectors, targets, labels, test_size=0.1, random_state=0)

In [10]:
# Fit the sentiment classifier on the training vectors; `model` is the
# global read by vecs_to_sentiment.
# NOTE(review): newer scikit-learn releases are believed to spell this loss
# 'log_loss' instead of 'log' -- confirm against the installed version.
model = SGDClassifier(loss='log', random_state=0)
model.fit(train_vectors, train_targets)

SGDClassifier(loss='log', random_state=0)

In [11]:
# Held-out accuracy, with arguments ordered as (true labels, predictions).
# The fraction of matching entries does not depend on the order.
accuracy_score(test_targets, model.predict(test_vectors))

0.9038461538461539

In [12]:
# Compare the sentiment scores of the two words that defined the bias direction.
words_to_sentiment(['american','mexican'])

Unnamed: 0,sentiment
american,0.924929
mexican,0.924929


In [13]:
# Scores for the first 20 held-out lexicon words.
words_to_sentiment(test_labels).head(20)

Unnamed: 0,sentiment
cunts,-1.682033
phobic,-1.438125
narrower,-1.797236
discordance,-2.767573
sourly,-3.215923
enthusiasm,4.369476
like,0.621674
averse,-3.614169
triumphantly,1.485135
auspicious,3.183328


In [14]:
# Scores for the last 20 held-out lexicon words.
words_to_sentiment(test_labels).tail(20)

Unnamed: 0,sentiment
disadvantages,-0.998564
brilliance,2.961611
skillfully,1.708132
ire,-3.797091
modern,4.535323
obscured,-3.512352
break-ups,-1.267326
unfounded,-6.025955
scrambling,-2.524195
unexpected,-0.781261
