# Mutual information

The purpose of this notebook is to find the features with the highest mutual information with both the target `y` and the sensitive features `s9` (here the loan applicant's sex).

In [1]:
# import everything required
import numpy as np
import pandas as pd
import keras as ks
import matplotlib.pyplot as plt
from sklearn import feature_selection
from sklearn.metrics.cluster import normalized_mutual_info_score as mi

# for reproducibility
np.random.seed(123)

# load data
PATH="datasets/german_credit.csv"
raw_data = pd.read_csv(PATH, index_col=False)
df = pd.DataFrame(raw_data)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def mutual_info_stats(df, sensitive_name):

    sensitive = df[sensitive_name]
    df = df.drop(sensitive_name, axis=1)
    
    mutual_informations = np.zeros((df.shape[1], 1))

    for i, col in enumerate(df.columns):
        mutual_informations[i] = mi(df[col], sensitive)
        
    maximum = np.amax(mutual_informations)
    index_of_max = np.argmax(mutual_informations)
    
    print("There is maximum mutual information of", maximum, "between", sensitive_name, "and", df.columns[index_of_max])
    
    return mutual_informations, sum(mutual_informations)

print(mutual_info_stats(df, 's9'))
print(mutual_info_stats(df, 'y'))

There is maximum mutual information of 0.37008620506805123 between s9 and ns5
(array([[0.0056476 ],
       [0.03453689],
       [0.01194647],
       [0.02432974],
       [0.37008621],
       [0.00523346],
       [0.03399761],
       [0.01329708],
       [0.00362657],
       [0.01488925],
       [0.01970804],
       [0.07196294],
       [0.00455454],
       [0.04670449],
       [0.01101005],
       [0.00946846],
       [0.06826694],
       [0.00472697],
       [0.00564488],
       [0.00586109]]), array([0.76549928]))
There is maximum mutual information of 0.2802530349639538 between y and ns5
(array([[7.51771477e-02],
       [3.47819316e-02],
       [3.55114453e-02],
       [1.62383444e-02],
       [2.80253035e-01],
       [2.30526511e-02],
       [9.50704284e-03],
       [3.14594511e-03],
       [5.86109088e-03],
       [6.96359572e-03],
       [4.25781454e-04],
       [1.29640471e-02],
       [2.11531997e-02],
       [1.02862592e-02],
       [1.27290198e-02],
       [1.97841899e-03],
 

I found that the feature with the highest mutual information with the target `y` is also the feature with the highest mutual information with the sensitive feature `s9`. That feature is `ns5`, i.e. the applicant's credit score. 

## But what happens when you fiddle with `ns5`?

In [3]:
#define y
y = df["y"]

# train-test split
X_train = df[0:800]
X_val = df[800:900]
X_test = df[900:1000]
y_train = y[0:800]
y_val = y[800:900]
y_test = y[900:1000]

In [4]:
#fiddle
X_train_fiddled = X_train.copy()
X_train_fiddled['ns5'] += np.transpose(np.random.randn(800))*1000000

In [5]:
X_train.head(8)

Unnamed: 0,ns1,ns2,ns3,ns4,ns5,ns6,ns7,ns8,s9,ns10,...,ns12,s13,ns14,ns15,ns16,ns17,ns18,ns19,ns20,y
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2
5,A14,36,A32,A46,9055,A65,A73,2,A93,A101,...,A124,35,A143,A153,1,A172,2,A192,A201,1
6,A14,24,A32,A42,2835,A63,A75,3,A93,A101,...,A122,53,A143,A152,1,A173,1,A191,A201,1
7,A12,36,A32,A41,6948,A61,A73,2,A93,A101,...,A123,35,A143,A151,1,A174,1,A192,A201,1


In [6]:
X_train_fiddled.head(8)

Unnamed: 0,ns1,ns2,ns3,ns4,ns5,ns6,ns7,ns8,s9,ns10,...,ns12,s13,ns14,ns15,ns16,ns17,ns18,ns19,ns20,y
0,A11,6,A34,A43,-1084462.0,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,1003296.0,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,285074.5,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,-1498413.0,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,-573730.3,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2
5,A14,36,A32,A46,1660492.0,A65,A73,2,A93,A101,...,A124,35,A143,A153,1,A172,2,A192,A201,1
6,A14,24,A32,A42,-2423844.0,A63,A75,3,A93,A101,...,A122,53,A143,A152,1,A173,1,A191,A201,1
7,A12,36,A32,A41,-421964.6,A61,A73,2,A93,A101,...,A123,35,A143,A151,1,A174,1,A192,A201,1


In [7]:
print(mutual_info_stats(X_train, 's9')*100)
print(mutual_info_stats(X_train, 'y')*100)
print(mutual_info_stats(X_train_fiddled, 's9')*100)
print(mutual_info_stats(X_train_fiddled, 'y')*100)

There is maximum mutual information of 0.37994684210682256 between s9 and ns5
(array([[0.0061986 ],
       [0.03465614],
       [0.01365316],
       [0.02564949],
       [0.37994684],
       [0.00663635],
       [0.03187217],
       [0.00950014],
       [0.00713928],
       [0.01204564],
       [0.0178462 ],
       [0.07871262],
       [0.00243965],
       [0.04146733],
       [0.00981218],
       [0.00723046],
       [0.0666405 ],
       [0.0041162 ],
       [0.00510823],
       [0.01102792]]), array([0.77169909]), array([[0.0061986 ],
       [0.03465614],
       [0.01365316],
       [0.02564949],
       [0.37994684],
       [0.00663635],
       [0.03187217],
       [0.00950014],
       [0.00713928],
       [0.01204564],
       [0.0178462 ],
       [0.07871262],
       [0.00243965],
       [0.04146733],
       [0.00981218],
       [0.00723046],
       [0.0666405 ],
       [0.0041162 ],
       [0.00510823],
       [0.01102792]]), array([0.77169909]), array([[0.0061986 ],
       [0.0346

There is maximum mutual information of 0.3968279104801994 between s9 and ns5
(array([[0.0061986 ],
       [0.03465614],
       [0.01365316],
       [0.02564949],
       [0.39682791],
       [0.00663635],
       [0.03187217],
       [0.00950014],
       [0.00713928],
       [0.01204564],
       [0.0178462 ],
       [0.07871262],
       [0.00243965],
       [0.04146733],
       [0.00981218],
       [0.00723046],
       [0.0666405 ],
       [0.0041162 ],
       [0.00510823],
       [0.01102792]]), array([0.78858015]), array([[0.0061986 ],
       [0.03465614],
       [0.01365316],
       [0.02564949],
       [0.39682791],
       [0.00663635],
       [0.03187217],
       [0.00950014],
       [0.00713928],
       [0.01204564],
       [0.0178462 ],
       [0.07871262],
       [0.00243965],
       [0.04146733],
       [0.00981218],
       [0.00723046],
       [0.0666405 ],
       [0.0041162 ],
       [0.00510823],
       [0.01102792]]), array([0.78858015]), array([[0.0061986 ],
       [0.03465

# Another interesting thing to try would be computing the SVD

In [11]:
U, s, V = np.linalg.svd(X_train, full_matrices=False)

TypeError: No loop matching the specified signature and casting
was found for ufunc svd_n_s

In [12]:
a = np.random.randn(9, 6) + 1j*np.random.randn(9, 6)
U, s, V = np.linalg.svd(a, full_matrices=True)
U.shape, V.shape, s.shape

((9, 9), (6, 6), (6,))

In [13]:
print(1j*np.random.randn(9, 6))

[[-0.-0.64260984j  0.+0.29988507j -0.-0.00825651j -0.-0.79933915j
  -0.-0.66477925j -0.-0.35561313j]
 [-0.-0.80157178j -0.-0.51305061j -0.-0.53939012j  0.+0.89537085j
   0.+1.01639127j  0.+0.93358509j]
 [ 0.+0.4267018 j -0.-0.70832248j  0.+0.95983045j -0.-0.31425059j
   0.+0.02305221j  0.+1.33822053j]
 [ 0.+0.08399286j  0.+0.24728403j -0.-1.41277949j  0.+0.48700929j
  -0.-0.98000665j  0.+1.01193966j]
 [-0.-0.18459918j -0.-2.23616884j -0.-0.3580201 j -0.-0.22803454j
   0.+0.48547523j  0.+0.67051239j]
 [-0.-0.32776424j  0.+1.01286819j -0.-3.16705533j -0.-0.713989  j
  -0.-1.11236427j -0.-1.25418351j]
 [ 0.+0.95970637j  0.+0.8291704 j -0.-0.77577002j  0.+1.178057  j
   0.+0.10146689j -0.-0.4216841 j]
 [-0.-0.6929228 j -0.-0.77827173j  0.+0.47277486j  0.+0.6501549 j
   0.+0.23850121j -0.-2.05021768j]
 [ 0.+0.29635866j  0.+0.56539656j -0.-0.66920561j  0.+0.04325054j
  -0.-1.8638843 j -0.-1.22996906j]]


In [14]:
X_train

Unnamed: 0,ns1,ns2,ns3,ns4,ns5,ns6,ns7,ns8,s9,ns10,...,ns12,s13,ns14,ns15,ns16,ns17,ns18,ns19,ns20,y
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2
5,A14,36,A32,A46,9055,A65,A73,2,A93,A101,...,A124,35,A143,A153,1,A172,2,A192,A201,1
6,A14,24,A32,A42,2835,A63,A75,3,A93,A101,...,A122,53,A143,A152,1,A173,1,A191,A201,1
7,A12,36,A32,A41,6948,A61,A73,2,A93,A101,...,A123,35,A143,A151,1,A174,1,A192,A201,1
8,A14,12,A32,A43,3059,A64,A74,2,A91,A101,...,A121,61,A143,A152,1,A172,1,A191,A201,1
9,A12,30,A34,A40,5234,A61,A71,4,A94,A101,...,A123,28,A143,A152,2,A174,1,A191,A201,2
