In [1]:
import pandas as pd
import numpy as np
from typing import Tuple, Dict, List

In [77]:
def adult_income_data_loader() -> Dict[str, List[np.ndarray]]:
    df = pd.read_csv("adult.csv")
    df.drop(df.index[df['workclass'] == '?'], inplace=True)
    df.drop(df.index[df['occupation'] == '?'], inplace=True)
    df.drop(df.index[df['native-country'] == '?'], inplace=True)
    df.dropna(how='any', inplace=True)
    df = df.drop_duplicates()
    df.drop(['education'], axis=1, inplace=True)
    df['net_capital'] = (df['capital-gain'] - df['capital-loss']).astype(int)
    df.drop(['capital-gain', 'capital-loss'], axis=1, inplace=True)
    # changing class from >50K and <=50K to 1 and 0
    df['income'] = df['income'].astype(str)
    df['income'] = df['income'].replace('>50K', 1)
    df['income'] = df['income'].replace('<=50K', -1)
    # changing class from Male and Female to 1 and 0
    df['gender'] = df['gender'].astype(str)
    df['gender'] = df['gender'].replace('Male', 1)
    df['gender'] = df['gender'].replace('Female', 0)
    print(df.columns)
    b = df.iloc[:, [0, 2, 3, 9, 12]]
    ys = df['income'].to_numpy()
    ys = ys.reshape(ys.shape[0], 1)
    genders = df['gender'].to_numpy()
    names = b.columns
    xs = pd.DataFrame(b, columns=names).to_numpy()
    xs = np.float64(xs)
    # normalize features
    xs /= np.max(xs, axis=0, keepdims=True)
    idx = np.random.RandomState(42).permutation(xs.shape[0])
    data = {'train': [xs[idx[:10000], :], ys[idx[:10000], :], genders[idx[:10000]]],
            'test': [xs[idx[10000:20000], :], ys[idx[10000:20000], :], genders[idx[10000:20000]]]}
    return data

In [78]:
data = adult_income_data_loader()

Index(['age', 'workclass', 'fnlwgt', 'educational-num', 'marital-status',
       'occupation', 'relationship', 'race', 'gender', 'hours-per-week',
       'native-country', 'income', 'net_capital'],
      dtype='object')


In [75]:
data['train'][2].__len__()

10000

In [79]:
data['train'][1]

array([[-1],
       [-1],
       [-1],
       ...,
       [ 1],
       [-1],
       [-1]])

In [4]:
xs, ys = data['train'][0], data['train'][1]

In [26]:
pd.get_dummies(data['train'][2]).to_numpy()

array([[1, 0],
       [0, 1],
       [1, 0],
       ...,
       [0, 1],
       [0, 1],
       [1, 0]], dtype=uint8)

In [11]:
xs

array([[0.21111111, 0.1255777 , 0.625     , 0.18181818, 0.        ],
       [0.32222222, 0.10788111, 0.625     , 0.46464646, 0.        ],
       [0.54444444, 0.13712896, 0.4375    , 0.60606061, 0.        ],
       ...,
       [0.54444444, 0.12168747, 0.875     , 0.45454545, 0.        ],
       [0.26666667, 0.20168009, 0.5625    , 0.4040404 , 0.        ],
       [0.3       , 0.07639962, 0.5625    , 0.3030303 , 0.        ]])

In [5]:
print(xs[np.where(ys == 1)[0]].shape)
print(xs[np.where(ys == 0)[0]].shape)

(2453, 5)
(7547, 5)


In [8]:
a = np.array([[1],[-1],[0.5]])
a
b = a.copy()
b

array([[ 1. ],
       [-1. ],
       [ 0.5]])

In [9]:
a[np.where(a < 0)[0]] = 0 
a

array([[1. ],
       [0. ],
       [0.5]])

In [10]:
b

array([[ 1. ],
       [-1. ],
       [ 0.5]])

In [58]:
m1 = np.mean(xs1, axis = 0).reshape(-1, 1)

In [59]:
xs1 = xs1 - np.mean(xs1, axis = 0)
xs1

array([[-0.09795715,  0.00699212,  0.02675296, -0.0586048 , -0.0368412 ],
       [ 0.10204285, -0.09867616, -0.16074704, -0.0586048 , -0.0368412 ],
       [ 0.05759841, -0.01344736,  0.21425296,  0.39594065, -0.0368412 ],
       ...,
       [ 0.06870952, -0.04313939, -0.09824704,  0.5373548 , -0.0368412 ],
       [-0.04240159,  0.1453751 , -0.09824704, -0.0586048 , -0.0368412 ],
       [ 0.05759841, -0.00136268,  0.15175296, -0.00809975, -0.0368412 ]])

In [148]:
def linear_discriminant_analysis_2class(xs: np.ndarray, ys: np.ndarray) -> Tuple[np.ndarray, float]:
    """
    Learning a LDA model for two classes: learning w and c for checking x^T w > c or not
    :param xs: training data with size (N, D)
    :param ys: training labels with size (N, 1), whose element is 0 or 1

    :return:
        the weights "w" of LDA with size (D, 1),
        the criterion "c"
    """
    # TODO: Implement the LDA method and output the projection vector and the criterion
    xs1 = xs[np.where(ys == 1)[0]]
    xs0 = xs[np.where(ys == 0)[0]]
    m1 = np.mean(xs1, axis = 0).reshape(-1, 1)
    m0 = np.mean(xs0, axis = 0).reshape(-1, 1)
    xs1 = xs1 - np.mean(xs1, axis = 0)
    xs0 = xs1 - np.mean(xs0, axis = 0)
    xs2 = xs - np.mean(xs, axis = 0)
    # Sw = (xs1.T @ xs1) / (xs1.shape[0]) + (xs0.T @ xs0) / xs0.shape[0]
    Sw = xs2.T @ xs2 / (xs2.shape[0] - 1)
    W = np.linalg.inv(np.diag(Sw) * np.eye(Sw.shape[0])) @ (m1 - m0)
    c = (W.T @ (0.5 * (m1 + m0))).item()
    return W, c

In [149]:
w, c = linear_discriminant_analysis_2class(xs, ys)

In [150]:
np.array([1,3,5]) * np.eye(3)

array([[1., 0., 0.],
       [0., 3., 0.],
       [0., 0., 5.]])

In [151]:
ypred = xs @ w
ypred

array([[4.43248012],
       [6.11339017],
       [6.58467303],
       ...,
       [8.0305007 ],
       [5.2424563 ],
       [5.03989542]])

In [152]:
ypred[ypred > c] = 1
ypred[ypred != 1] = 0

In [153]:
1 - (np.sum(np.abs(ypred - ys)) / ypred.shape[0])

0.7374

In [63]:
data['train'][2]

array([0, 1, 0, ..., 1, 1, 0])

In [81]:
data['train'][1] , data['train'][2]

(array([[-1],
        [-1],
        [-1],
        ...,
        [ 1],
        [-1],
        [-1]]),
 array([0, 1, 0, ..., 1, 1, 0]))

In [96]:
c = data['train'][1][data['train'][2] == 1]

In [97]:
d = data['train'][1][data['train'][2] == 0]

In [100]:
c[c == 1].sum() / c.shape[0]

0.3106465005931198

In [99]:
d[d == 1].sum() / d.shape[0]

0.10995085995085994

In [103]:
ind = np.where(data['train'][2] == 0)[0]

In [104]:
ind

array([   0,    2,    4, ..., 9991, 9993, 9999])

In [105]:
np.random.RandomState(1).shuffle(ind)
ind

array([3820, 2971,  966, ..., 3285,  755, 3195])