# Generative Models I

In [110]:
import numpy
import sklearn.datasets 

import matplotlib
import matplotlib.pyplot as plt

import scipy.linalg
import scipy.special

import project_module as pm

# Path of the training data file, relative to the notebook's working directory
inputFile = './input/trainData.txt'

# Reload project_module so that edits made to its source are picked up
# without restarting the kernel (this does NOT restart the kernel itself)
import importlib
importlib.reload(pm)

# D is indexed below as D[:, L == lab], i.e. samples are selected along the second axis
# and L holds one label per sample.
# NOTE(review): exact shapes/dtypes returned by pm.load are not visible here - confirm in project_module.
D, L = pm.load(inputFile)

In [None]:
# Enlarge the default matplotlib font sizes (general text and both tick-label groups).
# Comment out this block to keep matplotlib's defaults.
for rc_group, rc_settings in (
    ('font', {'size': 16}),
    ('xtick', {'labelsize': 16}),
    ('ytick', {'labelsize': 16}),
):
    plt.rc(rc_group, **rc_settings)

In [7]:
# Split the dataset into a training part (DTR, LTR) and a validation part (DVAL, LVAL).
# NOTE(review): the 2-to-1 train/validation ratio (and any shuffling/seed) is inferred
# from the helper's name only - confirm in project_module.
(DTR, LTR), (DVAL, LVAL) = pm.split_db_2to1(D, L)

In [8]:
# Compute a dictionary of ML parameters for each class
def Gau_MVG_ML_estimates(D, L):
    """Estimate per-class Gaussian parameters (full covariance model).

    Parameters
    ----------
    D : array indexed as ``D[:, mask]``; samples are selected along the second axis.
    L : array of labels, one per sample, compared element-wise against each label.

    Returns
    -------
    dict
        Maps every distinct label found in ``L`` to the value returned by
        ``pm.compute_mu_C`` for the samples of that class (used elsewhere in
        this notebook as a ``(mean, covariance)`` pair).
    """
    return {cls: pm.compute_mu_C(D[:, L == cls]) for cls in set(L)}

# Compute a dictionary of ML parameters for each class - Naive Bayes version of the model
# We compute the full covariance matrix and then extract the diagonal. Efficient implementations would work directly with just the vector of variances (diagonal of the covariance matrix)
def Gau_Naive_ML_estimates(D, L):
    """Estimate per-class Gaussian parameters for the Naive Bayes model.

    For each class the full covariance matrix is computed with
    ``pm.compute_mu_C`` and then multiplied element-wise by the identity
    matrix, which zeroes every off-diagonal entry and keeps the variances.

    Parameters
    ----------
    D : array indexed as ``D[:, mask]``; ``D.shape[0]`` gives the size of the
        identity mask (the number of features).
    L : array of labels, one per sample.

    Returns
    -------
    dict
        Maps every distinct label in ``L`` to ``(mean, diagonal_covariance)``.
    """
    # Identity matrix used as an element-wise mask: keeps the diagonal only
    diagonal_mask = numpy.eye(D.shape[0])

    params_by_class = {}
    for cls in set(L):
        class_mean, class_cov = pm.compute_mu_C(D[:, L == cls])
        params_by_class[cls] = (class_mean, class_cov * diagonal_mask)
    return params_by_class

# Compute a dictionary of ML parameters for each class - Tied Gaussian model
# We exploit the fact that the within-class covariance matrix is a weighted mean of the covariance matrices of the different classes
def Gau_Tied_ML_estimates(D, L):
    """Estimate per-class Gaussian parameters for the tied-covariance model.

    Every class keeps its own mean, while all classes share one covariance
    matrix: the average of the class covariance matrices weighted by the
    number of samples of each class (sum of ``C_class * n_class`` divided by
    the total number of samples ``D.shape[1]``).

    Parameters
    ----------
    D : array indexed as ``D[:, mask]``; ``D.shape[1]`` is the total sample count.
    L : array of labels, one per sample.

    Returns
    -------
    dict
        Maps every distinct label in ``L`` to ``(class_mean, shared_covariance)``;
        the same covariance object is stored for every class.
    """
    class_means = {}
    weighted_cov_sum = 0
    for cls in set(L):
        class_samples = D[:, L == cls]
        class_means[cls], class_cov = pm.compute_mu_C(class_samples)
        # Weight each class covariance by the number of samples of that class
        weighted_cov_sum = weighted_cov_sum + class_cov * class_samples.shape[1]

    shared_cov = weighted_cov_sum / D.shape[1]
    return {cls: (class_mean, shared_cov) for cls, class_mean in class_means.items()}

In [34]:
# Compute per-class log-densities. We assume classes are labeled from 0 to C-1. The parameters of each class are in hParams (for class i, hParams[i] -> (mean, cov))
def compute_log_likelihood_Gau(D, hParams):
    """Build the matrix of class-conditional Gaussian log-densities.

    Classes are assumed to be labeled ``0 .. C-1`` (``hParams`` is indexed with
    ``range(len(hParams))``); ``hParams[i]`` holds ``(mean, covariance)`` of
    class ``i``.

    Parameters
    ----------
    D : array whose second axis (``D.shape[1]``) enumerates the samples.
    hParams : mapping from class index to its Gaussian parameters.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(hParams), D.shape[1])``; row ``i`` is the output
        of ``pm.logpdf_GAU_ND`` for the parameters of class ``i``.
    """
    n_classes = len(hParams)
    log_densities = numpy.zeros((n_classes, D.shape[1]))
    for cls in range(n_classes):
        class_params = hParams[cls]
        # One row per class, one column per sample
        log_densities[cls, :] = pm.logpdf_GAU_ND(D, class_params[0], class_params[1])
    return log_densities

# Compute the log-posterior matrix from the log-likelihood matrix and the prior array
def compute_logPosterior(S_logLikelihood, v_prior):
    """Turn class-conditional log-likelihoods into class log-posteriors.

    Parameters
    ----------
    S_logLikelihood : matrix of log-likelihoods; the marginal is taken over
        axis 0, so rows are expected to enumerate the classes.
    v_prior : class priors; their logarithm is passed through ``pm.vcol``
        before being added to ``S_logLikelihood``.

    Returns
    -------
    Log-posterior matrix: joint log-density minus the marginal log-density,
    the latter obtained with ``scipy.special.logsumexp`` along axis 0.
    """
    log_prior = pm.vcol(numpy.log(v_prior))
    log_joint = S_logLikelihood + log_prior
    # log-sum-exp over the class axis gives the marginal log-density of each sample
    log_marginal = pm.vrow(scipy.special.logsumexp(log_joint, axis=0))
    return log_joint - log_marginal

In [99]:
def binaryMVGModels(name, DTR, LTR, DVAL, LVAL, ML_func=Gau_MVG_ML_estimates, prior1=1/2):
    """Train a binary Gaussian classifier, score the validation set and print a report.

    The model parameters are estimated with ``ML_func(DTR, LTR)``; the
    log-likelihood ratio ``log p(x|1) - log p(x|0)`` is computed on ``DVAL``
    and compared with the threshold ``-log(prior1 / (1 - prior1))`` (which is
    0 for the default ``prior1 = 1/2``). A sample is assigned to class 1 when
    its LLR is greater than or equal to the threshold.

    Parameters
    ----------
    name : label printed in the report header.
    DTR, LTR : training data and labels, forwarded to ``ML_func``.
    DVAL, LVAL : validation data and labels; ``DVAL.shape[1]`` is used as the
        number of validation samples and ``LVAL`` is compared with the boolean
        class-1 predictions (so labels are expected to be 0/1).
    ML_func : estimator returning the per-class parameters dictionary.
    prior1 : prior probability of class 1.

    Returns
    -------
    None. The function only prints the number of correct/wrong predictions,
    the accuracy and the error rate, followed by an empty line.
    """
    class_params = ML_func(DTR, LTR)
    log_densities = compute_log_likelihood_Gau(DVAL, class_params)

    # Log-likelihood ratio between class 1 and class 0, and prior-dependent threshold
    llr = log_densities[1] - log_densities[0]
    threshold = -numpy.log(prior1 / (1 - prior1))
    predictions_class1 = numpy.array(llr >= threshold)

    n_samples = DVAL.shape[1]
    n_correct_predictions = numpy.sum(predictions_class1 == LVAL)
    n_wrong_predictions = numpy.sum(predictions_class1 != LVAL)

    print(
        f'Model: {name}',
        f'Total number of samples: {n_samples}',
        f'Number of correct predictions: {n_correct_predictions}',
        f'Number of wrong predictions: {n_wrong_predictions}',
        sep='\n',
    )

    accuracy = n_correct_predictions / n_samples
    error_rate = n_wrong_predictions / n_samples

    # The trailing empty string reproduces the blank line that closes each report
    print(
        f'Accuracy: {accuracy*100:.2f}%',
        f'Error rate: {error_rate*100:.2f}%',
        '',
        sep='\n',
    )


In [100]:
# Evaluate the three Gaussian variants (full, tied, naive Bayes) on all the features,
# using the same training/validation split for each of them
for variant_name, variant_estimator in (
    ("MVG", Gau_MVG_ML_estimates),
    ("Tied MVG", Gau_Tied_ML_estimates),
    ("Naive Bayes MVG", Gau_Naive_ML_estimates),
):
    binaryMVGModels(variant_name, DTR, LTR, DVAL, LVAL, ML_func=variant_estimator)


Model: MVG
Total number of samples: 2000
Number of correct predictions: 1860
Number of wrong predictions: 140
Accuracy: 93.00%
Error rate: 7.00%

Model: Tied MVG
Total number of samples: 2000
Number of correct predictions: 1814
Number of wrong predictions: 186
Accuracy: 90.70%
Error rate: 9.30%

Model: Naive Bayes MVG
Total number of samples: 2000
Number of correct predictions: 1856
Number of wrong predictions: 144
Accuracy: 92.80%
Error rate: 7.20%



MVG results analysis

In [101]:
print("MVG results analysis")

# Human-readable names of class 0 and class 1, in label order
classes = ['Fake', 'Genuine']

# Full-covariance ML estimates on the training set; keep only the covariance
# matrix (second element of each (mean, cov) pair) of class 0 and class 1
hParams_MVG = Gau_MVG_ML_estimates(DTR, LTR)
Cs = [hParams_MVG[class_idx][1] for class_idx in (0, 1)]

MVG results analysis


In [102]:
# For every class, print each feature's variance followed by its covariance with
# every other feature (and the gap between the variance and that covariance)
for i, (label, C) in enumerate(zip(classes, Cs)):
    print(f'Class {label}')

    for feat_idx, cov_row in enumerate(C):
        variance = cov_row[feat_idx]          # diagonal entry of this row
        print(f'Feature {feat_idx+1} variance: {variance}.')

        for other_idx, covariance in enumerate(cov_row):
            if other_idx == feat_idx:
                continue                      # skip the variance itself
            print(f'   Covariance with feature {other_idx+1}: {covariance} \t(Variance - Covariance: {variance - covariance:.3f})')

    print()

Class Fake
Feature 1 variance: 0.6009565063742803.
   Covariance with feature 2: 5.158665171145649e-05 	(Variance - Covariance: 0.601)
   Covariance with feature 3: 0.01905891448976418 	(Variance - Covariance: 0.582)
   Covariance with feature 4: 0.019252987552578038 	(Variance - Covariance: 0.582)
   Covariance with feature 5: 0.012803940239193384 	(Variance - Covariance: 0.588)
   Covariance with feature 6: -0.013472159793695327 	(Variance - Covariance: 0.614)
Feature 2 variance: 1.4472254272925504.
   Covariance with feature 1: 5.158665171145649e-05 	(Variance - Covariance: 1.447)
   Covariance with feature 3: -0.016134010951526962 	(Variance - Covariance: 1.463)
   Covariance with feature 4: -0.015856147392157894 	(Variance - Covariance: 1.463)
   Covariance with feature 5: -0.026452914117622562 	(Variance - Covariance: 1.474)
   Covariance with feature 6: 0.022913983275498137 	(Variance - Covariance: 1.424)
Feature 3 variance: 0.5653489011392431.
   Covariance with feature 1: 0.01

Pearson Correlation

In [103]:
# Pearson correlation matrix of each class: every covariance is divided by the
# product of the standard deviations of the two features involved
for i, (label, C) in enumerate(zip(classes, Cs)):
    print(f'Class {label}')
    std_devs = C.diagonal()**0.5
    Corr = C / (pm.vcol(std_devs) * pm.vrow(std_devs))
    Corr_rounded = numpy.round(Corr, 2)
    print(Corr_rounded)

Class Fake
[[ 1.    0.    0.03  0.03  0.02 -0.02]
 [ 0.    1.   -0.02 -0.02 -0.03  0.02]
 [ 0.03 -0.02  1.   -0.   -0.01  0.03]
 [ 0.03 -0.02 -0.    1.    0.01  0.02]
 [ 0.02 -0.03 -0.01  0.01  1.    0.02]
 [-0.02  0.02  0.03  0.02  0.02  1.  ]]
Class Genuine
[[ 1.   -0.02  0.01  0.02  0.01 -0.  ]
 [-0.02  1.   -0.02 -0.02 -0.02  0.02]
 [ 0.01 -0.02  1.    0.05 -0.   -0.02]
 [ 0.02 -0.02  0.05  1.   -0.01  0.04]
 [ 0.01 -0.02 -0.   -0.01  1.    0.01]
 [-0.    0.02 -0.02  0.04  0.01  1.  ]]


MVG - No feature 5 and 6

In [106]:
# Keep only the first 4 rows (features 1-4) of the training and validation data
truncatedDTR, truncatedDVAL = DTR[:4], DVAL[:4]
print(truncatedDTR.shape[0])

# Re-evaluate the three Gaussian variants on the reduced feature set
for variant_name, variant_estimator in (
    ("MVG - First 4 features", Gau_MVG_ML_estimates),
    ("Tied MVG - First 4 features", Gau_Tied_ML_estimates),
    ("Naive Bayes MVG - First 4 features", Gau_Naive_ML_estimates),
):
    binaryMVGModels(variant_name, truncatedDTR, LTR, truncatedDVAL, LVAL, ML_func=variant_estimator)



4
Model: MVG - First 4 features
Total number of samples: 2000
Number of correct predictions: 1841
Number of wrong predictions: 159
Accuracy: 92.05%
Error rate: 7.95%

Model: Tied MVG - First 4 features
Total number of samples: 2000
Number of correct predictions: 1810
Number of wrong predictions: 190
Accuracy: 90.50%
Error rate: 9.50%

Model: Naive Bayes MVG - First 4 features
Total number of samples: 2000
Number of correct predictions: 1847
Number of wrong predictions: 153
Accuracy: 92.35%
Error rate: 7.65%



MVG vs Tied MVG - Features 1-2

In [108]:
# Compare the full and the tied covariance models using only features 1 and 2 (rows 0-1)
for variant_name, variant_estimator in (
    ("MVG - Features 1-2", Gau_MVG_ML_estimates),
    ("Tied MVG - Features 1-2", Gau_Tied_ML_estimates),
):
    binaryMVGModels(variant_name, DTR[:2], LTR, DVAL[:2], LVAL, ML_func=variant_estimator)


Model: MVG - Features 1-2
Total number of samples: 2000
Number of correct predictions: 1270
Number of wrong predictions: 730
Accuracy: 63.50%
Error rate: 36.50%

Model: Tied MVG - Features 1-2
Total number of samples: 2000
Number of correct predictions: 1011
Number of wrong predictions: 989
Accuracy: 50.55%
Error rate: 49.45%



In [109]:
# Compare the full and the tied covariance models using only features 3 and 4 (rows 2-3)
for variant_name, variant_estimator in (
    ("MVG - Features 3-4", Gau_MVG_ML_estimates),
    ("Tied MVG - Features 3-4", Gau_Tied_ML_estimates),
):
    binaryMVGModels(variant_name, DTR[2:4], LTR, DVAL[2:4], LVAL, ML_func=variant_estimator)

Model: MVG - Features 3-4
Total number of samples: 2000
Number of correct predictions: 1811
Number of wrong predictions: 189
Accuracy: 90.55%
Error rate: 9.45%

Model: Tied MVG - Features 3-4
Total number of samples: 2000
Number of correct predictions: 1812
Number of wrong predictions: 188
Accuracy: 90.60%
Error rate: 9.40%



PCA as pre-processing

In [116]:
# Sweep the number of PCA directions from 1 to 6. The projection is computed on the
# training data only and then applied to both the training and the validation data.
for m in range(6):
    # Leading and trailing newlines reproduce the blank lines around the section title
    print(f'\nPCA pre-processing - {m+1} directions\n')

    UPCA = pm.compute_pca(DTR, m+1)     # trained model
    DTR_pca, DVAL_pca = pm.apply_pca(UPCA, DTR), pm.apply_pca(UPCA, DVAL)

    for variant_name, variant_estimator in (
        ("MVG", Gau_MVG_ML_estimates),
        ("Tied MVG", Gau_Tied_ML_estimates),
        ("Naive Bayes MVG", Gau_Naive_ML_estimates),
    ):
        binaryMVGModels(variant_name, DTR_pca, LTR, DVAL_pca, LVAL, ML_func=variant_estimator)


PCA pre-processing - 1 directions

Model: MVG
Total number of samples: 2000
Number of correct predictions: 1815
Number of wrong predictions: 185
Accuracy: 90.75%
Error rate: 9.25%

Model: Tied MVG
Total number of samples: 2000
Number of correct predictions: 1813
Number of wrong predictions: 187
Accuracy: 90.65%
Error rate: 9.35%

Model: Naive Bayes MVG
Total number of samples: 2000
Number of correct predictions: 1815
Number of wrong predictions: 185
Accuracy: 90.75%
Error rate: 9.25%


PCA pre-processing - 2 directions

Model: MVG
Total number of samples: 2000
Number of correct predictions: 1824
Number of wrong predictions: 176
Accuracy: 91.20%
Error rate: 8.80%

Model: Tied MVG
Total number of samples: 2000
Number of correct predictions: 1815
Number of wrong predictions: 185
Accuracy: 90.75%
Error rate: 9.25%

Model: Naive Bayes MVG
Total number of samples: 2000
Number of correct predictions: 1823
Number of wrong predictions: 177
Accuracy: 91.15%
Error rate: 8.85%


PCA pre-processin