<h1>Supervised Transfer Learning tutorial : parameter-based approaches </h1>

In [1]:
import sys
import copy
import pandas as pd
import  numpy as np
import sklearn.metrics as metr
import sklearn.model_selection as select
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

import adapt._tree_utils as ut

<h2>Exercise 1 : Transfer on Linear Regression </h2>


In [None]:
def label_func(x,beta=2,noise=0.1):
    """Return noisy linear labels ``beta*x + noise*eps`` for the inputs ``x``.

    ``eps`` is one standard-normal draw per element of ``x``, taken from
    NumPy's global random state, so results depend on ``np.random.seed``.
    """
    n_samples = len(x)
    signal = beta*x
    gaussian_noise = np.random.randn(n_samples)
    return signal + noise * gaussian_noise

def gaussian(x, mu=0., s=1.):
    """Evaluate the normal density with mean ``mu`` and standard deviation ``s`` at ``x``."""
    variance = s**2
    normalization = 1./np.sqrt( 2. * np.pi * variance )
    exponent = -(x-mu)**2 / ( 2. * variance )
    return normalization * np.exp( exponent )

# Fix the global NumPy seed so every draw below is reproducible.
np.random.seed(0)
mu_s = 0
sigma = 1
beta = 2
# `var` multiplies a standard-normal draw, i.e. it acts as the scale of epsilon.
var=0.5
epsilon = var*np.random.randn(1)[0]

# Source and Target inputs are drawn with the same mean (mu_s) and scale (sigma).
Xs = np.random.randn(100) * sigma + mu_s
Xt_all = np.random.randn(100) * sigma + mu_s 

# Only the first n_train Target points are kept as labelled training data.
n_train = 5
Xt = Xt_all[:n_train]
# Source slope is `beta`; Target slope is `beta + |epsilon|`; both use noise=0.5.
ys = label_func(Xs,beta=beta,noise=0.5)
yt_all = label_func(Xt_all,beta=beta+abs(epsilon),noise=0.5)
yt = yt_all[:n_train]
# Warning: only (Xt, yt) is used as training data; (Xt_all, yt_all) is used to assess performance.

### Question 1.1 : Formalize the transfer learning problem
First plot Source and Target data.

Write $P_S(X)$ and $P_T(X)$

Here the parameters $\beta_S$ and $\beta_T$ control the Source and Target distributions respectively.
Both share the same noise level, which we denote $\sigma$.

Express $P_S(Y|X=x)$ and $P_T(Y|X=x)$ as functions of these parameters.

### Question 1.2 : 

We can write $\beta_T = \beta_S+ |\epsilon|$.
Express the distribution $P(\epsilon)$ in terms of its mean and standard deviation $\mu_\epsilon,\sigma_\epsilon$.
What is the expected value $\mathbb{E}[\beta_S+ \epsilon]$ ? Is it the same as $\mathbb{E}[\beta_T]$ ?
                                                                                                   
(Bonus) Write the formula corresponding to $\mathbb{E}[Y|X=x]$.
                                                                                                   
Is this a Covariate Shift situation ? Why ?  

### Question 1.3 : Parameter-based transfer on linear model

Train a linear regression model on Source only and another linear regression model on Target only.

In [None]:
from sklearn.linear_model import LinearRegression
from adapt.parameter_based import RegularTransferLR

def mse(y1,y2):
    """Return the mean of the squared element-wise differences between ``y1`` and ``y2``."""
    residuals = y1 - y2
    squared = np.square(residuals)
    return np.mean(squared)

    
# Exercise skeleton: this cell does not parse until both placeholders are filled in.
# The inputs are 1-D, hence the reshape(-1, 1) into a single-feature column.
source_model = #... TODO (exercise): estimator trained on Source data only
source_model.fit(Xs.reshape(-1, 1), ys)

only_tgt_model = #... TODO (exercise): estimator trained on the few Target points only
only_tgt_model.fit(Xt.reshape(-1, 1), yt)

### Question 1.4 : 
Transfer Source model using Target data and `RegularTransferLR` algorithm.

Write the optimization problem solved by this algorithm (see course or documentation).
What is the role of `lambda_` parameter ?

In [None]:
# Wrap the fitted Source model and adapt it with the labelled Target training points.
tgt_model = RegularTransferLR(source_model, lambda_=1.)
tgt_model.fit(Xt.reshape(-1, 1), yt)

### Question 1.5 : Compare performance of these three linear regressors
Is the score provided equal to $1 - mse$ (mean squared error)? Why ?

We denote by $\widehat{\beta_S},\widehat{\beta_T},\widehat{\beta_{transf}}$ the parameters estimated by these regressors.
Express these two printed performance measures as functions of these estimators.

In [None]:
# Exercise skeleton: the `#...` placeholders must be completed before this cell parses.

# Predictions of the source model on source data:
yps = source_model.predict(Xs.reshape(-1, 1))  
err_src = np.mean(np.square(yps - ys))

# Predictions of the target-only linear model on target data:
ypt_only = #...
err_tgt_only = #...

# Predictions of the transferred linear model on target data:
ypt = #....
err_tgt = #...

print("Source average squared error : %.4f"%err_src)
print("Target only average squared error : %.4f"%err_tgt_only)
print("Target average squared error : %.4f"%err_tgt)

# NOTE(review): the two Target scores below are computed on (Xt, yt), the Target
# training points, not on (Xt_all, yt_all) - confirm this is intended.
print("Source average score : %.4f"%source_model.score(Xs.reshape(-1, 1), ys))
print("Target only average score : %.4f"%only_tgt_model.score(Xt.reshape(-1, 1), yt))
print("Target average score : %.4f"%tgt_model.score(Xt.reshape(-1, 1), yt))

### Question 2.1 : 
Write a Python function to repeat the experiment with various values of `lambda_` and `var`, and various amounts of training data.

Add performance assessment of source model on target data.

In [None]:
def experiment_mse(var=1,lambda_=1,n_train=10,n_exp=10):
    """Average mean squared errors over ``n_exp`` repetitions of the experiment.

    Exercise skeleton: the placeholder string inside the loop must be replaced
    by code that builds the data and fits ``source_model``, ``only_tgt_model``
    and ``tgt_model``; the visible code does not use ``var``, ``lambda_`` or
    ``n_train`` itself.

    Returns the means, in this order, of: source model on source data, source
    model on all target data, transferred model on all target data,
    target-only model on all target data.
    """
    err_src = np.zeros(n_exp)
    err_src_only = np.zeros(n_exp)
    err_tgt = np.zeros(n_exp)
    err_tgt_only = np.zeros(n_exp)
    
    for i in range(n_exp):
        """
        #...............
        """
        
        # Until the placeholder above defines them locally, these names resolve
        # to whatever the notebook's global namespace currently holds.
        err_src[i] = mse(source_model.predict(Xs.reshape(-1,1)),ys)
        err_src_only[i] = mse(source_model.predict(Xt_all.reshape(-1,1)),yt_all)
        err_tgt_only[i] = mse(only_tgt_model.predict(Xt_all.reshape(-1,1)),yt_all)
        err_tgt[i] = mse(tgt_model.predict(Xt_all.reshape(-1,1)),yt_all)

    return np.mean(err_src),np.mean(err_src_only),np.mean(err_tgt),np.mean(err_tgt_only)

def experiment_score(var=1,lambda_=1,n_train=10,n_exp=10):
    """Average model scores over ``n_exp`` repetitions of the experiment.

    Exercise skeleton: the placeholder string inside the loop must be replaced
    by code that fills the four score arrays; as written they stay at zero.

    Returns the means of ``score_src``, ``score_src_only``, ``score_tgt`` and
    ``score_tgt_only``, in that order.
    """
    score_src = np.zeros(n_exp)
    score_src_only = np.zeros(n_exp)
    score_tgt = np.zeros(n_exp)
    score_tgt_only = np.zeros(n_exp)
    
    for i in range(n_exp):
        """
        #...............
        """

    return np.mean(score_src),np.mean(score_src_only),np.mean(score_tgt),np.mean(score_tgt_only)

# Both helpers return (src, src_only, tgt, tgt_only); the unpacking order must match.
err_src,err_src_only,err_tgt,err_tgt_only = experiment_mse(var=0.5,lambda_=1,n_train=5,n_exp=100)
score_src,score_src_only,score_tgt,score_tgt_only = experiment_score(var=0.5,lambda_=1,n_train=5,n_exp=100)

# "Source only" is the source model evaluated on target data.
print("Source average squared error : %.4f"%err_src)
print("Source only average squared error : %.4f"%err_src_only)
print("Target only average squared error : %.4f"%err_tgt_only)
print("Target average squared error : %.4f"%err_tgt)

print("Source average score : %.4f"%score_src)
print("Source only average score : %.4f"%score_src_only)
print("Target only average score : %.4f"%score_tgt_only)
print("Target average score : %.4f"%score_tgt)

### Question 2.2: 

What seems to be the best value for `lambda_` ?
Try independently `n_train=50`, `lambda_=3`, `lambda_=10` and `var=2`.
What can you observe ?

Try replacing the distribution of the $\epsilon$ random variable with a uniform distribution.
What are your observations ?

Under which conditions is this transfer approach useful ?

### (Bonus) Question 3 : 
Express the closed-form of $\widehat{ \beta_{transf} }$

<h2>Exercise 2 : Transfer on Decision Trees (on synthetic data)</h2>

<h3>1. Classification of Gaussian clusters </h3>

In [None]:
# Class means: class 0 is centred at (-1, 0) in both domains; class 1 moves
# from (1, 0) in Source to (0, 0) in Target.
mean_s_0 = np.array([-1,0])
mean_s_1 = np.array([1,0])
mean_t_0 = np.array([-1,0])
mean_t_1 = np.array([0,0])

# Diagonal covariances: entries are 1 for Source and 2 for Target.
sig_s_0 = np.diag([1,1])
sig_s_1 = np.diag([1,1])
sig_t_0 = np.diag([2,2])
sig_t_1 = np.diag([2,2])

# `size` samples per domain; the first ns_0 / nt_0 of them belong to class 0.
size=100
ns_0 = size//2
nt_0 = size//2

# No seed is set in this cell: the draws continue from the current global random state.
Xs_0 = np.random.multivariate_normal(mean_s_0, sig_s_0, size=ns_0)
Xs_1 = np.random.multivariate_normal(mean_s_1, sig_s_1, size=size-ns_0)

Xt_0 = np.random.multivariate_normal(mean_t_0, sig_t_0, size=nt_0)
Xt_1 = np.random.multivariate_normal(mean_t_1, sig_t_1, size=size-nt_0)

# Stack class 0 first, then class 1, so labels can be assigned by position.
Xs = np.r_[Xs_0,Xs_1]
Xt = np.r_[Xt_0,Xt_1]

ys = np.zeros(size)
ys[ns_0:] = 1
yt = np.zeros(size)
yt[nt_0:] = 1

In [None]:
# Side-by-side scatter plots: Source on the left, Target on the right.
# Rows before index ns_0 / nt_0 are class 0 (blue), the rest are class 1 (red).
fig, ax = plt.subplots(nrows=1, ncols=2,figsize=(6, 3))
ax[0].scatter(Xs[:ns_0, 0], Xs[:ns_0, 1],marker='o',edgecolor='black',color='blue',label='class 0')
ax[0].scatter(Xs[ns_0:, 0], Xs[ns_0:, 1],marker='o',edgecolor='black',color='red',label='class 1')
ax[0].set_title('Source data')
ax[0].legend()

ax[1].scatter(Xt[:nt_0, 0], Xt[:nt_0, 1],marker='o',edgecolor='black',color='blue',label='class 0')
ax[1].scatter(Xt[nt_0:, 0], Xt[nt_0:, 1],marker='o',edgecolor='black',color='red',label='class 1')
ax[1].set_title('Target data')
ax[1].legend()

### Question 1.1 :
Express $P_S(X|Y=y)$ and $P_T(X|Y=y)$.
What kind of transformation does this represent ?

### Question 1.2 :
How can we change parameters to get a Target shift situation ?

Train linear SVM classification models both on Source and Target and compare their performance.

In [None]:
from sklearn.svm import SVC
# Linear SVM for the Source domain; it is only instantiated here, not fitted.
clf_source = SVC(kernel='linear')

# TODO (exercise): fit clf_source on (Xs, ys), build and fit a Target classifier,
# then compare their performance.
#...
#...
#...

In [None]:
# This code plots the Source decision function on Source and Target data.

# The evaluation grid spans the Target data range (plus a margin of 1) with step `plot_step`.
plot_step = 0.1
x_min, x_max = Xt[:, 0].min() - 1, Xt[:, 0].max() + 1
y_min, y_max = Xt[:, 1].min() - 1, Xt[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),np.arange(y_min, y_max, plot_step))

# Source decision function: predict every grid point, then restore the grid shape.
ypred_src = clf_source.predict(np.c_[xx.ravel(), yy.ravel()])
ypred_src = ypred_src.reshape(xx.shape)
    
fig, ax = plt.subplots(nrows=1, ncols=2,figsize=(6, 3))
ax[0].scatter(Xs[:ns_0, 0], Xs[:ns_0, 1],marker='o',edgecolor='black',color='blue',label='class 0')
ax[0].scatter(Xs[ns_0:, 0], Xs[ns_0:, 1],marker='o',edgecolor='black',color='red',label='class 1')

# The same Source prediction surface is drawn over both panels.
ax[0].contourf(xx, yy, ypred_src, cmap=plt.cm.coolwarm, alpha=0.8)
ax[0].set_title('Source data')
ax[0].legend()

ax[1].scatter(Xt[:nt_0, 0], Xt[:nt_0, 1],marker='o',edgecolor='black',color='blue',label='class 0')
ax[1].scatter(Xt[nt_0:, 0], Xt[nt_0:, 1],marker='o',edgecolor='black',color='red',label='class 1')

ax[1].contourf(xx, yy, ypred_src, cmap=plt.cm.coolwarm, alpha=0.8)
ax[1].set_title('Target data')
ax[1].legend()

### Question 1.3 :
Similarly to the previous plot, draw the target decision function.

### Question 1.4:

Write a function $KL(P_S^0,P_S^1,P_T^0,P_T^1)$ that returns the Kullback-Leibler divergence between source and target class proportions.

Compute it both on "right" and "left" of the linear SVM classifier.

### Question 1.5:

Write a divergence gain function that computes the gain acquired by the linear SVM classifier (see course).

### (Bonus) Question 1.6 :

Suppose $Q_1 \sim  \mathcal{N}(\mu_1,\sigma_1^2)$ and $Q_2 \sim  \mathcal{N}(\mu_2,\sigma_2^2)$,

Prove that $KL[ Q_1 || Q_2] = \frac{1}{2}\left( \frac{(\mu_2 - \mu_1)^2}{\sigma_2^2} + \frac{\sigma_1^2}{\sigma_2^2} -\ln{\frac{\sigma_1^2}{\sigma_2^2}} - 1 \right)$ 

<h3>2. Sum of Gaussian clusters and decision tree classifiers </h3>
Now we will experiment with several basic transformations between Gaussian distributions with several clusters per class.

In [None]:
def create_N_clusters(N,class_,mean_range=[-2,2],sig_range=[0.5,2]):
    """Draw ``N`` random 2-D Gaussian cluster descriptions for one class.

    For each cluster a mean is drawn uniformly in ``mean_range`` and a diagonal
    covariance with entries drawn uniformly in ``sig_range`` (mean first, then
    covariance, using NumPy's global random state).

    Returns ``(means, sigs, classes)``: two object arrays of length ``N`` and
    an array holding ``class_`` repeated ``N`` times.
    """
    means = np.zeros(N,dtype=object)
    sigs = np.zeros(N,dtype=object)
    for idx in range(N):
        center = np.random.uniform(low=mean_range[0], high=mean_range[1], size=2)
        diagonal = np.random.uniform(low=sig_range[0], high=sig_range[1], size=2)
        means[idx] = center
        sigs[idx] = np.diag(diagonal)
    labels = np.repeat(class_,N)
    return means,sigs,labels

def create_clusters(N0,N1,mean_range=[-2,2],sig_range=[0.5,2]):
    """Create ``N0`` clusters of class 0 followed by ``N1`` clusters of class 1.

    Returns the concatenated ``(means, sigs, classes)`` arrays, class 0 first.
    """
    per_class = [
        create_N_clusters(n_clusters,label,mean_range=mean_range,sig_range=sig_range)
        for label, n_clusters in ((0, N0), (1, N1))
    ]
    means = np.concatenate([part[0] for part in per_class])
    sigs = np.concatenate([part[1] for part in per_class])
    classes = np.concatenate([part[2] for part in per_class])
    return means,sigs,classes

def translate_N_clusters(N,means,sigs,classes,mean_range=[-2,2],sig_range=[0.5,2]):
    """Return cluster parameters where ``N`` randomly chosen clusters get a new mean.

    ``N`` distinct cluster indexes are drawn without replacement and each
    selected mean is replaced by a 2-D point drawn uniformly in ``mean_range``.
    ``sig_range`` is accepted for signature parity with the sibling helpers and
    is not used.

    The caller's ``means`` container is left untouched: the replacement is done
    on a shallow copy, so Source parameters can safely be reused after a Target
    configuration has been derived from them. ``sigs`` and ``classes`` are
    returned as received.
    """
    indexes = np.random.choice(classes.size,size=N,replace=False)
    # Shallow copy: only the selected slots are rebound, the untouched mean
    # vectors are shared with the input.
    means = means.copy()
    for k in indexes:
        means[k]=np.random.uniform(low=mean_range[0], high=mean_range[1], size=2)
    return means,sigs,classes

def shrink_N_clusters(N,means,sigs,classes,mean_range=[-2,2],sig_range=[0.5,2]):
    """Return cluster parameters where ``N`` randomly chosen clusters get a new covariance.

    ``N`` distinct cluster indexes are drawn without replacement and each
    selected covariance is replaced by a diagonal matrix whose entries are drawn
    uniformly in ``sig_range`` (so a cluster may shrink or grow depending on the
    range). ``mean_range`` is accepted for signature parity with the sibling
    helpers and is not used.

    The caller's ``sigs`` container is left untouched: the replacement is done
    on a shallow copy. ``means`` and ``classes`` are returned as received.
    """
    indexes = np.random.choice(classes.size,size=N,replace=False)
    # Shallow copy: only the selected slots are rebound, so the input keeps
    # describing the original clusters.
    sigs = sigs.copy()
    for k in indexes:
        sigs[k]=np.diag(np.random.uniform(low=sig_range[0], high=sig_range[1], size=2))
    return means,sigs,classes

def delete_N_clusters(N,means,sigs,classes,mean_range=[-2,2],sig_range=[0.5,2]):
    """Remove ``N`` randomly chosen clusters and return the remaining parameters.

    The indexes to drop are drawn without replacement; the same indexes are
    removed from ``means``, ``sigs`` and ``classes``. ``mean_range`` and
    ``sig_range`` are accepted for signature parity and are not used.
    """
    dropped = np.random.choice(classes.size,size=N,replace=False)
    kept_means = np.delete(means,dropped,axis=0)
    kept_sigs = np.delete(sigs,dropped,axis=0)
    kept_classes = np.delete(classes,dropped,axis=0)
    return kept_means,kept_sigs,kept_classes

def add_N_clusters(N,class_,means,sigs,classes,mean_range=[-2,2],sig_range=[0.5,2]):
    """Append ``N`` freshly drawn clusters of class ``class_`` to existing parameters.

    Returns new ``(means, sigs, classes)`` arrays with the new clusters placed
    after the existing ones.
    """
    extra_means,extra_sigs,extra_classes = create_N_clusters(N,class_,mean_range=mean_range,sig_range=sig_range)
    return (
        np.concatenate((means,extra_means)),
        np.concatenate((sigs,extra_sigs)),
        np.concatenate((classes,extra_classes)),
    )

def generate_samples(n_by_cluster,means,sigs,classes):
    """Draw ``n_by_cluster`` Gaussian samples from every cluster.

    Cluster ``k`` is sampled from a multivariate normal with mean ``means[k]``
    and covariance ``sigs[k]`` and labelled ``classes[k]``. Clusters are
    sampled in order, using NumPy's global random state.

    Returns ``(X, y)``: the stacked samples and their labels, cluster by cluster.

    Raises ``ValueError`` when ``classes`` is empty, since there is nothing to
    sample from.
    """
    if len(classes) == 0:
        raise ValueError("generate_samples needs at least one cluster.")

    # Collect the per-cluster pieces and concatenate once, instead of
    # re-copying the growing arrays at every iteration.
    X_parts = []
    y_parts = []
    for k,c in enumerate(classes):
        X_parts.append(np.random.multivariate_normal(means[k], sigs[k], size=n_by_cluster))
        y_parts.append(np.repeat(c,n_by_cluster))
    return np.concatenate(X_parts),np.concatenate(y_parts)

### Translations between Source and Target

In [None]:
# Re-seed so the cluster layout and the samples of this section are reproducible.
np.random.seed(0)
N0=5
N1=5
# Source: N0 clusters of class 0 and N1 clusters of class 1, 100 samples per cluster.
means,sigs,classes = create_clusters(N0,N1,mean_range=[-5,5],sig_range=[0.5,2])
Xs,ys=generate_samples(100,means,sigs,classes)
# Translation: two randomly chosen clusters get a new mean before the Target is sampled.
means,sigs,classes = translate_N_clusters(2,means,sigs,classes,mean_range=[-5,5],sig_range=[0.5,2])
Xt,yt=generate_samples(100,means,sigs,classes)

In [None]:
# Side-by-side scatter plots of Source (left) and Target (right); points are
# selected by label because clusters of both classes are interleaved in space.
fig, ax = plt.subplots(nrows=1, ncols=2,figsize=(6, 3))
ax[0].scatter(Xs[np.where(ys==0)[0], 0], Xs[np.where(ys==0)[0], 1],marker='o',edgecolor='black',color='blue',label='class 0')
ax[0].scatter(Xs[np.where(ys==1)[0], 0], Xs[np.where(ys==1)[0], 1],marker='o',edgecolor='black',color='red',label='class 1')
ax[0].set_title('Source data')
ax[0].legend()

ax[1].scatter(Xt[np.where(yt==0)[0], 0], Xt[np.where(yt==0)[0], 1],marker='o',edgecolor='black',color='blue',label='class 0')
ax[1].scatter(Xt[np.where(yt==1)[0], 0], Xt[np.where(yt==1)[0], 1],marker='o',edgecolor='black',color='red',label='class 1')
ax[1].set_title('Target data')
ax[1].legend()

### Question 2.1 : Source and Target decision trees.

Train both Source and Target decision tree classification models and assess them using cross-validation.

In [None]:
# Samples drawn per cluster: the Target set is five times smaller than the Source set.
n_source_by_c = 50
n_target_by_c = 10

means_src,sigs_src,classes_src = create_clusters(N0,N1,mean_range=[-5,5],sig_range=[0.5,2])


Xs,ys=generate_samples(n_source_by_c,means_src,sigs_src,classes_src)
# Translation:
# NOTE(review): `means_src` is passed directly; if `translate_N_clusters` modifies
# its input in place, `means_src` no longer describes the clusters `Xs` was drawn
# from - pass a copy if the Source parameters are needed again later.
means,sigs,classes = translate_N_clusters(2,means_src,sigs_src,classes_src,mean_range=[-5,5],sig_range=[0.5,2])
Xt,yt=generate_samples(n_target_by_c,means,sigs,classes)

MAX = 5
K_FOLD = 10

# Source and Target classifiers :
clf_source = DecisionTreeClassifier(max_depth=MAX)
clf_target = DecisionTreeClassifier(max_depth=MAX)


# K-fold cross-validation:
skf = select.StratifiedKFold(n_splits=K_FOLD)

# One score slot per fold; slots stay at zero until the exercise code fills them.
score_src = np.zeros(K_FOLD)
score_src_tgt = np.zeros(K_FOLD)
score_tgt_src = np.zeros(K_FOLD)
score_tgt = np.zeros(K_FOLD)

# `k` is the fold index used to address the score arrays.
k=0
for train, test in skf.split(Xs, ys):
    X_src_train, Y_src_train, X_src_test, Y_src_test = Xs[train], ys[train], Xs[test], ys[test]
    
    # TODO (exercise): fit clf_source on the Source fold and record its scores.
    #...

    k+=1

k=0
for train, test in skf.split(Xt, yt):
    X_tgt_train, Y_tgt_train, X_tgt_test, Y_tgt_test = Xt[train], yt[train], Xt[test], yt[test]

    # TODO (exercise): fit clf_target on the Target fold and record its scores.
    #...

    k+=1
    
print('Score Target model: {:.3f}'.format(np.mean(score_tgt)))
print('Score Source model: {:.3f}'.format(np.mean(score_src)))
print('Score Source model on Target: {:.3f}'.format(np.mean(score_src_tgt)))
#print('Score Target model on Source: {:.3f}'.format(np.mean(score_tgt_src)))

### Question 2.2 : Parameter-based transfer on Decision Trees

Using source model and target data, train STRUT and SER decision tree classification models and assess them using cross-validation.

In [None]:
from adapt.parameter_based import TransferTreeClassifier, TransferForestClassifier
import copy

# Work on a deep copy so the Source classifier object itself is not handed to the transfer models.
# NOTE(review): `clf_source` is only fitted by the exercise code of the previous cell - confirm it is fitted before this cell runs.
clf_src = copy.deepcopy(clf_source)

# Both transfer models are built from the same `clf_src` object, one per algorithm.
strut_model = TransferTreeClassifier(estimator=clf_src,algo="strut")
ser_model = TransferTreeClassifier(estimator=clf_src,algo="ser")

score_strut= np.zeros(K_FOLD)
score_ser = np.zeros(K_FOLD)

# Cross-validate on Target data only: fit on the training fold, score on the held-out fold.
k=0
for train, test in skf.split(Xt, yt):
    X_tgt_train, Y_tgt_train, X_tgt_test, Y_tgt_test = Xt[train], yt[train], Xt[test], yt[test]
    strut_model.fit(X_tgt_train, Y_tgt_train)
    ser_model.fit(X_tgt_train, Y_tgt_train)
    score_strut[k] = strut_model.score(X_tgt_test, Y_tgt_test)
    score_ser[k] = ser_model.score(X_tgt_test, Y_tgt_test)

    k+=1

# After the loop the two models hold the fit from the last fold.
print('Score STRUT model: {:.3f}'.format(np.mean(score_strut)))
print('Score SER model: {:.3f}'.format(np.mean(score_ser)))

#### Visualization of decision trees :

In [None]:
# Functions used for DT visualization

def is_same_node(tree1,tree2,node1,node2):
    """Tell whether two tree nodes apply the same split.

    Returns ``False`` when either node index is ``-1``; otherwise ``True`` only
    if both nodes have the same ``feature`` and the same ``threshold`` in their
    respective ``tree_`` structures.
    """
    if -1 in (node1, node2):
        return False

    return bool(
        tree1.tree_.feature[node1] == tree2.tree_.feature[node2]
        and tree1.tree_.threshold[node1] == tree2.tree_.threshold[node2]
    )

def highlight_different_nodes(tree1,tree2,node1,node2):
    """Collect the nodes at which two trees stop applying the same splits.

    The trees are walked in parallel from ``node1`` / ``node2``. While the two
    current nodes are the same split, the walk continues into the left and then
    the right children. As soon as they differ, ``ut.sub_nodes`` is asked for
    the nodes under each of the two current nodes.

    Returns two lists: the collected nodes of ``tree1`` and those of ``tree2``.
    """
    if not is_same_node(tree1,tree2,node1,node2):
        # Divergence point: report what `ut.sub_nodes` returns for each side.
        differing1 = [] + ut.sub_nodes(tree1.tree_, node1)
        differing2 = [] + ut.sub_nodes(tree2.tree_, node2)
        return differing1, differing2

    left1 = tree1.tree_.children_left[node1]
    right1 = tree1.tree_.children_right[node1]
    left2 = tree2.tree_.children_left[node2]
    right2 = tree2.tree_.children_right[node2]

    from_left1, from_left2 = highlight_different_nodes(tree1,tree2,left1,left2)
    from_right1, from_right2 = highlight_different_nodes(tree1,tree2,right1,right2)

    return [] + from_left1 + from_right1, [] + from_left2 + from_right2
    
class TreeDot():
    """In-memory view of a Graphviz ``.dot`` file describing a decision tree.

    The file is read once at construction time. ``nodes`` holds the integer ids
    found at the start of a line and ``leaves`` the ids for which ``_is_leaf``
    is true. The ``_change_*`` and ``_add_attribute`` helpers rewrite the
    declaration lines of the requested nodes and save the result either to
    ``path + out_file`` or, when ``out_file`` is ``None``, back to the file the
    object was built from (in which case the cached lines are refreshed so
    successive in-place edits accumulate).
    """

    def __init__(self,name_file,path=''):
        """Read ``path + name_file`` and index its node ids and leaves."""
        self.path = path
        self.path_file = path+name_file
        # The context manager releases the handle even if reading fails.
        with open(self.path_file,"r") as f:
            self.lines = f.readlines()

        # A line whose first space-separated token is numeric starts with a
        # node id (node declaration or outgoing edge); the set removes repeats.
        node_ids = list()
        for line in self.lines :
            start = line.split(' ')[0]
            if start.isnumeric():
                node_ids.append(int(start))
        self.nodes = set(node_ids)

        self.leaves = self._list_leaves()

    def _find_node(self,id_node):
        """Return the index of the last line declaring ``id_node``, or ``-1``.

        A declaration line is one whose text before the first ``' ['`` is
        exactly the node id.
        """
        ans = -1
        wanted = str(id_node)
        for k,line in enumerate(self.lines) :
            if wanted==line.split(' [')[0] :
                ans = k
        return ans

    def _check_keyword(self,id_nodes,keyword):
        """Return an array flagging, per node, whether ``' ' + keyword`` occurs in its line.

        Nodes without a declaration line are flagged ``False``.
        """
        checks = np.zeros(len(id_nodes))
        for j,i in enumerate(id_nodes):
            id_l = self._find_node(i)
            checks[j] = id_l != -1 and (' '+keyword) in self.lines[id_l]
        return checks

    @staticmethod
    def _append_attribute(line,text):
        """Insert ``' ' + text`` before the last ``'] ;'`` of ``line`` (no-op if absent)."""
        head, closing, tail = line.rpartition('] ;')
        if not closing:
            return line
        return head + ' ' + text + closing + tail

    @classmethod
    def _set_quoted_attribute(cls,line,keyword,value):
        """Set ``keyword="value"`` in ``line``, replacing an existing value or appending.

        Only a standalone ``keyword="`` is treated as existing: a match preceded
        by a letter, digit or underscore (``color="`` inside ``fillcolor="``)
        is skipped, so changing ``color`` never edits ``fillcolor``.
        """
        token = keyword + '="'
        pos = line.find(token)
        while pos != -1:
            standalone = pos == 0 or not (line[pos-1].isalnum() or line[pos-1] == '_')
            if standalone:
                begin = pos + len(token)
                end = line.find('"', begin)
                if end != -1:
                    # Replace only this attribute's value, not other
                    # occurrences of the same text elsewhere in the line.
                    return line[:begin] + value + line[end:]
            pos = line.find(token, pos + 1)
        return cls._append_attribute(line, token + value + '"')

    def _save_lines(self,new_lines,out_file=None):
        """Write ``new_lines`` to ``path + out_file`` or back to the source file."""
        target = self.path_file if out_file is None else self.path+out_file
        # Lines read with `readlines` already end with a newline, so they are
        # written unchanged; "w" truncates so no stale tail is left behind.
        with open(target,"w") as f:
            f.writelines(new_lines)
        if target == self.path_file:
            # Keep the cache in sync with the file this object represents.
            self.lines = new_lines

    def _rewrite_nodes(self,id_nodes,edit_line,out_file=None):
        """Apply ``edit_line`` to the declaration line of each node, then save."""
        new_lines = self.lines.copy()
        for i in id_nodes:
            id_l = self._find_node(i)
            if id_l == -1:
                # Unknown node id: leave the file content untouched for it.
                continue
            new_lines[id_l] = edit_line(new_lines[id_l])
        self._save_lines(new_lines,out_file)

    def _change_color(self,id_nodes,color_,keyword,out_file=None):
        """Set the quoted attribute ``keyword`` to ``color_`` for every node in ``id_nodes``."""
        self._rewrite_nodes(
            id_nodes,
            lambda line: self._set_quoted_attribute(line,keyword,color_),
            out_file,
        )

    def _add_attribute(self,id_nodes,keyword,value,out_file=None):
        """Append the unquoted attribute ``keyword=value`` to every node in ``id_nodes``."""
        self._rewrite_nodes(
            id_nodes,
            lambda line: self._append_attribute(line,keyword+'='+str(value)),
            out_file,
        )

    def _extract_node_info(self,id_):
        """Return ``(attributes, values)`` parsed from the node's ``label="..."`` text.

        The label is split on newline characters and each piece on ``' = '``;
        the first and second parts of each piece are collected.
        """
        attributes = []
        values= []

        id_l = self._find_node(id_)

        line = self.lines[id_l]
        infos = line.split('label="')[1].split('"')[0]
        # NOTE(review): this splits on a real newline character; confirm how
        # line breaks are encoded inside the labels of the .dot files used.
        list_infos = infos.split('\n')

        for l in list_infos:
            attributes.append(l.split(' = ')[0])
            values.append(l.split(' = ')[1])
        return attributes, values

    def _write_node_info(self,id_,attr,v):
        """Placeholder: locates the node's line but does not modify anything."""
        # NOTE(review): never implemented; the original only opened and closed
        # the file around this lookup.
        self._find_node(id_)

    def _is_leaf(self,id_):
        """Return ``True`` when the first label attribute of the node contains no ``'X'``."""
        attr,v = self._extract_node_info(id_)
        return 'X' not in attr[0]

    def _list_leaves(self):
        """Return the ids in ``self.nodes`` for which ``_is_leaf`` is true."""
        return [n for n in self.nodes if self._is_leaf(n)]

    def _format_leaves(self):
        """Add ``shape=ellipse`` to every leaf, in place."""
        self._add_attribute(self.leaves,'shape','ellipse',out_file=None)

    def _change_fillcolor(self,id_nodes,color_,out_file=None):
        """Set the ``fillcolor`` attribute of the given nodes to ``color_``."""
        self._change_color(id_nodes,color_,'fillcolor',out_file=out_file)

    def _change_edgecolor(self,id_nodes,color_,out_file=None):
        """Set the ``color`` (outline) attribute of the given nodes to ``color_``."""
        self._change_color(id_nodes,color_,'color',out_file=out_file)

In [None]:
from sklearn.tree import export_graphviz as exp_draw
from matplotlib import image
import subprocess
import os

your_path = ''

# Export the source tree and both transferred trees as Graphviz .dot files.
file_out = exp_draw(clf_source,out_file=your_path+'tree_src.dot',filled=True,impurity=False)
file_out = exp_draw(ser_model.estimator_,out_file=your_path+'tree_ser.dot',filled=True,impurity=False)
file_out = exp_draw(strut_model.estimator_,out_file=your_path+'tree_strut.dot',filled=True,impurity=False)

# Render with an argument list (safe for paths containing spaces) and fail
# loudly if Graphviz is missing, instead of silently loading a stale PNG.
for stem in ('tree_src','tree_ser','tree_strut'):
    subprocess.run(['dot','-Tpng',your_path+stem+'.dot','-o',your_path+stem+'.png'],check=True)

tree_img = image.imread(your_path+'tree_src.png')
tree_ser_img = image.imread(your_path+'tree_ser.png')
tree_strut_img = image.imread(your_path+'tree_strut.png')
#plt.imshow(tree_img)
#plt.imshow(tree_t_img)

# Nodes that differ between the source tree (l1) and the SER tree (l2).
l1,l2 = highlight_different_nodes(clf_source,ser_model.estimator_,0,0)
print(l1)
print(l2)
# The STRUT tree has its own node numbering, so its differing nodes must be
# computed against the source tree separately rather than reusing the SER list.
l1_strut,l2_strut = highlight_different_nodes(clf_source,strut_model.estimator_,0,0)

T_D_src = TreeDot('tree_src.dot',your_path)
T_D_ser = TreeDot('tree_ser.dot',your_path)
T_D_strut = TreeDot('tree_strut.dot',your_path)


# For each tree: outline the differing nodes in red, reload the updated file,
# then thicken the outline of the same nodes.
T_D_src._change_color(l1,'red','color',out_file='tree_src_updated.dot')
T_D_src = TreeDot('tree_src_updated.dot',your_path)
T_D_src._add_attribute(l1,'penwidth',3,out_file='tree_src_updated.dot')

T_D_ser._change_color(l2,'red','color',out_file='tree_ser_updated.dot')
T_D_ser = TreeDot('tree_ser_updated.dot',your_path)
T_D_ser._add_attribute(l2,'penwidth',3,out_file='tree_ser_updated.dot')

T_D_strut._change_color(l2_strut,'red','color',out_file='tree_strut_updated.dot')
T_D_strut = TreeDot('tree_strut_updated.dot',your_path)
T_D_strut._add_attribute(l2_strut,'penwidth',3,out_file='tree_strut_updated.dot')

for stem in ('tree_src','tree_ser','tree_strut'):
    subprocess.run(['dot','-Tpng',your_path+stem+'_updated.dot','-o',your_path+stem+'_h.png'],check=True)

tree_src = image.imread(your_path+'tree_src_h.png')
tree_ser = image.imread(your_path+'tree_ser_h.png')
tree_strut = image.imread(your_path+'tree_strut_h.png')

plt.imshow(tree_src)

In [None]:
# Display the highlighted SER tree image loaded in the previous cell.
plt.imshow(tree_ser)

In [None]:
# Display the highlighted STRUT tree image loaded in the visualization cell.
plt.imshow(tree_strut)

### Adding and deleting clusters between Source and Target


### Question 2.3 :

Repeat the previous steps on this new Source/Target transformation.

In [None]:
# Re-seed so the samples and the cluster changes of this section are reproducible.
np.random.seed(0)
# N0 and N1 are set here but not used in this cell.
N0=5
N1=5

# Source samples are drawn from the cluster parameters built in the previous section.
Xs,ys=generate_samples(100,means_src,sigs_src,classes_src)

# Adding and deleting clusters: drop 4 random clusters, then add 2 new ones per class.
means,sigs,classes = delete_N_clusters(4,means_src,sigs_src,classes_src,mean_range=[-5,5],sig_range=[0.5,2])

means,sigs,classes = add_N_clusters(2,0,means,sigs,classes,mean_range=[-5,5],sig_range=[0.5,2])
means,sigs,classes = add_N_clusters(2,1,means,sigs,classes,mean_range=[-5,5],sig_range=[0.5,2])

Xt,yt=generate_samples(100,means,sigs,classes)

In [None]:
# Side-by-side scatter plots of Source (left) and Target (right), coloured by label.
fig, ax = plt.subplots(nrows=1, ncols=2,figsize=(6, 3))
ax[0].scatter(Xs[np.where(ys==0)[0], 0], Xs[np.where(ys==0)[0], 1],marker='o',edgecolor='black',color='blue',label='class 0')
ax[0].scatter(Xs[np.where(ys==1)[0], 0], Xs[np.where(ys==1)[0], 1],marker='o',edgecolor='black',color='red',label='class 1')
ax[0].set_title('Source data')
ax[0].legend()

ax[1].scatter(Xt[np.where(yt==0)[0], 0], Xt[np.where(yt==0)[0], 1],marker='o',edgecolor='black',color='blue',label='class 0')
ax[1].scatter(Xt[np.where(yt==1)[0], 0], Xt[np.where(yt==1)[0], 1],marker='o',edgecolor='black',color='red',label='class 1')
ax[1].set_title('Target data')
ax[1].legend()

In [None]:
# Samples drawn per cluster: the Target set is five times smaller than the Source set.
n_source_by_c = 50
n_target_by_c = 10

Xs,ys=generate_samples(n_source_by_c,means_src,sigs_src,classes_src)
# Adding and deleting clusters: drop 4 random clusters, then add 2 new ones per class.
means,sigs,classes = delete_N_clusters(4,means_src,sigs_src,classes_src,mean_range=[-5,5],sig_range=[0.5,2])

means,sigs,classes = add_N_clusters(2,0,means,sigs,classes,mean_range=[-5,5],sig_range=[0.5,2])
means,sigs,classes = add_N_clusters(2,1,means,sigs,classes,mean_range=[-5,5],sig_range=[0.5,2])
Xt,yt=generate_samples(n_target_by_c,means,sigs,classes)

MAX = 5
K_FOLD = 10

# Source and Target classifiers :
clf_source = DecisionTreeClassifier(max_depth=MAX)
clf_target = DecisionTreeClassifier(max_depth=MAX)


# K-fold cross-validation:
skf = select.StratifiedKFold(n_splits=K_FOLD)

# One score slot per fold; slots stay at zero until the exercise code fills them.
score_src = np.zeros(K_FOLD)
score_src_tgt = np.zeros(K_FOLD)
score_tgt_src = np.zeros(K_FOLD)
score_tgt = np.zeros(K_FOLD)

# TODO (exercise): replace the placeholder string below with the cross-validation loops.
"""
#.....................
"""

print('Score Target model: {:.3f}'.format(np.mean(score_tgt)))
print('Score Source model: {:.3f}'.format(np.mean(score_src)))
print('Score Source model on Target: {:.3f}'.format(np.mean(score_src_tgt)))
#print('Score Target model on Source: {:.3f}'.format(np.mean(score_tgt_src)))

In [None]:
# Work on a deep copy so the Source classifier object itself is not handed to the transfer models.
# NOTE(review): `clf_source` is only fitted by the exercise code of the previous cell - confirm it is fitted before this cell runs.
clf_src = copy.deepcopy(clf_source)

# Both transfer models are built from the same `clf_src` object, one per algorithm.
strut_model = TransferTreeClassifier(estimator=clf_src,algo="strut")
ser_model = TransferTreeClassifier(estimator=clf_src,algo="ser")

score_strut= np.zeros(K_FOLD)
score_ser = np.zeros(K_FOLD)

# TODO (exercise): replace the placeholder string below with the Target
# cross-validation loop; the scores stay at zero until then.
k=0
"""
#.....................
"""

print('Score STRUT model: {:.3f}'.format(np.mean(score_strut)))
print('Score SER model: {:.3f}'.format(np.mean(score_ser)))

#### Visualization of decision trees :

In [None]:
your_path = ''

# Export the source tree and both transferred trees as Graphviz .dot files.
file_out = exp_draw(clf_source,out_file=your_path+'tree_src.dot',filled=True,impurity=False)
file_out = exp_draw(ser_model.estimator_,out_file=your_path+'tree_ser.dot',filled=True,impurity=False)
file_out = exp_draw(strut_model.estimator_,out_file=your_path+'tree_strut.dot',filled=True,impurity=False)

# Render with an argument list (safe for paths containing spaces) and fail
# loudly if Graphviz is missing, instead of silently loading a stale PNG.
for stem in ('tree_src','tree_ser','tree_strut'):
    subprocess.run(['dot','-Tpng',your_path+stem+'.dot','-o',your_path+stem+'.png'],check=True)

tree_img = image.imread(your_path+'tree_src.png')
tree_ser_img = image.imread(your_path+'tree_ser.png')
tree_strut_img = image.imread(your_path+'tree_strut.png')
#plt.imshow(tree_img)
#plt.imshow(tree_t_img)

# Nodes that differ between the source tree (l1) and the SER tree (l2).
l1,l2 = highlight_different_nodes(clf_source,ser_model.estimator_,0,0)
print(l1)
print(l2)
# The STRUT tree has its own node numbering, so its differing nodes must be
# computed against the source tree separately rather than reusing the SER list.
l1_strut,l2_strut = highlight_different_nodes(clf_source,strut_model.estimator_,0,0)

T_D_src = TreeDot('tree_src.dot',your_path)
T_D_ser = TreeDot('tree_ser.dot',your_path)
T_D_strut = TreeDot('tree_strut.dot',your_path)


# For each tree: outline the differing nodes in red, reload the updated file,
# then thicken the outline of the same nodes.
T_D_src._change_color(l1,'red','color',out_file='tree_src_updated.dot')
T_D_src = TreeDot('tree_src_updated.dot',your_path)
T_D_src._add_attribute(l1,'penwidth',3,out_file='tree_src_updated.dot')

T_D_ser._change_color(l2,'red','color',out_file='tree_ser_updated.dot')
T_D_ser = TreeDot('tree_ser_updated.dot',your_path)
T_D_ser._add_attribute(l2,'penwidth',3,out_file='tree_ser_updated.dot')

T_D_strut._change_color(l2_strut,'red','color',out_file='tree_strut_updated.dot')
T_D_strut = TreeDot('tree_strut_updated.dot',your_path)
T_D_strut._add_attribute(l2_strut,'penwidth',3,out_file='tree_strut_updated.dot')

for stem in ('tree_src','tree_ser','tree_strut'):
    subprocess.run(['dot','-Tpng',your_path+stem+'_updated.dot','-o',your_path+stem+'_h.png'],check=True)

tree_src = image.imread(your_path+'tree_src_h.png')
tree_ser = image.imread(your_path+'tree_ser_h.png')
tree_strut = image.imread(your_path+'tree_strut_h.png')

plt.imshow(tree_src)

In [None]:
# Display the highlighted SER tree image loaded in the previous cell.
plt.imshow(tree_ser)

In [None]:
# Display the highlighted STRUT tree image loaded in the visualization cell.
plt.imshow(tree_strut)

### Question 2.4 : Compare SER/STRUT results on these two different transfer situations

Why does SER seem to outperform STRUT on both of them ?

How can you explain that based on visualization of transferred decision trees ?

<h2>Exercise 3 : Transfer learning on Random Forests (on fall detection data)</h2>

In [None]:
from adapt.parameter_based import TransferTreeClassifier, TransferForestClassifier
import copy
from adapt.base import BaseAdaptEstimator
from adapt.utils import check_arrays, set_random_seed, check_estimator, check_fitted_estimator
from sklearn.metrics import roc_auc_score as _auc_

import warnings
warnings.filterwarnings("ignore")

class TransferTreeSelector(BaseAdaptEstimator):
    """
    Fit several tree-transfer algorithms on one source tree and pick the best.

    One ``TransferTreeClassifier`` is created per entry of ``algorithms``.
    ``fit`` adapts every one of them on the target data; ``select`` scores
    them on evaluation data and records the best-scoring one.

    Parameters
    ----------
    estimator : fitted decision tree
        Source model. It must expose a ``tree_`` attribute, otherwise a
        ``ValueError`` is raised.
    Xt, yt : array-like, optional
        Target inputs / labels, forwarded to ``BaseAdaptEstimator`` and to
        every ``TransferTreeClassifier``.
    algorithms : list of str
        Values passed as ``algo`` to ``TransferTreeClassifier``. An empty
        list is replaced by ``['src', 'trgt']`` (a warning is printed).
    copy, verbose, random_state
        Forwarded to ``BaseAdaptEstimator``.
    **params
        Extra keyword arguments forwarded to ``BaseAdaptEstimator``.

    Attributes
    ----------
    transferred_models : list of TransferTreeClassifier
        One model per algorithm, in the order of ``algorithms``.
    scores : ndarray of shape (n_methods,)
        Scores written by the last call to ``select`` (zeros before).
    best_score, best_index, best_method
        Best score, its position in ``algorithms`` and the corresponding
        algorithm name. Before ``select`` is called they are ``0``, ``-1``
        and ``None``.

    Raises
    ------
    ValueError
        If ``estimator`` has no ``tree_`` attribute.
    """
    def __init__(self,
                 estimator=None,
                 Xt=None,
                 yt=None,
                 algorithms=list(),
                 copy=True,
                 verbose=1,
                 random_state=None,
                 **params):

        if not hasattr(estimator, "tree_"):
            raise ValueError("`estimator` argument has no ``tree_`` attribute, "
                                "please call `fit` on `estimator` or use "
                                "another estimator as `DecisionTreeClassifier`.")

        estimator = check_fitted_estimator(estimator)

        # `random_state` has to be forwarded explicitly; it was previously
        # dropped here, so the seed given by the caller never reached
        # `self.random_state`, which `fit` and `select` read.
        super().__init__(estimator=estimator,
                         Xt=Xt,
                         yt=yt,
                         copy=copy,
                         verbose=verbose,
                         random_state=random_state,
                         **params)

        if len(algorithms) == 0:
            print('Warning : empty list of methods. Default are Source and Target models.')
            self.algorithms = ['src','trgt']
        else:
            self.algorithms = algorithms

        self.n_methods = len(self.algorithms)
        self.scores = np.zeros(self.n_methods)

        # Placeholders until `select` has been run.
        self.best_score = 0
        self.best_index = -1
        self.best_method = None

        # One transfer model per algorithm, all starting from the same source tree.
        self.transferred_models = [
            TransferTreeClassifier(estimator=self.estimator,
                                   Xt=self.Xt,
                                   yt=self.yt,
                                   algo=algo,
                                   copy=self.copy)
            for algo in self.algorithms
        ]

    def fit(self, Xt=None, yt=None, **fit_params):
        """
        Fit every transfer model on the target data.

        Parameters
        ----------
        Xt, yt : array-like, optional
            Target training data, resolved through ``_get_target_data``
            (presumably falling back to the data given at construction
            when ``None`` — verify against ``BaseAdaptEstimator``).
        **fit_params
            Forwarded to each ``TransferTreeClassifier.fit``.

        Returns
        -------
        self
        """
        Xt, yt = self._get_target_data(Xt, yt)
        Xt, yt = check_arrays(Xt, yt)
        set_random_seed(self.random_state)

        for model in self.transferred_models:
            model.fit(Xt=Xt, yt=yt, **fit_params)

        return self

    def select(self,Xtest=None,ytest=None,score_type="auc"):
        """
        Score every transfer model and record the best one.

        Parameters
        ----------
        Xtest, ytest : array-like, optional
            Evaluation data, resolved through ``_get_target_data``.
        score_type : str, default "auc"
            ``"auc"`` scores with the ROC AUC of column 1 of
            ``predict_proba`` (so it assumes a binary problem); any other
            value uses each model's own ``score`` method.

        Returns
        -------
        best_score : float
            Highest score among the models.
        best_index : int
            Position of that model in ``algorithms`` (first one on ties).
        """
        Xtest, ytest = self._get_target_data(Xtest, ytest)
        Xtest, ytest = check_arrays(Xtest, ytest)
        set_random_seed(self.random_state)

        for k, model in enumerate(self.transferred_models):
            if score_type == "auc":
                self.scores[k] = _auc_(ytest, model.estimator_.predict_proba(Xtest)[:,1])
            else:
                self.scores[k] = model.score(Xtest,ytest)

        self.best_score = np.amax(self.scores)
        self.best_index = np.argmax(self.scores)
        self.best_method = self.algorithms[self.best_index]

        return self.best_score, self.best_index
    

class TransferForestSelector(BaseAdaptEstimator):
    """
    Per-tree selection of the best transfer algorithm inside a random forest.

    For every tree of the source forest, ``model_selection`` fits one
    ``TransferTreeSelector`` over ``algorithms`` and keeps the best-scoring
    transferred tree. The selected trees make up ``STRF_model``; in
    parallel, ``transferred_models`` holds one forest per algorithm in which
    every tree was transferred with that single algorithm.

    Parameters
    ----------
    estimator : fitted RandomForestClassifier
        Source forest.
    Xt, yt : array-like, optional
        Target inputs / labels, forwarded to ``BaseAdaptEstimator`` and to
        every ``TransferForestClassifier``.
    algorithms : list of str
        Values passed as ``algo`` to the transfer classifiers. An empty
        list is replaced by ``['src', 'trgt']`` (a warning is printed).
    bootstrap : bool, default True
        If True, each tree is adapted on a bootstrap sample of the target
        data drawn with ``ut._bootstrap_``.
    copy, verbose, random_state
        Forwarded to ``BaseAdaptEstimator``.
    **params
        Extra keyword arguments forwarded to ``BaseAdaptEstimator``.

    Attributes
    ----------
    estimator_ : RandomForestClassifier
        Copy of the source forest; replaced by the selected forest once
        ``model_selection`` has run.
    transferred_models : list of TransferForestClassifier
        One forest per algorithm, in the order of ``algorithms``.
    STRF_model : TransferForestClassifier
        Forest holding, for each tree, the transferred tree that won the
        selection.
    STRF_indexes : ndarray of shape (rf_size,)
        For each tree, index in ``algorithms`` of the selected algorithm.

    Raises
    ------
    ValueError
        If ``estimator`` is not a ``RandomForestClassifier`` or has no
        ``estimators_`` attribute.
    """
    def __init__(self,
                 estimator=None,
                 Xt=None,
                 yt=None,
                 algorithms=list(),
                 bootstrap=True,
                 copy=True,
                 verbose=1,
                 random_state=None,
                 **params):

        if not isinstance(estimator, RandomForestClassifier):
            raise ValueError("`estimator` argument must be a ``RandomForestClassifier`` instance, got %s."%str(type(estimator)))

        if not hasattr(estimator, "estimators_"):
            raise ValueError("`estimator` argument has no ``estimators_`` attribute, "
                                "please call `fit` on `estimator`.")

        estimator = check_fitted_estimator(estimator)

        # Single base-class initialisation. The block used to be duplicated,
        # and the second call omitted `random_state`, which discarded the
        # seed supplied by the caller.
        super().__init__(estimator=estimator,
                         Xt=Xt,
                         yt=yt,
                         copy=copy,
                         verbose=verbose,
                         random_state=random_state,
                         bootstrap=bootstrap,
                         **params)

        self.estimator_ = check_estimator(self.estimator,
                                          copy=self.copy,
                                          force_copy=True)

        if len(algorithms) == 0:
            print('Warning : empty list of methods. Default are Source and Target models.')
            self.algorithms = ['src','trgt']
        else:
            self.algorithms = algorithms

        self.rf_size = self.estimator_.n_estimators

        self.n_methods = len(self.algorithms)
        self.scores = np.zeros(self.n_methods)

        self.best_score = 0
        self.best_index = -1

        # One single-algorithm forest per entry of `algorithms`.
        self.transferred_models = [
            TransferForestClassifier(estimator=self.estimator,
                                     Xt=self.Xt,
                                     yt=self.yt,
                                     algo=algo,
                                     bootstrap=self.bootstrap,
                                     copy=self.copy)
            for algo in self.algorithms
        ]

        # Container for the per-tree selected models. Its trees are overwritten
        # in `model_selection`; it is built with the last algorithm of the list,
        # which is the value the previous code obtained from the leftover loop
        # variable.
        self.STRF_model = TransferForestClassifier(estimator=self.estimator,
                                                   Xt=self.Xt,
                                                   yt=self.yt,
                                                   algo=self.algorithms[-1],
                                                   bootstrap=self.bootstrap,
                                                   copy=self.copy)
        self.STRF_indexes = np.zeros(self.rf_size)

    def model_selection(self, Xt=None, yt=None, score_type = "auc", oob_ = False, **fit_params):
        """
        Transfer every tree with every algorithm and keep the best per tree.

        For each tree ``k`` of the forest a ``TransferTreeSelector`` is fitted
        on the target data (a bootstrap sample when ``self.bootstrap`` is
        True), then evaluated to pick the best algorithm for that tree.

        Parameters
        ----------
        Xt, yt : array-like, optional
            Target data, resolved through ``_get_target_data``.
        score_type : str, default "auc"
            Forwarded to ``TransferTreeSelector.select``.
        oob_ : bool, default False
            Only used when ``self.bootstrap`` is True: evaluate on the
            out-of-bag indices instead of the in-bag ones.
            NOTE(review): with ``score_type="auc"`` an out-of-bag sample
            containing a single class would make the AUC undefined — confirm
            ``ut._bootstrap_`` prevents this.
        **fit_params
            Accepted for signature compatibility; currently not used.

        Returns
        -------
        ndarray of shape (rf_size,)
            ``STRF_indexes``: for each tree, index in ``algorithms`` of the
            selected algorithm.

        Notes
        -----
        Side effects: tree ``k`` of ``STRF_model`` and of every entry of
        ``transferred_models`` is replaced, and both ``STRF_model.estimator_``
        and ``self.estimator_`` are set to the forest of selected trees.
        """
        Xt, yt = self._get_target_data(Xt, yt)
        Xt, yt = check_arrays(Xt, yt)
        set_random_seed(self.random_state)

        # Forest that receives the selected tree at every position.
        rf_out = copy.deepcopy(self.estimator)

        for k in range(self.rf_size):
            TTS = TransferTreeSelector(estimator=self.estimator_.estimators_[k],algorithms=self.algorithms)

            # Decide which samples are used to adapt and which to evaluate.
            if self.bootstrap:
                inds, oob_inds = ut._bootstrap_(yt.size,class_wise=True,y=yt)
                eval_inds = oob_inds if oob_ else inds
                X_fit, y_fit = Xt[inds], yt[inds]
                X_eval, y_eval = Xt[eval_inds], yt[eval_inds]
            else:
                X_fit, y_fit = Xt, yt
                X_eval, y_eval = Xt, yt

            TTS.fit(X_fit, y_fit)
            _, index = TTS.select(Xtest=X_eval,ytest=y_eval,score_type=score_type)

            self.STRF_indexes[k] = index

            self.STRF_model.estimators_[k] = TTS.transferred_models[index]

            # Keep the single-algorithm forests in sync, tree by tree.
            for j,m in enumerate(self.transferred_models):
                m.estimators_[k] = TTS.transferred_models[j]
                m.estimator_.estimators_[k] = TTS.transferred_models[j].estimator_

            rf_out.estimators_[k] = TTS.transferred_models[index].estimator_

        self.STRF_model.estimator_ = rf_out
        self.estimator_ = rf_out

        return self.STRF_indexes

<h3>1. Balanced data </h3>

In [None]:
# Prefix prepended to both CSV file names (empty string: names are used as given).
path_save = ''
name_bdd_sim, name_bdd_real = (
    'bdd_sim_test_ech5_win250_marg50.csv',
    'bdd_realfalls_test_ech1_win250_marg5.csv',
)

# Source domain: features are every column from the second one up to, but not
# including, the last two; labels come from the 'Label' column.
M_sim = pd.read_csv(path_save + name_bdd_sim)
X_source = np.array(M_sim.iloc[:, 1:-2]).astype('float32')
Y_source = np.array(M_sim['Label'])
names = np.array(M_sim['fname'])

# Target domain: same column layout. `names` is overwritten, so it ends up
# holding the target file names only.
M_real = pd.read_csv(path_save + name_bdd_real)
X_target = np.array(M_real.iloc[:, 1:-2]).astype('float32')
Y_target = np.array(M_real['Label'])
names = np.array(M_real['fname'])

In [None]:
# Experiment settings.
DT_only = False                   # True: single decision trees, False: random forests
K_FOLD, MAX, RF_SIZE = 5, 5, 10   # CV folds, maximum tree depth, trees per forest
n_ech_src, n_ech_tgt = 5, 1       # keep one row out of n_ech_* in each domain

# Indices of the feature columns that are kept (the first ten).
list_feats = np.arange(10)

# Subsample rows and restrict to the selected features.
# NOTE: the arrays are overwritten in place, so running this cell twice
# subsamples the source data a second time.
X_source, Y_source = X_source[::n_ech_src, list_feats], Y_source[::n_ech_src]
X_target, Y_target = X_target[::n_ech_tgt, list_feats], Y_target[::n_ech_tgt]

# Two independent, unfitted classifiers with identical settings:
# one for the source domain and one for the target domain.
if DT_only:
    clf_source, clf_target = (
        DecisionTreeClassifier(max_depth=MAX) for _ in range(2)
    )
else:
    clf_source, clf_target = (
        RandomForestClassifier(n_estimators=RF_SIZE, max_depth=MAX) for _ in range(2)
    )

### Question 1.1 :

Train two Random Forests (source and target) and assess them both on source and target using cross-validation.

In [None]:
# Cross-validated baselines. Suffix convention for the result arrays:
#   *_src      source model evaluated on the held-out source fold
#   *_src_tgt  source model evaluated on the whole target set
#   *_tgt      target model evaluated on the held-out target fold
#   *_tgt_src  target model evaluated on the whole source set
skf = select.StratifiedKFold(n_splits=K_FOLD)

# Accuracy and ROC AUC, one value per fold.
score_src, score_src_tgt, score_tgt_src, score_tgt = (np.zeros(K_FOLD) for _ in range(4))
auc_score_src, auc_score_src_tgt, auc_score_tgt_src, auc_score_tgt = (np.zeros(K_FOLD) for _ in range(4))

# ROC curves, one (fpr, tpr, thresholds) triple per fold. Object arrays are used
# because the curves of different folds may have different lengths.
fpr_roc_src, tpr_roc_src, th_roc_src = (np.zeros(K_FOLD, dtype=object) for _ in range(3))
fpr_roc_tgt, tpr_roc_tgt, th_roc_tgt = (np.zeros(K_FOLD, dtype=object) for _ in range(3))
fpr_roc_src_tgt, tpr_roc_src_tgt, th_roc_src_tgt = (np.zeros(K_FOLD, dtype=object) for _ in range(3))
fpr_roc_tgt_src, tpr_roc_tgt_src, th_roc_tgt_src = (np.zeros(K_FOLD, dtype=object) for _ in range(3))

# Source model: trained on source folds. After the loop `clf_source` stays fitted
# on the training part of the last fold.
for k, (train, test) in enumerate(skf.split(X_source, Y_source)):
    X_src_train, Y_src_train, X_src_test, Y_src_test = X_source[train], Y_source[train], X_source[test], Y_source[test]
    clf_source.fit(X_src_train, Y_src_train)
    score_src[k] = clf_source.score(X_src_test, Y_src_test)
    score_src_tgt[k] = clf_source.score(X_target, Y_target)

    # Positive-class probabilities, computed once and shared by AUC and ROC curve.
    proba_src = clf_source.predict_proba(X_src_test)[:,1]
    proba_src_tgt = clf_source.predict_proba(X_target)[:,1]

    auc_score_src[k] = metr.roc_auc_score(Y_src_test, proba_src)
    auc_score_src_tgt[k] = metr.roc_auc_score(Y_target, proba_src_tgt)

    fpr_roc_src[k],tpr_roc_src[k],th_roc_src[k] = metr.roc_curve(Y_src_test, proba_src)
    fpr_roc_src_tgt[k],tpr_roc_src_tgt[k],th_roc_src_tgt[k] = metr.roc_curve(Y_target, proba_src_tgt)

# Target model: trained on target folds.
for k, (train, test) in enumerate(skf.split(X_target, Y_target)):
    X_tgt_train, Y_tgt_train, X_tgt_test, Y_tgt_test = X_target[train], Y_target[train], X_target[test], Y_target[test]
    clf_target.fit(X_tgt_train, Y_tgt_train)
    score_tgt[k] = clf_target.score(X_tgt_test, Y_tgt_test)
    score_tgt_src[k] = clf_target.score(X_source, Y_source)

    proba_tgt = clf_target.predict_proba(X_tgt_test)[:,1]
    proba_tgt_src = clf_target.predict_proba(X_source)[:,1]

    auc_score_tgt[k] = metr.roc_auc_score(Y_tgt_test, proba_tgt)
    auc_score_tgt_src[k] = metr.roc_auc_score(Y_source, proba_tgt_src)

    fpr_roc_tgt[k],tpr_roc_tgt[k],th_roc_tgt[k] = metr.roc_curve(Y_tgt_test, proba_tgt)
    fpr_roc_tgt_src[k],tpr_roc_tgt_src[k],th_roc_tgt_src[k] = metr.roc_curve(Y_source, proba_tgt_src)

print('Score Target model: {:.3f}'.format(np.mean(score_tgt)))
print('Score Source model: {:.3f}'.format(np.mean(score_src)))
print('Score Source model on Target: {:.3f}'.format(np.mean(score_src_tgt)))
print('Score Target model on Source: {:.3f}'.format(np.mean(score_tgt_src)))

print('ROC AUC Target model: {:.3f}'.format(np.mean(auc_score_tgt)))
print('ROC AUC Source model: {:.3f}'.format(np.mean(auc_score_src)))
print('ROC AUC Source model on Target: {:.3f}'.format(np.mean(auc_score_src_tgt)))
print('ROC AUC Target model on Source: {:.3f}'.format(np.mean(auc_score_tgt_src)))

### Question 1.2 :

Plot the four ROC curves, one for each situation.
Why do these results motivate a transfer learning approach?

In [None]:
# Transfer algorithms to compare; the position in this list is the index
# reported by the selectors.
algo_list = ['src','trgt','relab','ser','strut']
n_versions = len(algo_list)

# Per-fold ROC AUC on the target test fold (no suffix) and on the whole
# source set (`_src`), for the selected forest (strf), STRUT and SER.
auc_score_strf, auc_score_strut, auc_score_ser = (np.zeros(K_FOLD) for _ in range(3))
auc_score_strf_src, auc_score_strut_src, auc_score_ser_src = (np.zeros(K_FOLD) for _ in range(3))

# Per-fold accuracy on the target test fold.
score_strf, score_strut, score_ser = (np.zeros(K_FOLD) for _ in range(3))

# Per fold, fraction of trees for which each algorithm was selected
# (only filled in forest mode).
count_ind = np.zeros((K_FOLD,n_versions))

for k, (train, test) in enumerate(skf.split(X_target, Y_target)):
    X_tgt_train, Y_tgt_train, X_tgt_test, Y_tgt_test = X_target[train], Y_target[train], X_target[test], Y_target[test]

    if DT_only:
        TTS = TransferTreeSelector(estimator=clf_source,algorithms=algo_list)
        TTS.fit(X_tgt_train,Y_tgt_train)
        selector = TTS
    else:
        TFS = TransferForestSelector(estimator=clf_source,algorithms=algo_list)
        TFS.model_selection(X_tgt_train,Y_tgt_train,score_type="auc",oob_=True)
        selector = TFS

        # Metrics of the forest built from the per-tree selected algorithms.
        score_strf[k] = TFS.STRF_model.score(X_tgt_test,Y_tgt_test)
        auc_score_strf[k] = metr.roc_auc_score(Y_tgt_test,TFS.STRF_model.estimator_.predict_proba(X_tgt_test)[:,1])
        auc_score_strf_src[k] = metr.roc_auc_score(Y_source,TFS.STRF_model.estimator_.predict_proba(X_source)[:,1])

        # How often each algorithm index was selected, as a fraction of the forest.
        counts = np.bincount(TFS.STRF_indexes.astype(int), minlength=n_versions)
        count_ind[k] = counts/RF_SIZE

    strut_model = selector.transferred_models[algo_list.index("strut")]
    ser_model = selector.transferred_models[algo_list.index("ser")]

    # SER / STRUT metrics are computed in both modes. Previously the AUCs were
    # skipped in DT_only mode, so the summary below printed zeros for them.
    score_strut[k] = strut_model.score(X_tgt_test,Y_tgt_test)
    score_ser[k] = ser_model.score(X_tgt_test,Y_tgt_test)

    auc_score_strut[k] = metr.roc_auc_score(Y_tgt_test,strut_model.estimator_.predict_proba(X_tgt_test)[:,1])
    auc_score_ser[k] = metr.roc_auc_score(Y_tgt_test,ser_model.estimator_.predict_proba(X_tgt_test)[:,1])

    auc_score_strut_src[k] = metr.roc_auc_score(Y_source,strut_model.estimator_.predict_proba(X_source)[:,1])
    auc_score_ser_src[k] = metr.roc_auc_score(Y_source,ser_model.estimator_.predict_proba(X_source)[:,1])

# Each group below is printed in the order STRUT, SER, selected forest.
print('SCORES:')
print(np.mean(score_strut))
print(np.mean(score_ser))
print(np.mean(score_strf))
print('AUC SCORES:')
print(np.mean(auc_score_strut))
print(np.mean(auc_score_ser))
print(np.mean(auc_score_strf))
print('AUC SCORES ON SRC:')
print(np.mean(auc_score_strut_src))
print(np.mean(auc_score_ser_src))
print(np.mean(auc_score_strf_src))

In [None]:
# One row per CV fold, one column per transfer algorithm; each value is the
# fraction of trees of the forest for which that algorithm was selected.
df = pd.DataFrame(count_ind, columns = algo_list)
ax = sns.barplot(data=df)
# Label the figure so it can be read on its own.
ax.set(xlabel='Transfer algorithm',
       ylabel='Fraction of trees selecting the algorithm',
       title='Per-tree algorithm selection across CV folds');

<h3>2. Imbalanced data </h3>

In [None]:
# Prefix prepended to both CSV file names (empty string: names are used as given).
path_save = ''

# Same source file as in the balanced experiment; the target file is the 'IMB_' one.
name_bdd_sim = 'bdd_sim_test_ech5_win250_marg50.csv'
name_bdd_real = 'IMB_bdd_realfalls_test_ech1_win250_marg5.csv'

# Source domain: labels from 'Label', features from the second column up to,
# but not including, the last two.
M_sim = pd.read_csv(path_save + name_bdd_sim)
names = np.array(M_sim['fname'])
Y_source, X_source = (
    np.array(M_sim['Label']),
    np.array(M_sim.iloc[:, 1:-2]).astype('float32'),
)

# Target domain, same layout. `names` now holds the target file names.
M_real = pd.read_csv(path_save + name_bdd_real)
names = np.array(M_real['fname'])
Y_target, X_target = (
    np.array(M_real['Label']),
    np.array(M_real.iloc[:, 1:-2]).astype('float32'),
)

### (Bonus) Question 2 : Try any of the transfer algorithms above on this new imbalanced data.