In [1]:
# !pip3 install adapt

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.base import clone
from sklearn.utils.validation import check_array

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier

from adapt.instance_based import TrAdaBoost
from adapt.base import BaseAdaptEstimator
from scipy.sparse import vstack, issparse

In [5]:
datafile = 'flare.dat'
df = pd.read_csv(datafile, sep="\s+", header=None)

cat = pd.DataFrame()
cat[0] = df[0]
cat[1] = df[1]
cat[2] = df[11]
df = df.drop([0,1,11], axis=1)

cat = cat.apply(LabelEncoder().fit_transform)
df[0], df[1], classes = cat[0], cat[1], cat[2]

for i in range(2,11):
    df[i] = df[i].str.replace(',', '').astype(int)

model = KMeans(n_clusters=2)
model.fit(df)
y_pred = pd.DataFrame(model.predict(df))
clusters = model.fit_predict(df)

df['y_pred'] = y_pred

Xt = df[df['y_pred'] == 0]
yt = classes[df['y_pred']==0]
Xs = df[df['y_pred'] == 1]
ys = classes[df['y_pred'] == 1]

split = int(len(Xt)/2)
Xtest = Xt[split:]
ytest = yt[split:]
Xt = Xt[:split]
yt = yt[:split]
Xs.shape, Xt.shape, Xtest.shape, ys.shape, yt.shape, ytest.shape

((778, 12), (144, 12), (144, 12), (778,), (144,), (144,))

In [6]:
class OwnTrAdaBoost(object):
    def __init__(self,N=10,base_estimator=DecisionTreeClassifier(),score=roc_auc_score):    
        self.N=N
        self.base_estimator=base_estimator
        self.score=score
        self.beta_all = None
        self.estimators=[]
            
    def _calculate_weights(self,weights): 
        weights = weights.ravel()     
        total = np.sum(weights)   
        print(total,np.min(weights),np.max(weights))   
        return np.asarray(weights / total, order='C')      
                    
    def _calculate_error_rate(self,y_true, y_pred, weight):      
        weight = weight.ravel()
        total = np.sum(weight) 
        print(total,np.min(weight),np.max(weight))     
        return np.sum(weight / total * np.abs(y_true ^ y_pred))      
             
    def fit(self,source,target,source_label,target_label):
        source_shape=source.shape[0]
        target_shape=target.shape[0]
        trans_data = np.concatenate((source, target), axis=0)      
        trans_label = np.concatenate((source_label,target_label), axis=0)      
        weights_source = np.ones([source_shape, 1])/source_shape      
        weights_target = np.ones([target_shape, 1])/target_shape
        weights = np.concatenate((weights_source, weights_target), axis=0)
        
        bata = 1 / (1 + np.sqrt(2 * np.log(source_shape) / self.N))    
        self.beta_all = np.zeros([1, self.N])
        result_label = np.ones([source_shape+target_shape, self.N])    

        trans_data = np.asarray(trans_data, order='C')
        trans_label = np.asarray(trans_label, order='C')     
        
        best_round = 0
        score=0
        flag=0
        
        for i in range(self.N):      
            P = self._calculate_weights(weights) 
            est = clone(self.base_estimator).fit(trans_data,trans_label,sample_weight=P.ravel())
            self.estimators.append(est)
            y_preds=est.predict(trans_data)
            result_label[:, i]=y_preds

            y_target_pred=est.predict(target)
            error_rate = self._calculate_error_rate(target_label, y_target_pred,  \
                                              weights[source_shape:source_shape + target_shape, :])  
            if error_rate >= 0.5 or error_rate == 0:      
                self.N = i
                print('early stop! due to error_rate=%.2f'%(error_rate))      
                break       

            self.beta_all[0, i] = error_rate / (1 - error_rate)      
     
            for j in range(target_shape):      
                weights[source_shape + j] = weights[source_shape + j] * \
                np.power(self.beta_all[0, i],(-np.abs(result_label[source_shape + j, i] - target_label[j])))
  
            for j in range(source_shape):      
                weights[j] = weights[j] * np.power(bata,np.abs(result_label[j, i] - source_label[j]))
                
            tp=self.score(target_label,y_target_pred)
            print('The '+str(i)+' rounds score is '+str(tp))

    def _predict_one(self, x):
        """
        Output the hypothesis for a single instance
        :param x: array-like
            target label of a single instance from each iteration in order
        :return: 0 or 1
        """
        x, N = check_array(x, ensure_2d=False), self.N
        # replace 0 by 1 to avoid zero division and remove it from the product
        beta = [self.beta_all[0,t] if self.beta_all[0,t] != 0 else 1 for t in range(int(np.ceil(N/2)), N)]
        cond = np.prod([b ** -x for b in beta]) >= np.prod([b ** -0.5 for b in beta])
        return int(cond)

    def predict(self, x_test):
        y_pred_list = np.array([est.predict(x_test) for est in self.estimators]).T
        y_pred = np.array(list(map(self._predict_one, y_pred_list)))
        return y_pred


In [7]:
%reload_ext autoreload
base_estimator = DecisionTreeClassifier(max_depth=2)
clf = OwnTrAdaBoost(N=3,base_estimator=base_estimator,score=accuracy_score)
clf.fit(Xs,Xt,ys,yt)

ys_pred = clf.predict(Xs)
yt_pred = clf.predict(Xt)
ytest_pred = clf.predict(Xtest)
print(np.unique(ys_pred))
print(np.unique(yt_pred))
print(np.unique(ytest_pred))
print('train acc:',accuracy_score(ys,ys_pred))
print('target acc:',accuracy_score(yt,yt_pred))
print('target_test acc:',accuracy_score(ytest,ytest_pred))

2.0 0.0012853470437017994 0.006944444444444444
1.0000000000000002 0.006944444444444444 0.006944444444444444
early stop! due to error_rate=1.24
[1]
[1]
[1]
train acc: 0.2352185089974293
target acc: 0.09027777777777778
target_test acc: 0.10416666666666667


In [8]:
%reload_ext autoreload
base_estimator = LinearSVC()
clf = OwnTrAdaBoost(N=3,base_estimator=base_estimator,score=accuracy_score)
clf.fit(Xs,Xt,ys,yt)

ys_pred = clf.predict(Xs)
yt_pred = clf.predict(Xt)
ytest_pred = clf.predict(Xtest)
print(np.unique(ys_pred))
print(np.unique(yt_pred))
print(np.unique(ytest_pred))
print('train acc:',accuracy_score(ys,ys_pred))
print('target acc:',accuracy_score(yt,yt_pred))
print('target_test acc:',accuracy_score(ytest,ytest_pred))

2.0 0.0012853470437017994 0.006944444444444444
1.0000000000000002 0.006944444444444444 0.006944444444444444
early stop! due to error_rate=1.30
[1]
[1]
[1]
train acc: 0.2352185089974293
target acc: 0.09027777777777778
target_test acc: 0.10416666666666667


In [9]:
%reload_ext autoreload
base_estimator = SVC()
clf = OwnTrAdaBoost(N=3,base_estimator=base_estimator,score=accuracy_score)
clf.fit(Xs,Xt,ys,yt)

ys_pred = clf.predict(Xs)
yt_pred = clf.predict(Xt)
ytest_pred = clf.predict(Xtest)
print(np.unique(ys_pred))
print(np.unique(yt_pred))
print(np.unique(ytest_pred))
print('train acc:',accuracy_score(ys,ys_pred))
print('target acc:',accuracy_score(yt,yt_pred))
print('target_test acc:',accuracy_score(ytest,ytest_pred))

2.0 0.0012853470437017994 0.006944444444444444
1.0000000000000002 0.006944444444444444 0.006944444444444444
early stop! due to error_rate=2.22
[1]
[1]
[1]
train acc: 0.2352185089974293
target acc: 0.09027777777777778
target_test acc: 0.10416666666666667


In [10]:
baseline = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=3)
baseline.fit(Xs,ys)
ys_pred = baseline.predict(Xs)
yt_pred = baseline.predict(Xt)
ytest_pred = baseline.predict(Xtest)
print('train acc:',accuracy_score(ys,ys_pred))
print('target acc:',accuracy_score(yt,yt_pred))
print('target_test acc:',accuracy_score(ytest,ytest_pred))

train acc: 0.87146529562982
target acc: 0.4722222222222222
target_test acc: 0.5069444444444444


In [11]:
ys = ys.reset_index(drop=True)
yt = yt.reset_index(drop=True)
ytest = ytest.reset_index(drop=True)

In [12]:
model = TrAdaBoost(GaussianProcessClassifier(), n_estimators=10, Xt=Xt, yt=yt, random_state=0)
model.fit(Xs, ys)

Iteration 0 - Error: 0.6785
Iteration 1 - Error: 0.6384
Iteration 2 - Error: 0.6083
Iteration 3 - Error: 0.5949
Iteration 4 - Error: 0.5748
Iteration 5 - Error: 0.5724
Iteration 6 - Error: 0.5796
Iteration 7 - Error: 0.5814
Iteration 8 - Error: 0.5906
Iteration 9 - Error: 0.5947


TrAdaBoost(Xt=     2  3  4  5  6  7  8  9  10  0  1  y_pred
0    1  3  1  1  1  1  0  0   0  0  3       0
5    1  2  1  1  2  1  0  0   0  0  2       0
7    1  3  1  1  2  1  0  0   0  0  2       0
9    1  2  1  1  2  1  0  0   0  0  2       0
31   1  3  1  1  2  1  3  1   0  0  0       0
..  .. .. .. .. .. .. .. ..  .. .. ..     ...
545  2  2  1  2  2  1  1  0   0  1  2       0
550  2  2  1  2  2  1  2  0   0  2  1       0
551  1  3  1  2  2  1  0  0   0  1  2       0
556  1  2  1  2  2  1  4  0   0  2  1       0
562  2  3  1  2  2  2  1  1   0  0  1       0

[144 rows x 12 columns],
           estimator=GaussianProcessClassifier(), random_state=0,
           yt=0      5
1      1
2      1
3      1
4      3
      ..
139    3
140    3
141    4
142    4
143    4
Name: 2, Length: 144, dtype: int64)

In [13]:
model.score(Xt, yt)

0.8125

In [14]:
model = TrAdaBoost(LinearSVC(), n_estimators=10, Xt=Xt, yt=yt, random_state=0)
model.fit(Xs, ys)

Iteration 0 - Error: 0.6963
Iteration 1 - Error: 0.6988
Iteration 2 - Error: 0.7014
Iteration 3 - Error: 0.7048
Iteration 4 - Error: 0.7065
Iteration 5 - Error: 0.7085
Iteration 6 - Error: 0.7117
Iteration 7 - Error: 0.7181
Iteration 8 - Error: 0.7310
Iteration 9 - Error: 0.7424


TrAdaBoost(Xt=     2  3  4  5  6  7  8  9  10  0  1  y_pred
0    1  3  1  1  1  1  0  0   0  0  3       0
5    1  2  1  1  2  1  0  0   0  0  2       0
7    1  3  1  1  2  1  0  0   0  0  2       0
9    1  2  1  1  2  1  0  0   0  0  2       0
31   1  3  1  1  2  1  3  1   0  0  0       0
..  .. .. .. .. .. .. .. ..  .. .. ..     ...
545  2  2  1  2  2  1  1  0   0  1  2       0
550  2  2  1  2  2  1  2  0   0  2  1       0
551  1  3  1  2  2  1  0  0   0  1  2       0
556  1  2  1  2  2  1  4  0   0  2  1       0
562  2  3  1  2  2  2  1  1   0  0  1       0

[144 rows x 12 columns],
           estimator=LinearSVC(), random_state=0,
           yt=0      5
1      1
2      1
3      1
4      3
      ..
139    3
140    3
141    4
142    4
143    4
Name: 2, Length: 144, dtype: int64)

In [15]:
model.score(Xt, yt)

0.5972222222222222

In [16]:
model = TrAdaBoost(MLPClassifier(), n_estimators=10, Xt=Xt, yt=yt, random_state=0)
model.fit(Xs, ys)

Iteration 0 - Error: 0.4409
Iteration 1 - Error: 0.4932
Iteration 2 - Error: 0.5182
Iteration 3 - Error: 0.5298
Iteration 4 - Error: 0.5123
Iteration 5 - Error: 0.5502
Iteration 6 - Error: 0.5608
Iteration 7 - Error: 0.5729
Iteration 8 - Error: 0.5810
Iteration 9 - Error: 0.5809


TrAdaBoost(Xt=     2  3  4  5  6  7  8  9  10  0  1  y_pred
0    1  3  1  1  1  1  0  0   0  0  3       0
5    1  2  1  1  2  1  0  0   0  0  2       0
7    1  3  1  1  2  1  0  0   0  0  2       0
9    1  2  1  1  2  1  0  0   0  0  2       0
31   1  3  1  1  2  1  3  1   0  0  0       0
..  .. .. .. .. .. .. .. ..  .. .. ..     ...
545  2  2  1  2  2  1  1  0   0  1  2       0
550  2  2  1  2  2  1  2  0   0  2  1       0
551  1  3  1  2  2  1  0  0   0  1  2       0
556  1  2  1  2  2  1  4  0   0  2  1       0
562  2  3  1  2  2  2  1  1   0  0  1       0

[144 rows x 12 columns],
           estimator=MLPClassifier(), random_state=0,
           yt=0      5
1      1
2      1
3      1
4      3
      ..
139    3
140    3
141    4
142    4
143    4
Name: 2, Length: 144, dtype: int64)

In [17]:
model.score(Xt, yt)

0.7361111111111112

In [18]:
model = TrAdaBoost(LinearDiscriminantAnalysis(), n_estimators=10, Xt=Xt, yt=yt, random_state=0)
model.fit(Xs, ys)

Iteration 0 - Error: 0.4802
Iteration 1 - Error: 0.5404
Iteration 2 - Error: 0.5832
Iteration 3 - Error: 0.6209
Iteration 4 - Error: 0.6611
Iteration 5 - Error: 0.6850
Iteration 6 - Error: 0.7009
Iteration 7 - Error: 0.7072
Iteration 8 - Error: 0.7053
Iteration 9 - Error: 0.7010


TrAdaBoost(Xt=     2  3  4  5  6  7  8  9  10  0  1  y_pred
0    1  3  1  1  1  1  0  0   0  0  3       0
5    1  2  1  1  2  1  0  0   0  0  2       0
7    1  3  1  1  2  1  0  0   0  0  2       0
9    1  2  1  1  2  1  0  0   0  0  2       0
31   1  3  1  1  2  1  3  1   0  0  0       0
..  .. .. .. .. .. .. .. ..  .. .. ..     ...
545  2  2  1  2  2  1  1  0   0  1  2       0
550  2  2  1  2  2  1  2  0   0  2  1       0
551  1  3  1  2  2  1  0  0   0  1  2       0
556  1  2  1  2  2  1  4  0   0  2  1       0
562  2  3  1  2  2  2  1  1   0  0  1       0

[144 rows x 12 columns],
           estimator=LinearDiscriminantAnalysis(), random_state=0,
           yt=0      5
1      1
2      1
3      1
4      3
      ..
139    3
140    3
141    4
142    4
143    4
Name: 2, Length: 144, dtype: int64)

In [19]:
model.score(Xt, yt)

0.4861111111111111

In [20]:
from adapt.base import BaseAdaptEstimator
from scipy.sparse import vstack, issparse

# We create here the AUX model which consist in a balanced weighting
# between instances from source and target domains.
class BalancedWeighting(BaseAdaptEstimator):

    def __init__(self, estimator=None, alpha=1., Xt=None, yt=None):
        super().__init__(estimator=estimator, alpha=alpha, Xt=Xt, yt=yt)

    def fit(self, Xs, ys, Xt=None, yt=None, **kwargs):
        Xt, yt = self._get_target_data(Xt, yt)
        if issparse(Xs):
            X = vstack((Xs, Xt))
        else:
            X = np.concatenate((Xs, Xt))
        y = np.concatenate((ys, yt))
        sample_weight = np.ones(X.shape[0])
        sample_weight[Xs.shape[0]:] *= (Xs.shape[0] / Xt.shape[0]) * self.alpha

        self.fit_estimator(X, y, sample_weight=sample_weight)

In [22]:
names = ["SVM", "SVMt", "AUX", "TrAdaBoost"]
scores = {k: [] for k in names}

for state in range(10):
    np.random.seed(state)
    if state == 0:
        print("Xs shape: %s, Xt shape: %s"%(str(Xs.shape), str(Xt.shape)))
    models = [
        LinearSVC(class_weight="balanced"),
        LinearSVC(class_weight="balanced"),
        BalancedWeighting(LinearSVC(class_weight="balanced"), alpha=4., Xt=Xtest, yt=ytest),
        TrAdaBoost(LinearSVC(class_weight="balanced"), n_estimators=100, verbose=0, Xt=Xtest, yt=ytest)
    ]
    for model, name in zip(models, names):
        model.fit(Xs, ys)
        scores[name].append(1-model.score(Xt, yt))

    print("Round %i : %s"%(state, str({k: v[-1] for k, v in scores.items()})))

Xs shape: (778, 12), Xt shape: (144, 12)
Round 0 : {'SVM': 0.5694444444444444, 'SVMt': 0.5694444444444444, 'AUX': 0.45833333333333337, 'TrAdaBoost': 0.5763888888888888}
Round 1 : {'SVM': 0.5694444444444444, 'SVMt': 0.5625, 'AUX': 0.4375, 'TrAdaBoost': 0.5833333333333333}
Round 2 : {'SVM': 0.5694444444444444, 'SVMt': 0.5625, 'AUX': 0.47916666666666663, 'TrAdaBoost': 0.5833333333333333}
Round 3 : {'SVM': 0.5625, 'SVMt': 0.5694444444444444, 'AUX': 0.5138888888888888, 'TrAdaBoost': 0.5763888888888888}
Round 4 : {'SVM': 0.5694444444444444, 'SVMt': 0.5625, 'AUX': 0.48611111111111116, 'TrAdaBoost': 0.5555555555555556}
Round 5 : {'SVM': 0.5694444444444444, 'SVMt': 0.5694444444444444, 'AUX': 0.5, 'TrAdaBoost': 0.5555555555555556}
Round 6 : {'SVM': 0.5694444444444444, 'SVMt': 0.5694444444444444, 'AUX': 0.4652777777777778, 'TrAdaBoost': 0.5833333333333333}
Round 7 : {'SVM': 0.5694444444444444, 'SVMt': 0.5694444444444444, 'AUX': 0.5277777777777778, 'TrAdaBoost': 0.5555555555555556}
Round 8 : {'SVM

In [23]:
error_mu = np.round(pd.DataFrame(pd.DataFrame(scores).mean(0), columns=["Error"]), 3).transpose().astype(str)
error_std = np.round(pd.DataFrame(pd.DataFrame(scores).std(0), columns=["Error"]), 3).transpose().astype(str)
display(error_mu + " (" + error_std + ")")

Unnamed: 0,SVM,SVMt,AUX,TrAdaBoost
Error,0.569 (0.002),0.567 (0.003),0.478 (0.031),0.569 (0.016)
