# Reading Data


We aim to test TIREX on a house-pricing dataset. To do so, we build a classifier after reducing the dimension with TIREX. The classifier tries to predict whether the price of a given house exceeds a high threshold (which we set here to the 90% quantile of Y).
For more information about the dataset used, please see https://archive.ics.uci.edu/ml/datasets/Residential+Building+Data+Set .

In [20]:
import pandas as pd
import numpy as np
from TIREX_src import TIREX
Target = "V-9"  # Target variable: the price
# Skip the first spreadsheet row when loading, then discard column V-10
# (the cost of the house) so that the prediction task is harder.
bd = pd.read_excel("Residential-Building-Data-Set.xlsx", skiprows=1).drop(columns=["V-10"])


In [21]:
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


# Threshold above which a price counts as "extreme": the 90% quantile of the
# target, computed over the whole dataset.
q_90 = bd[Target].quantile(0.90)

# Feature matrix (every column except the target) and raw target values.
X = bd.drop(columns=[Target]).values
Y = bd[Target].values

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=24
)

# Test observations whose target reaches the threshold.
is_extreme_test = Y_test >= q_90
X_test_extreme = X_test[is_extreme_test]
Y_test_extreme = Y_test[is_extreme_test]

# Binary labels: 1 when the target reaches the threshold, 0 otherwise.
Y_train_classif = np.where(Y_train >= q_90, 1, 0)
Y_test_classif = np.where(is_extreme_test, 1, 0)




# Simulation description

TIREX is compared to other dimensionality reduction techniques: principal component analysis (PCA), Singular Value Decomposition
(SVD), Locally Linear Embedding (LLE), Isomap (Imp), and CUME. For the sake of comparison, the dimension of
the subspace output by all dimension reduction algorithms is set to d = 2. The
different methods are compared in terms of their AUC (Area under the ROC
Curve) on an independent test set.

# Results

In [22]:

from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import accuracy_score as ACC
from sklearn.metrics import roc_auc_score as AUC
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from numpy import linalg as LA



# Function to test the model
def test_model(algo,dim,model,X_train,Y_train,X_test,Y_test):
    """Fit one pipeline per dimension-reduction method and report its test AUC.

    For every entry of ``algo`` a pipeline
    ``[reducer ->] StandardScaler -> model`` is fitted on ``X_train`` and
    scored on ``X_test`` with the area under the ROC curve, using the
    positive-class column of ``predict_proba``. The AUC is printed and stored.

    Parameters
    ----------
    algo : dict
        Maps a method name to a reducer usable as a pipeline step. The entry
        named ``"all features"`` is fitted without any reducer (its value is
        ignored).
    dim : int
        Not used by this function; kept so existing calls keep working.
    model : estimator
        Final pipeline step; must expose ``predict_proba``.
    X_train, X_test : array-like
        Training and test features.
    Y_train, Y_test : array-like
        Not used by this function (see the note below).

    Returns
    -------
    dict
        Maps each method name to a one-element list holding its test AUC.

    Notes
    -----
    NOTE(review): labels are read from the module-level ``Y_train_classif``
    and ``Y_test_classif`` rather than from ``Y_train`` / ``Y_test``. As a
    consequence every pipeline step, including the reducer, is fitted on the
    binary labels, whereas ``cross_val_k`` fits TIREX on the ``Y_train`` it
    receives. Confirm this is intended before relying on the parameters.
    """
    predictions_storage = {}
    for algo_name, algo_reduce in algo.items():
        # Scaling and classification are common to all methods; only the
        # baseline skips the leading dimension-reduction step.
        steps = [('sclr', StandardScaler()), ('m', model)]
        if algo_name != "all features":
            steps.insert(0, (algo_name, algo_reduce))

        model_f = Pipeline(steps=steps)
        model_f.fit(X_train, Y_train_classif)

        # Score of the positive class (price above the threshold).
        predictions_f = model_f.predict_proba(X_test)[:, 1]

        # Compute the AUC once and reuse it for both reporting and storage.
        auc_value = AUC(Y_test_classif, predictions_f)
        print(f" AUC using {algo_name} on the  test set:", end=" ")
        print(f"{auc_value} \n")
        predictions_storage[algo_name] = [auc_value]

    return predictions_storage


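The value of the TIREX parameter k (the extreme threshold) is selected by 5-fold cross-validation on the training set: among the candidate values, we keep the one with the highest mean validation AUC.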

In [23]:
from math import sqrt
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.ensemble import GradientBoostingRegressor


dim = 2  # Desired number of components
n = X_train.shape[0]  # Number of training observations
k_base = int(sqrt(n))  # Base value of the extreme threshold k

# Candidate values of k: k_base, 2*k_base, ..., 9*k_base
L_k = [k_base * multiplier for multiplier in range(1, 10)]
model = KNeighborsClassifier(n_neighbors=5)

def cross_val_k(X_train,Y_train,L_k,method):
    """Return the candidate ``k`` from ``L_k`` with the best mean validation AUC.

    For each candidate, TIREX (with the module-level ``dim`` components and
    the given ``method``) is fitted on all of ``X_train`` / ``Y_train`` and
    used to project ``X_train``. A ``StandardScaler`` + module-level ``model``
    pipeline is then evaluated on the projected data with 5-fold
    cross-validation, using the module-level binary labels
    ``Y_train_classif``. Progress is printed at each candidate.

    Parameters
    ----------
    X_train : array-like
        Training features.
    Y_train : array-like
        Target passed to ``TIREX.fit`` and to the fold splitter.
    L_k : list
        Candidate values of ``k``.
    method : str
        Forwarded to ``TIREX`` as ``method``.

    Returns
    -------
    The element of ``L_k`` whose mean AUC over the folds is largest (the
    first one in case of ties).
    """
    n_candidates = len(L_k)
    mean_auc_per_k = []

    for step, k in enumerate(L_k, start=1):
        print("STEP N° :", step, "/", n_candidates)

        # The reduction is learned once per candidate, before the folds are made.
        reducer = TIREX(n_components=dim, k=k, method=method)
        reducer.fit(X_train, Y_train)
        X_reduced = reducer.transform(X_train)

        folds = KFold(n_splits=5)
        classifier = Pipeline([('sclr', StandardScaler()), ('m', model)])

        fold_aucs = []
        for fit_idx, val_idx in folds.split(X_reduced, Y_train):
            classifier.fit(X_reduced[fit_idx], Y_train_classif[fit_idx])
            # Score of the positive class on the held-out fold.
            val_scores = classifier.predict_proba(X_reduced[val_idx])[:, 1]
            fold_aucs.append(AUC(Y_train_classif[val_idx], val_scores))

        mean_auc_per_k.append(sum(fold_aucs) / len(fold_aucs))

    return L_k[np.argmax(mean_auc_per_k)]


# Select k separately for the first-order ("FO") and second-order ("SO") methods.
k_opt_FO, k_opt_SO = (
    cross_val_k(X_train, Y_train, L_k, method) for method in ("FO", "SO")
)



STEP N° : 1 / 9
STEP N° : 2 / 9
STEP N° : 3 / 9
STEP N° : 4 / 9
STEP N° : 5 / 9
STEP N° : 6 / 9
STEP N° : 7 / 9
STEP N° : 8 / 9
STEP N° : 9 / 9
STEP N° : 1 / 9
STEP N° : 2 / 9
STEP N° : 3 / 9
STEP N° : 4 / 9
STEP N° : 5 / 9
STEP N° : 6 / 9
STEP N° : 7 / 9
STEP N° : 8 / 9
STEP N° : 9 / 9


In [24]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA,TruncatedSVD
from sklearn.manifold import Isomap,LocallyLinearEmbedding
from math import sqrt



# Dimension-reduction methods to compare. The "all features" entry carries no
# reducer: test_model fits it on the scaled features directly.
algo = {
    "all features": None,
    "tirex_first_order": TIREX(k=k_opt_FO, n_components=dim, method="FO"),
    "tirex_second_order": TIREX(k=k_opt_SO, n_components=dim, method="SO"),
    "CUME FO": TIREX(n_components=dim, method="FO", mode="CUME"),
    "CUME SO": TIREX(n_components=dim, method="SO", mode="CUME"),
    'PCA': PCA(n_components=dim),
    'SVD': TruncatedSVD(n_components=dim),
    "LLE": LocallyLinearEmbedding(n_components=dim, eigen_solver="dense"),
    "IMP": Isomap(n_components=dim, eigen_solver="dense"),
}

predictions_extreme_storage = test_model(
    algo, dim, model, X_train, Y_train, X_test, Y_test
)


 AUC using all features on the  test set: 0.9118104118104118 

 AUC using tirex_first_order on the  test set: 0.95998445998446 

 AUC using tirex_second_order on the  test set: 0.9331779331779333 

 AUC using CUME FO on the  test set: 0.8174048174048174 

 AUC using CUME SO on the  test set: 0.8659673659673659 

 AUC using PCA on the  test set: 0.8092463092463092 

 AUC using SVD on the  test set: 0.8108003108003108 

 AUC using LLE on the  test set: 0.820901320901321 



  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.

 AUC using IMP on the  test set: 0.7766122766122767 



  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
