In [8]:
# Reload the autoreload extension and switch it to mode 2, which re-imports
# all modules before each cell executes, so edits to imported packages
# (e.g. `asa`) are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [9]:
import numpy as np
from scipy.special import expit
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from asa.feature_selection_methods import search_combination_OLS, search_combination_RF_reg, search_combination_RF_cls
from asa.dataset import Dataset

## Using the standalone functions

### Regression

In [10]:
# Synthetic regression problem: 100 samples with 10 uniform features, of which
# only columns 2, 5 and 8 enter the target (weights 1, 0.5, 0.3), plus uniform
# noise. A locally seeded generator keeps the data identical on every run.
rng = np.random.default_rng(0)
X = rng.random((100, 10))
y = X[:, 2] + 0.5 * X[:, 5] + 0.3 * X[:, 8] + rng.random(100)

#### OLS

In [18]:
# Search with default settings. The stored output below shows a pair of column
# indices followed by a statsmodels results wrapper and a function of X.
ols_default = search_combination_OLS(X, y)
ols_default

((2, 5),
 (<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x24fb0cb14c0>,
  <function asa.linear_model.linear_model.get_OLS_nd.<locals>.func(X)>))

In [19]:
# Same search, this time requesting five components.
ols_five = search_combination_OLS(X, y, n_components=5)
ols_five

((1, 2, 5, 7, 8),
 (<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x24fb0c68e80>,
  <function asa.linear_model.linear_model.get_OLS_nd.<locals>.func(X)>))

In [20]:
# Five-component search using metric='bic'. `allowe_small_n` is the keyword as
# spelled by the library API. Presumably it lets the search settle on fewer
# than n_components columns (the stored output has three) -- TODO confirm.
bic_options = dict(n_components=5, metric='bic', allowe_small_n=True)
ols_bic = search_combination_OLS(X, y, **bic_options)
ols_bic

((2, 5, 8),
 (<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x24fb0a72df0>,
  <function asa.linear_model.linear_model.get_OLS_nd.<locals>.func(X)>))

In [21]:
# Regression data as before (column 8 now has weight 0.1), drawn from a locally
# seeded generator so the example is reproducible. Samples 10-19 are then
# overwritten with y = 10 to contaminate the target with extreme values.
rng = np.random.default_rng(1)
X = rng.random((100, 10))
y = X[:, 2] + 0.5 * X[:, 5] + 0.1 * X[:, 8] + rng.random(100)
y[10:20] = 10

In [22]:
# Plain OLS search on the current X, y; unpack the two-part result and display
# the selected column indices.
ols_search = search_combination_OLS(X, y)
best_combination, best_results = ols_search
best_combination

(5, 7)

In [23]:
# Regression report of the fitted model, which is the first item of
# `best_results`.
fit_report = best_results[0].summary()
fit_report

0,1,2,3
Dep. Variable:,y,R-squared:,0.073
Model:,OLS,Adj. R-squared:,0.054
Method:,Least Squares,F-statistic:,3.846
Date:,"Wed, 06 Dec 2023",Prob (F-statistic):,0.0247
Time:,21:01:12,Log-Likelihood:,-235.51
No. Observations:,100,AIC:,477.0
Df Residuals:,97,BIC:,484.8
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4200,0.678,0.620,0.537,-0.925,1.765
x1,1.9201,0.954,2.013,0.047,0.027,3.813
x2,1.4832,0.931,1.594,0.114,-0.364,3.330

0,1,2,3
Omnibus:,61.305,Durbin-Watson:,0.303
Prob(Omnibus):,0.0,Jarque-Bera (JB):,173.155
Skew:,2.383,Prob(JB):,2.51e-38
Kurtosis:,7.341,Cond. No.,5.22


In [24]:
# Repeat the search with is_sigma_clip=True, then unpack the result and display
# the selected column indices.
clipped_search = search_combination_OLS(X, y, is_sigma_clip=True)
best_combination, best_results = clipped_search
best_combination

(2, 5)

In [25]:
# Regression report for the current `best_results`; the stored output below
# lists 90 observations.
clipped_report = best_results[0].summary()
clipped_report

0,1,2,3
Dep. Variable:,y,R-squared:,0.536
Model:,OLS,Adj. R-squared:,0.525
Method:,Least Squares,F-statistic:,50.23
Date:,"Wed, 06 Dec 2023",Prob (F-statistic):,3.14e-15
Time:,21:01:14,Log-Likelihood:,-20.509
No. Observations:,90,AIC:,47.02
Df Residuals:,87,BIC:,54.52
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5883,0.079,7.454,0.000,0.431,0.745
x1,0.8999,0.118,7.627,0.000,0.665,1.134
x2,0.4857,0.124,3.913,0.000,0.239,0.732

0,1,2,3
Omnibus:,36.49,Durbin-Watson:,2.12
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5.943
Skew:,-0.068,Prob(JB):,0.0512
Kurtosis:,1.748,Cond. No.,5.29


#### RF

In [6]:
# Regenerate outlier-free regression data for the random-forest demo. Without
# this, a top-to-bottom run would reuse the arrays from the sigma-clipping
# example, in which ten targets were overwritten with 10.
rng = np.random.default_rng(2)
X = rng.random((100, 10))
y = X[:, 2] + 0.5 * X[:, 5] + 0.3 * X[:, 8] + rng.random(100)

# Random-forest search. CVS_kwargs is handed to the library (presumably
# forwarded to its cross-validated search -- TODO confirm); n_jobs=-1 is the
# scikit-learn convention for using all available cores.
best = search_combination_RF_reg(X, y, CVS_kwargs={'n_jobs': -1})
best

((2, 5),
 CVBest(
     best_score=-0.1074317168760371, 
     best_estimator=RandomForestRegressor(max_depth=5, n_estimators=10)
 ))

In [7]:
# Same random-forest search with drop_estimator=True; the stored output below
# shows best_estimator=None in the returned CVBest.
parallel_cv = {'n_jobs': -1}
best = search_combination_RF_reg(X,
                                 y,
                                 CVS_kwargs=parallel_cv,
                                 drop_estimator=True)
best

((2, 8),
 CVBest(
     best_score=-0.11426732952753926, 
     best_estimator=None
 ))

### Classification

In [7]:
# Binary classification problem: the label is 1 when
# x2 + x5 + 0.3 * x8 (plus a small uniform noise term) exceeds 1.2, else 0.
# A locally seeded generator keeps features and labels identical on every run.
rng = np.random.default_rng(3)
X = rng.random((100, 10))
score = X[:, 2] + X[:, 5] + 0.3 * X[:, 8] + 0.2 * rng.random(100)
y = (score > 1.2).astype(int)

#### RF

In [8]:
# Random-forest classifier search with n_jobs=-1 passed through CVS_kwargs.
# The stored output below shows a pair of column indices and a CVBest record.
cls_search = search_combination_RF_cls(X, y, CVS_kwargs={'n_jobs': -1})
cls_search

((2, 5),
 CVBest(
     best_score=0.866540404040404, 
     best_estimator=RandomForestClassifier(max_depth=101, n_estimators=1000)
 ))

## Using the `Dataset` wrapper

In [11]:
# Reproducible regression data (columns 2, 5 and 8 drive the target), drawn
# from a locally seeded generator.
rng = np.random.default_rng(4)
X = rng.random((100, 10))
y = X[:, 2] + 0.5 * X[:, 5] + 0.3 * X[:, 8] + rng.random(100)

# Append the target as an 11th column so features and target live in one array.
data = np.c_[X, y]

# Wrap the array in a Dataset, naming the ten feature columns and the target.
dataset = Dataset(
    data,
    names=['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'y'])

In [6]:
# OLS search through the Dataset, addressing columns by name. Compared with
# the standalone function, the stored output below additionally starts with a
# formula string.
feature_columns = ['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9']
ols_pair = dataset.search_combination_OLS(feature_columns, 'y', 2)
ols_pair

('0.61  + 1.06 x2 + 0.56 x5',
 (2, 5),
 (<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x17637c63d60>,
  <function asa.linear_model.linear_model.get_OLS_nd.<locals>.func(X)>))

In [9]:
# Name-based OLS search with five components and metric='bic'.
# `allowe_small_n` is the keyword as spelled by the library API.
feature_columns = ['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9']
ols_bic = dataset.search_combination_OLS(feature_columns,
                                         'y',
                                         5,
                                         metric='bic',
                                         allowe_small_n=True)
ols_bic

('0.42  + 1.07 x2 + 0.58 x5 + 0.34 x8',
 (2, 5, 8),
 (<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x176389f3070>,
  <function asa.linear_model.linear_model.get_OLS_nd.<locals>.func(X)>))

In [12]:
# Random-forest search through the Dataset for a two-column combination,
# treated as a regression problem.
feature_columns = ['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9']
parallel_cv = {'n_jobs': -1}
rf_pair = dataset.search_combination_RF(feature_columns,
                                        'y',
                                        n_components=2,
                                        problem_type='regression',
                                        CVS_kwargs=parallel_cv)
rf_pair

('RF(x2, x5)',
 (2, 5),
 CVBest(
     best_score=-0.12322451580980445, 
     best_estimator=RandomForestRegressor(max_depth=5)
 ))

In [13]:
# Same Dataset random-forest search with drop_estimator=True; the stored output
# below shows best_estimator=None in the returned CVBest.
feature_columns = ['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9']
parallel_cv = {'n_jobs': -1}
rf_pair_no_estimator = dataset.search_combination_RF(feature_columns,
                                                     'y',
                                                     n_components=2,
                                                     problem_type='regression',
                                                     CVS_kwargs=parallel_cv,
                                                     drop_estimator=True)
rf_pair_no_estimator

('RF(x2, x5)',
 (2, 5),
 CVBest(
     best_score=-0.12526489379941472, 
     best_estimator=None
 ))