## **A**utomated **L**earning for **I**nsightful **C**omparison and **E**valuation - (ALICE)

In [1]:
import numpy as np
import pandas as pd
import os
# Directory the kernel is currently running in (not referenced again in the cells shown here)
cur_dir = os.getcwd()

In [5]:
# Demo data: scikit-learn's California housing set, used only to exercise the package
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing()

# Wrap the raw arrays in labelled frames, using the names shipped with the dataset
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.DataFrame(data.target, columns=data.target_names)

# Column-wise concatenation: the feature columns first, the target column last
df = pd.concat([X, y], axis="columns")

In [6]:
# Preview the combined frame; the rendered table shows the first and last rows with the middle elided
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [8]:
# Build a binary-label version of the data for the classification demo.
# The threshold is the mean of the continuous target.
mean_target = df['MedHouseVal'].mean()

# AboveMean is 1 when the house value is at or above the mean, 0 otherwise;
# the continuous target is then removed so only the binary label remains.
# `assign` and `drop` both return new frames, so `df` itself is left untouched.
df_discrete = (
    df
    .assign(AboveMean=(df['MedHouseVal'] >= mean_target).astype(int))
    .drop(columns='MedHouseVal')
)

In [9]:
# Can just import entire module
import alice

In [10]:
# Import regression metrics
from alice.metrics.regress import mse, rmse, mae
# Import classification metrics
from alice.metrics.classify import accuracy, precision, recall, f1
# Import regression agreeability metric
from alice.agreeability.regress import pearson
# Import classification agreeability metric
from alice.agreeability.classify import cohen_kappa


In [11]:
# Import our demo search algorithm 
from alice.search_and_compare.sequential import BackEliminator

### Check functionality on a regression task

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Check column dtypes and non-null counts before splitting into features and target
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [14]:
# Regression setup: every column except the target is a predictor,
# and the continuous MedHouseVal column is the target
X = df.drop(columns='MedHouseVal')
y = df['MedHouseVal']


In [15]:
# Hold out 20% of the rows as a validation set; the fixed random_state makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=66)

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [17]:
# The two regressors to compare: a linear baseline and a decision tree.
m1 = LinearRegression()
# Seed the tree. scikit-learn permutes the features at every split, so without a fixed
# random_state equally good splits can be chosen differently from run to run and the
# scores reported below would not be reproducible. 66 matches the seed used for the
# train/validation split.
m2 = DecisionTreeRegressor(random_state=66)

In [18]:
# Set up the search object for the regression task.
# NOTE(review): the exact meaning of each argument is defined by BackEliminator in
# alice.search_and_compare.sequential, which is not visible here — confirm against that module.
seeker = BackEliminator(
    X=X_train,  # training features from the split above
    y=y_train,  # training target from the split above
    validation_data=(X_val, y_val),  # held-out pair from the same split
    task_type='regression',
    criterion='rmse',  # shows up as "RMSE" in the printed log and as "... rmse" result keys
    agreeability='pearson'  # shows up as "Agreeability Coefficient (pearson)" in the log
)

In [19]:
# Run the comparison of the two regressors. Progress is printed as it goes (initial
# full-feature run, then numbered iterations) and the returned list of per-step
# dictionaries is kept for inspection in the following cells.
results = seeker.compare_all_models(
    m1=m1,
    m2=m2
)

Initial run: fitted both models with full feature set.
------------------------------------------------------------------------------------------------------------------------------------------------------
Model 1 included: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']. RMSE: 0.7315
Model 2 included: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']. RMSE: 0.7272
------------------------------------------------------------------------------------------------------------------------------------------------------
Agreeability Coefficient (pearson): 0.7609
Iteration 1:
------------------------------------------------------------------------------------------------------------------------------------------------------
Results from best models:
Best Model 1 included: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'Latitude', 'Longitude']. RMSE: 0.7243
Best Model 2 included: [

In [20]:
# Inspect the raw result list: one dictionary per step, keyed by "Best: ..." / "All: ..." labels
results

[{'Best: M1 Included Features': ['MedInc',
   'HouseAge',
   'AveRooms',
   'AveBedrms',
   'Population',
   'AveOccup',
   'Latitude',
   'Longitude'],
  'Best: M1 rmse': 0.7315223258924328,
  'Best: M2 Included Features': ['MedInc',
   'HouseAge',
   'AveRooms',
   'AveBedrms',
   'Population',
   'AveOccup',
   'Latitude',
   'Longitude'],
  'Best: M2 rmse': 0.7271857510834681,
  'Best: Agreeability (pearson)': 0.7609497068607372,
  'All: M1 Mean rmse': 0.7315223258924328,
  'All: M1 STD rmse': 0,
  'All: M2 Mean rmse': 0.7271857510834681,
  'All: M2 STD rmse': 0,
  'All: Mean Agreeability (pearson)': 0.7609497068607372,
  'All: Agreeability St. Dev.': 0},
 {'Best: M1 Included Features': ['MedInc',
   'HouseAge',
   'AveRooms',
   'AveBedrms',
   'Population',
   'Latitude',
   'Longitude'],
  'Best: M1 rmse': 0.7243210543253327,
  'Best: M2 Included Features': ['MedInc',
   'HouseAge',
   'AveRooms',
   'AveBedrms',
   'Population',
   'Latitude',
   'Longitude'],
  'Best: M2 rmse'

In [21]:
# The per-model scores are exposed as plain attributes on the seeker (attribute access, not a call).
# NOTE(review): their structure is not displayed in this notebook — check BackEliminator for details.
scores_m1 = seeker.scores_m1
scores_m2 = seeker.scores_m2


In [22]:
# Display the results list again (the same object already shown two cells earlier)
results

[{'Best: M1 Included Features': ['MedInc',
   'HouseAge',
   'AveRooms',
   'AveBedrms',
   'Population',
   'AveOccup',
   'Latitude',
   'Longitude'],
  'Best: M1 rmse': 0.7315223258924328,
  'Best: M2 Included Features': ['MedInc',
   'HouseAge',
   'AveRooms',
   'AveBedrms',
   'Population',
   'AveOccup',
   'Latitude',
   'Longitude'],
  'Best: M2 rmse': 0.7271857510834681,
  'Best: Agreeability (pearson)': 0.7609497068607372,
  'All: M1 Mean rmse': 0.7315223258924328,
  'All: M1 STD rmse': 0,
  'All: M2 Mean rmse': 0.7271857510834681,
  'All: M2 STD rmse': 0,
  'All: Mean Agreeability (pearson)': 0.7609497068607372,
  'All: Agreeability St. Dev.': 0},
 {'Best: M1 Included Features': ['MedInc',
   'HouseAge',
   'AveRooms',
   'AveBedrms',
   'Population',
   'Latitude',
   'Longitude'],
  'Best: M1 rmse': 0.7243210543253327,
  'Best: M2 Included Features': ['MedInc',
   'HouseAge',
   'AveRooms',
   'AveBedrms',
   'Population',
   'Latitude',
   'Longitude'],
  'Best: M2 rmse'

In [23]:
# Tabulate the search results as a DataFrame: one row per step, one column per result key
results_df = seeker.dataframe_from_results()

In [24]:
# Show the tabulated regression results
results_df

Unnamed: 0,Best: M1 Included Features,Best: M1 rmse,Best: M2 Included Features,Best: M2 rmse,Best: Agreeability (pearson),All: M1 Mean rmse,All: M1 STD rmse,All: M2 Mean rmse,All: M2 STD rmse,All: Mean Agreeability (pearson),All: Agreeability St. Dev.
0,"[MedInc, HouseAge, AveRooms, AveBedrms, Popula...",0.731522,"[MedInc, HouseAge, AveRooms, AveBedrms, Popula...",0.727186,0.76095,0.731522,0.0,0.727186,0.0,0.76095,0.0
1,"[MedInc, HouseAge, AveRooms, AveBedrms, Popula...",0.724321,"[MedInc, HouseAge, AveRooms, AveBedrms, Popula...",0.67469,0.772485,0.77273,0.058713,0.73859,0.056807,0.714518,0.077238
2,"[MedInc, HouseAge, AveRooms, AveBedrms, Latitu...",0.724321,"[MedInc, AveRooms, AveBedrms, Population, Lati...",0.676471,0.778097,0.76988,0.058396,0.751512,0.099926,0.721503,0.076749
3,"[MedInc, HouseAge, AveBedrms, Latitude, Longit...",0.729908,"[AveRooms, AveBedrms, Population, Latitude, Lo...",0.645674,0.746255,0.777709,0.05971,0.756671,0.136514,0.712632,0.089246
4,"[MedInc, HouseAge, Latitude, Longitude]",0.734025,"[AveRooms, AveBedrms, Latitude, Longitude]",0.63478,0.742479,0.822217,0.105224,0.871123,0.243695,0.571158,0.170837
5,"[MedInc, Latitude, Longitude]",0.741958,"[AveRooms, Latitude, Longitude]",0.641112,0.722854,0.845597,0.106522,0.921943,0.24483,0.526461,0.163912
6,"[MedInc, Latitude]",0.830187,"[Latitude, Longitude]",0.632067,0.59915,0.897744,0.089312,1.019976,0.278742,0.442169,0.148543
7,[MedInc],0.839073,[Longitude],1.00587,0.326271,1.0013,0.162228,1.025579,0.01971,0.307457,0.018813


In [25]:
# Visualise the regression search results with the seeker's built-in plotting helper
seeker.plot_from_results()

### Check functionality on a classification task

In [26]:
# Check the discretized frame: the same eight float features plus the integer AboveMean label
df_discrete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   AboveMean   20640 non-null  int64  
dtypes: float64(8), int64(1)
memory usage: 1.4 MB


In [27]:
# Classification setup: predictors are every column except the binary label.
# X and y are rebound here, replacing the regression versions defined earlier.
X = df_discrete.drop(columns='AboveMean')
y = df_discrete['AboveMean']


In [28]:
# Same 80/20 split and seed as the regression task; this rebinds the earlier train/validation variables
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=66)

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [30]:
# The two classifiers to compare: logistic regression (liblinear solver) and a decision tree.
m1 = LogisticRegression(solver='liblinear')
# Seed the tree. scikit-learn permutes the features at every split, so without a fixed
# random_state equally good splits can be chosen differently from run to run and the
# scores reported below would not be reproducible. 66 matches the seed used for the
# train/validation split.
m2 = DecisionTreeClassifier(random_state=66)

In [31]:
# Set up a second search object, this time for the classification task.
# NOTE(review): the exact meaning of each argument is defined by BackEliminator in
# alice.search_and_compare.sequential, which is not visible here — confirm against that module.
seeker_2 = BackEliminator(
    X=X_train,  # training features from the classification split
    y=y_train,  # binary AboveMean labels for the training rows
    validation_data=(X_val, y_val),  # held-out pair from the same split
    task_type='classification',
    criterion='f1',  # shows up as "F1" in the printed log and as "... f1" result keys
    agreeability='cohen_kappa'  # shows up as "Agreeability Coefficient (cohen_kappa)" in the log
)

In [32]:
# Run the comparison of the two classifiers. Progress is printed as it goes (initial
# full-feature run, then numbered iterations) and the returned list of per-step
# dictionaries is kept for inspection in the following cells.
results_2 = seeker_2.compare_all_models(
    m1=m1,
    m2=m2
)

Initial run: fitted both models with full feature set.
------------------------------------------------------------------------------------------------------------------------------------------------------
Model 1 included: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']. F1: 0.7787
Model 2 included: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']. F1: 0.7937
------------------------------------------------------------------------------------------------------------------------------------------------------
Agreeability Coefficient (cohen_kappa): 0.6388
Iteration 1:
------------------------------------------------------------------------------------------------------------------------------------------------------
Results from best models:
Best Model 1 included: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'AveOccup', 'Latitude', 'Longitude']. F1: 0.7816
Best Model 2 included: ['Hou

In [33]:
# Inspect the raw classification result list: one dictionary per step
results_2

[{'Best: M1 Included Features': ['MedInc',
   'HouseAge',
   'AveRooms',
   'AveBedrms',
   'Population',
   'AveOccup',
   'Latitude',
   'Longitude'],
  'Best: M1 f1': 0.7787234042553192,
  'Best: M2 Included Features': ['MedInc',
   'HouseAge',
   'AveRooms',
   'AveBedrms',
   'Population',
   'AveOccup',
   'Latitude',
   'Longitude'],
  'Best: M2 f1': 0.793725954424386,
  'Best: Agreeability (cohen_kappa)': 0.638759480434029,
  'All: M1 Mean f1': 0.7787234042553192,
  'All: M1 STD f1': 0,
  'All: M2 Mean f1': 0.793725954424386,
  'All: M2 STD f1': 0,
  'All: Mean Agreeability (cohen_kappa)': 0.638759480434029,
  'All: Agreeability St. Dev.': 0},
 {'Best: M1 Included Features': ['MedInc',
   'HouseAge',
   'AveRooms',
   'AveBedrms',
   'AveOccup',
   'Latitude',
   'Longitude'],
  'Best: M1 f1': 0.7815533980582524,
  'Best: M2 Included Features': ['HouseAge',
   'AveRooms',
   'AveBedrms',
   'Population',
   'AveOccup',
   'Latitude',
   'Longitude'],
  'Best: M2 f1': 0.81315396

In [34]:
# Tabulate the classification results as a DataFrame: one row per step, one column per result key
results_df_2 = seeker_2.dataframe_from_results()

In [35]:
# Show the tabulated classification results
results_df_2

Unnamed: 0,Best: M1 Included Features,Best: M1 f1,Best: M2 Included Features,Best: M2 f1,Best: Agreeability (cohen_kappa),All: M1 Mean f1,All: M1 STD f1,All: M2 Mean f1,All: M2 STD f1,All: Mean Agreeability (cohen_kappa),All: Agreeability St. Dev.
0,"[MedInc, HouseAge, AveRooms, AveBedrms, Popula...",0.778723,"[MedInc, HouseAge, AveRooms, AveBedrms, Popula...",0.793726,0.638759,0.778723,0.0,0.793726,0.0,0.638759,0.0
1,"[MedInc, HouseAge, AveRooms, AveBedrms, AveOcc...",0.781553,"[HouseAge, AveRooms, AveBedrms, Population, Av...",0.813154,0.580804,0.75701,0.031662,0.787283,0.022223,0.589679,0.056107
2,"[MedInc, HouseAge, AveRooms, AveOccup, Latitud...",0.780797,"[HouseAge, AveRooms, Population, AveOccup, Lat...",0.825853,0.580835,0.760636,0.026681,0.784259,0.047125,0.522839,0.057829
3,"[MedInc, HouseAge, AveOccup, Latitude, Longitude]",0.776399,"[HouseAge, AveRooms, Population, Latitude, Lon...",0.832384,0.574129,0.743861,0.045557,0.778355,0.052895,0.493417,0.079644
4,"[MedInc, AveOccup, Latitude, Longitude]",0.767023,"[HouseAge, AveRooms, Latitude, Longitude]",0.825283,0.55499,0.71762,0.073182,0.75252,0.081795,0.417598,0.158884
5,"[MedInc, AveOccup, Latitude]",0.745134,"[HouseAge, Latitude, Longitude]",0.829415,0.49292,0.68527,0.076862,0.7409,0.085175,0.368375,0.162134
6,"[MedInc, AveOccup]",0.738155,"[Latitude, Longitude]",0.839914,0.509477,0.615847,0.137868,0.665545,0.124074,0.257245,0.185601
7,[MedInc],0.691214,[Longitude],0.615832,0.220298,0.527757,0.163457,0.608412,0.00742,0.141172,0.079127


In [36]:
# Visualise the classification search results with the seeker's built-in plotting helper
seeker_2.plot_from_results()

### Working Code for 3D Plot

```python

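# `go` is plotly's graph_objects module
import plotly.graph_objects as go

# NOTE: `df` below is assumed to be a results table (model scores in columns 1 and 3,
# agreeability in column 4) with added 'Summary_M1' / 'Summary_M2' hover-text columns.

# Create a 3D scatter plot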
fig = go.Figure(data=[
    go.Scatter3d(
        x=df.index + 1,
        y=df.iloc[:, 4],
        z=df.iloc[:, 1], 
        mode='lines+markers',
        name=f'{df.columns[1]}',
        text=df['Summary_M1'],
        hoverinfo='text'
    ),
    go.Scatter3d(
        x=df.index + 1,
        y=df.iloc[:, 4],
        z=df.iloc[:, 3], 
        mode='lines+markers',
        name=f'{df.columns[3]}',
        text=df['Summary_M2'],
        hoverinfo='text'
    )
])

# Update layout
fig.update_layout(
    title='Agreeability Coefficients and Model Scores Over Algorithm Iterations',
    scene=dict(
        xaxis_title='Iteration',
        yaxis_title='Agreeability',
        zaxis_title='Model Scores'
    ),
    hovermode='closest'
)

# Show the plot
fig.show()

```

### Dependencies

In [30]:
import numpy as np
import pandas as pd
import statsmodels
import sklearn
import scipy
import plotly
import matplotlib
import seaborn as sns
import tensorflow as tf


In [32]:
# Report the installed version of each dependency, one "name: version" line per package,
# in the same order as the import cell above
for label, module in (
    ('pandas', pd),
    ('numpy', np),
    ('statsmodels', statsmodels),
    ('sklearn', sklearn),
    ('scipy', scipy),
    ('plotly', plotly),
    ('matplotlib', matplotlib),
    ('seaborn', sns),
    ('tensorflow', tf),
):
    print(f'{label}: {module.__version__}')


pandas: 1.5.3
numpy: 1.20.3
statsmodels: 0.13.5
sklearn: 1.2.2
scipy: 1.10.0
plotly: 5.18.0
matplotlib: 3.3.4
seaborn: 0.11.1
tensorflow: 2.10.1
