Import Statements:

In [15]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from varclushi import VarClusHi
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold

Reading input Dataset:

In [16]:
df = pd.read_csv('./wine.csv', sep=';')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [17]:
def filter_missing_rate(df, threshold=0.15):
    missing_report = (
        df
        .isnull()
        .sum()
        .rename('MISSING_COUNT')
        .reset_index()
        .rename(columns={"index":"VARIABLE_NAME"})
        .assign(
            MISSING_PERCENTAGE = lambda x: x['MISSING_COUNT']/df.shape[0]
        )
    )
    required_columns = missing_report['VARIABLE_NAME'][
        missing_report['MISSING_PERCENTAGE']<=threshold
    ]
    return required_columns, df[required_columns]

selected_features, df = filter_missing_rate(df, threshold=0.15)
print(selected_features)
df.head()

0            fixed acidity
1         volatile acidity
2              citric acid
3           residual sugar
4                chlorides
5      free sulfur dioxide
6     total sulfur dioxide
7                  density
8                       pH
9                sulphates
10                 alcohol
11                 quality
Name: VARIABLE_NAME, dtype: object


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


Introducing a constant column for testing:

In [18]:
df['semi-constant column'] = (
    (['majority class'] * int(df.shape[0]*0.90))
    + (['minority class'] * (df.shape[0] - int(df.shape[0]*0.90)))
)
df['semi-constant column'].value_counts(normalize=True)

majority class    0.899937
minority class    0.100063
Name: semi-constant column, dtype: float64

In [19]:
def filter_constant_columns_new(df, threshold=0.90):
    '''
        This function will filter and remove columns with proportion of a value >= threshold value.
    '''
    columns_to_keep, columns_to_remove = [], []
    for column in df.columns:
        temp_proportions = df[column].value_counts(normalize=True)
        if temp_proportions.max() <= threshold:
            columns_to_keep.append(column)
        else:
            columns_to_remove.append(column)
    return columns_to_remove, df[columns_to_keep]

selected_features, df = filter_constant_columns_new(df, threshold=0.75)
print(selected_features)
df.head()

['semi-constant column']


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [20]:
def filter_constant_columns(df, threshold=0.05):
    '''
        This function will filter and remove columns that have variance below the specified threshold.
    '''
    constant_filter = VarianceThreshold(threshold=threshold).fit(df)
    filtered_df = pd.DataFrame(
        constant_filter.transform(df),
        columns=constant_filter.get_feature_names_out()
    )
    return constant_filter.get_feature_names_out(), filtered_df

selected_features, df = filter_constant_columns(df, threshold=0.15)
print(selected_features)
df.head()

['fixed acidity' 'residual sugar' 'free sulfur dioxide'
 'total sulfur dioxide' 'alcohol' 'quality']


Unnamed: 0,fixed acidity,residual sugar,free sulfur dioxide,total sulfur dioxide,alcohol,quality
0,7.4,1.9,11.0,34.0,9.4,5.0
1,7.8,2.6,25.0,67.0,9.8,5.0
2,7.8,2.3,15.0,54.0,9.8,5.0
3,11.2,1.9,17.0,60.0,9.8,6.0
4,7.4,1.9,11.0,34.0,9.4,5.0


In [21]:
x_train = df[[i for i in df.columns if i!= 'alcohol']]
y_train = df['alcohol']
x_train.head()

Unnamed: 0,fixed acidity,residual sugar,free sulfur dioxide,total sulfur dioxide,quality
0,7.4,1.9,11.0,34.0,5.0
1,7.8,2.6,25.0,67.0,5.0
2,7.8,2.3,15.0,54.0,5.0
3,11.2,1.9,17.0,60.0,6.0
4,7.4,1.9,11.0,34.0,5.0


In [22]:
def cluster_variables(
    df,
    top_n_features=1,
    maxeigenval2=1,
    max_clus=None,
    column_subset=None
):
    if column_subset is None:
        variable_cluster_obj = VarClusHi(
            df,
            maxeigval2=maxeigenval2,
            maxclus=max_clus
        )
    else:
        variable_cluster_obj = VarClusHi(
            df[column_subset],
            maxeigval2=maxeigenval2,
            maxclus=max_clus
        )

    variable_cluster_model = variable_cluster_obj.varclus()
    
    vc_rsquare = variable_cluster_model.rsquare
    vc_rsquare.sort_values(
        by=["Cluster", "RS_Own"], ascending=False, inplace=True
    )
    top_n_variables = (
        vc_rsquare.groupby(["Cluster"]).head(top_n_features).reset_index()
    )

    selected_features = list(set(top_n_variables["Variable"]))

    return selected_features, vc_rsquare

selected_features, variable_cluster_output = cluster_variables(x_train)
print(selected_features)
variable_cluster_output

['total sulfur dioxide', 'fixed acidity']


Unnamed: 0,Cluster,Variable,RS_Own,RS_NC,RS_Ratio
3,1,fixed acidity,0.562026,0.01089,0.442796
4,1,quality,0.562026,0.013157,0.443813
1,0,total sulfur dioxide,0.78666,0.039576,0.222131
0,0,free sulfur dioxide,0.777796,0.018593,0.226414
2,0,residual sugar,0.202428,0.007346,0.803474


In [23]:
df = df[list(selected_features)+['alcohol']]
x_train = x_train[list(selected_features)]
df.head()

Unnamed: 0,total sulfur dioxide,fixed acidity,alcohol
0,34.0,7.4,9.4
1,67.0,7.8,9.8
2,54.0,7.8,9.8
3,60.0,11.2,9.8
4,34.0,7.4,9.4


In [24]:
def clean_feature_importance(estimator, variable_names):
    feature_importance = (
        pd.DataFrame({
            'VARIABLE_NAME': variable_names,
            'FEATURE_IMPORTANCE': estimator.feature_importances_
        }).sort_values(by=['FEATURE_IMPORTANCE'], ascending=True)
        .reset_index(drop=True)
    )
    return feature_importance

def remove_zero_importances_regression(
    x_train,
    y_train,
    column_subset=None,
    verbose=True
):

    if column_subset != None:
        x_train = x_train[column_subset]

    estimator = xgb.XGBRegressor(seed=1024)
    base_model = estimator.fit(x_train, y_train)

    feature_importance = clean_feature_importance(
        estimator=base_model,
        variable_names = x_train.columns
    )

    selected_features = feature_importance['VARIABLE_NAME'].to_list()
    
    required_columns = x_train.columns

    for variable_index in range(len(required_columns)):
        if feature_importance['FEATURE_IMPORTANCE'].min() == 0:
            feature_importance = feature_importance[feature_importance['FEATURE_IMPORTANCE']>0]
            selected_features = feature_importance['VARIABLE_NAME'].tolist()
            base_model = estimator.fit(x_train[selected_features], y_train)
            feature_importance = clean_feature_importance(
                estimator=base_model,
                variable_names=selected_features
            )
            if verbose:
                print(feature_importance)

    feature_importance = clean_feature_importance(
        estimator=base_model,
        variable_names=selected_features
    )

    feature_importance = feature_importance[feature_importance['FEATURE_IMPORTANCE']>0]
    selected_features = feature_importance['VARIABLE_NAME'].tolist()

    print(feature_importance)

    return selected_features

selected_features = remove_zero_importances_regression(x_train, y_train)

          VARIABLE_NAME  FEATURE_IMPORTANCE
0  total sulfur dioxide            0.461162
1         fixed acidity            0.538838


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [25]:
def select_var_importance(
    x_train,
    y_train,
    x_test,
    y_test,
    column_subset=None
):
    if column_subset != None:
        x_train = x_train[column_subset]
        x_test = x_test[column_subset]

    selected_features = list(x_train.columns)
    model_features = selected_features
    estimator = xgb.XGBRegressor(seed=1024)
    selection_model = estimator.fit(x_train, y_train)
    base_model = selection_model
    thresholds = np.sort(selection_model.feature_importances_)
    
    output = []
    
    for threshold in thresholds:
        selection = SelectFromModel(selection_model, threshold=threshold, prefit=True)
        selected_x_train = selection.transform(x_train)
        selection_model = xgb.XGBRegressor(seed=1024)
        selection_model.fit(selected_x_train, y_train)
        selected_x_test = selection.transform(x_test)
        y_pred = selection_model.predict(selected_x_test)
        selected_error = mean_absolute_error(y_test, y_pred)
        temp_df = pd.DataFrame({
            "THRESHOLD": [threshold],
            "VARIABLE_COUNT": [selected_x_train.shape[1]],
            "ERROR": [selected_error]
        })
        output.append(temp_df)
    
    output = pd.merge(
        right=pd.concat(output, axis=0).reset_index(drop=True),
        left=clean_feature_importance(base_model, model_features),
        how='outer',
        left_index=True,
        right_index=True
    ).drop(columns=['THRESHOLD', 'VARIABLE_COUNT'])
    
    selected_threshold = (
        output['FEATURE_IMPORTANCE'][output['ERROR']==output['ERROR'].min()].iloc[0]
    )
    
    selected_features = set(output.loc[
        output['FEATURE_IMPORTANCE']>=selected_threshold
    ]['VARIABLE_NAME'])

    return list(selected_features), output

selected_features, output = select_var_importance(
    x_train,
    y_train,
    x_train,
    y_train,
    column_subset=None
)

print(selected_features)

output

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


['total sulfur dioxide', 'fixed acidity']


Unnamed: 0,VARIABLE_NAME,FEATURE_IMPORTANCE,ERROR
0,total sulfur dioxide,0.461162,0.367368
1,fixed acidity,0.538838,0.77333


In [26]:
x_train = x_train[selected_features]

In [27]:
def select_best_model(
    x_train,
    y_train,
    x_test,
    y_test,
    parameter_grid={
        'objective': ['reg:squarederror'],
        'learning_rate': [0.03, 0.05, 0.07]
    },
    column_subset=None,
    k_cv = 5
):
    if column_subset != None:
        x_train = x_train[column_subset]
        x_test = x_test[column_subset]

    estimator = xgb.XGBRegressor(seed=1024)

    gridsearch = GridSearchCV(
        estimator=estimator,
        param_grid=parameter_grid,
        cv=k_cv,
        n_jobs=-1,
        verbose=True
    )

    gridsearch.fit(x_train, y_train)

    return gridsearch.best_estimator_, gridsearch.best_params_

In [28]:
best_model, best_parameters = select_best_model(
    x_train,
    y_train,
    x_train,
    y_train,
)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [29]:
best_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.05, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=16,
             num_parallel_tree=1, predictor='auto', random_state=1024,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1024,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [30]:
best_parameters

{'learning_rate': 0.05, 'objective': 'reg:squarederror'}