# Recipe

This notebook defines the methodology for identifying the important features in the wine quality datasets.

## 1. Identify Important Features

In [None]:
def identyfy_three_important_features(evaluation_matrix_list):
    '''
    Pick the three features with the largest absolute coefficient from each
    evaluation matrix.

    Parameters
    ----------
    evaluation_matrix_list : iterable
        Each item must be indexable; item[0] is a DataFrame with the columns
        'Parameters', 'Coefficients' and 'df'.

    Returns
    -------
    list of dict
        One single-entry dict per item, in input order. The key is the value
        stored at label 1 of the 'df' column; the value is the list of up to
        three 'Parameters' names ordered by descending absolute coefficient.
        Fewer than three names are returned when fewer rows are available.

    The input DataFrames are not modified.
    '''
    top_features_per_matrix = []
    for evaluation in evaluation_matrix_list:
        matrix = evaluation[0]

        # The first row is excluded from the ranking (presumably the
        # intercept term -- confirm against the code that builds the matrix).
        # Work on a copy so the caller's frame keeps its signed coefficients.
        candidates = matrix.iloc[1:].copy()
        candidates['Coefficients'] = candidates['Coefficients'].abs()

        # Rank by magnitude: a large negative coefficient matters as much as
        # a large positive one.
        ranked = candidates.sort_values(by='Coefficients', ascending=False)
        top_three_features = ranked['Parameters'].head(3).tolist()

        # list.append returns None, so its result is deliberately not bound.
        top_features_per_matrix.append({matrix['df'][1]: top_three_features})

    return top_features_per_matrix

## 2. Finding the recipe for the good and poor wine

This section is dedicated to applying the methodologies or processes established earlier to specifically categorize and analyze good and poor quality wines.

In [None]:
# Rank the features of every evaluation matrix and keep the top three of each.
# The resulting list is what get_receipes later searches by pipeline name.
top_three_features_list = identyfy_three_important_features(evaluation_matrix_list)

In [None]:
# Columns passed to correct_reverse_log_transformation for the white-wine frames.
features_to_reverse_white = ['chlorides', 'volatile acidity']
# Columns passed to correct_reverse_log_transformation for the red-wine frames.
features_to_reverse_red = ['residual sugar', 'chlorides']

def correct_reverse_log_transformation(df, features):
    '''
    Replace each listed column of df with its element-wise exponential.

    Names in features that are not columns of df are skipped silently.
    The frame is modified in place and the same object is returned, so
    calling this twice on one frame applies exp() twice.
    '''
    present = [name for name in features if name in df.columns]
    for name in present:
        df[name] = np.exp(df[name])
    return df

# Apply exp() to the listed columns of the white-wine subsets.
# NOTE: the helper overwrites the columns in place, so re-running this cell
# applies exp() again to the same frames.
df_white_good_without_outliers = correct_reverse_log_transformation(df_white_good_without_outliers, features_to_reverse_white)
df_white_poor_without_outliers = correct_reverse_log_transformation(df_white_poor_without_outliers, features_to_reverse_white)

# Same for the red-wine subsets, which use their own column list.
df_red_good_without_outliers = correct_reverse_log_transformation(df_red_good_without_outliers, features_to_reverse_red)
df_red_poor_without_outliers = correct_reverse_log_transformation(df_red_poor_without_outliers, features_to_reverse_red)

In [None]:
def get_receipes(top_three_features_list, df_lists):
    '''
    Print a 'recipe' for every wine frame: one "mean - std" to "mean + std"
    range per top feature.

    top_three_features_list is a list of dicts keyed by pipeline name, each
    mapping to a list of feature names. For every frame in df_lists the key
    "Pipeline_<get_wine_str(frame)>" is looked up in each dict; every dict
    that contains it produces one printed recipe. Nothing is returned.
    '''
    for frame in df_lists:
        pipeline_key = f"Pipeline_{get_wine_str(frame)}"
        for entry in top_three_features_list:
            # Skip results that belong to a different pipeline.
            if pipeline_key not in entry:
                continue

            selected = entry[pipeline_key]
            numeric_part = frame[check_numeric_columns(frame)]

            center = numeric_part[selected].mean()
            spread = numeric_part[selected].std()

            # One (lower, upper) pair per feature: one standard deviation
            # either side of the mean.
            bounds = {}
            for name in selected:
                bounds[name] = (center[name] - spread[name], center[name] + spread[name])

            print(f"\nRecipes of {get_wine_str(frame)} wine:")
            for name, (lower, upper) in bounds.items():
                print(f"{name.capitalize()}: {lower:.2f} - {upper:.2f}")


In [None]:
# Print the recipe ranges for each frame in wine_quality_without_outliers_dfs.
get_receipes(
    top_three_features_list=top_three_features_list,
    df_lists=wine_quality_without_outliers_dfs,
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Function to train Random Forest and get feature importances
def get_feature_importances(dataframe):
    '''
    Fit a RandomForestClassifier that predicts 'quality' from the other
    numeric columns and report how much each column contributed.

    Returns a tuple (importances, feature_names): the fitted model's
    feature_importances_ array and the list of column names it was trained
    on, in matching order. random_state is fixed at 42.
    '''
    feature_names = dataframe.select_dtypes(include=np.number).columns.tolist()
    # The target must not be part of the predictors; it is only absent from
    # the list when 'quality' is not a numeric column.
    try:
        feature_names.remove('quality')
    except ValueError:
        pass

    forest = RandomForestClassifier(random_state=42).fit(
        dataframe[feature_names],
        dataframe['quality'],
    )
    return forest.feature_importances_, feature_names

# Load your dataframes here
# df_red_good_without_outliers, df_red_poor_without_outliers, df_white_good_without_outliers, df_white_poor_without_outliers

# Fit one model per subset; each call returns (importances, feature names)
# in matching order.
good_red_importances, good_red_columns = get_feature_importances(df_red_good_without_outliers)
poor_red_importances, poor_red_columns = get_feature_importances(df_red_poor_without_outliers)
good_white_importances, good_white_columns = get_feature_importances(df_white_good_without_outliers)
poor_white_importances, poor_white_columns = get_feature_importances(df_white_poor_without_outliers)

# One row of four bar charts; sharey=True puts all panels on the same
# importance scale. Panel order: good red, poor red, good white, poor white.
fig, axes = plt.subplots(1, 4, figsize=(20, 5), sharey=True)
axes[0].bar(good_red_columns, good_red_importances)
axes[1].bar(poor_red_columns, poor_red_importances)
axes[2].bar(good_white_columns, good_white_importances)
axes[3].bar(poor_white_columns, poor_white_importances)

# Decorate each panel. The lists below follow the same order as the bars
# above, so the title can be derived from the panel index alone:
# even i -> 'Good', odd i -> 'Poor'; i < 2 -> 'Red', otherwise 'White'.
for i, df, columns in zip(range(4), 
                          [df_red_good_without_outliers, df_red_poor_without_outliers, df_white_good_without_outliers, df_white_poor_without_outliers], 
                          [good_red_columns, poor_red_columns, good_white_columns, poor_white_columns]):
    axes[i].set_title(f"{'Good' if i%2 == 0 else 'Poor'} {'Red' if i < 2 else 'White'} Wine")
    # Fix the tick positions first, then attach the rotated column names.
    axes[i].set_xticks(range(len(columns)))
    axes[i].set_xticklabels(columns, rotation=45, ha="right")
    # Row count of the subset, placed at the top centre of the axes.
    axes[i].annotate(f"n = {len(df)}", xy=(0.5, 1), xycoords='axes fraction', ha='center')

plt.tight_layout()
plt.show()