In [126]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import LabelEncoder # For creating labels for unimportant data
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
import category_encoders as ce # For categorizing important string values into numeric values

In [198]:
# Load data/final_data.csv into a pandas DataFrame
data = pd.read_csv('data/final_data.csv')

In [199]:
def fill_nan_based_on_types(series):
    """Replace missing values in one column with a type-dependent placeholder.

    Written to be passed to ``DataFrame.apply`` so it is called once per
    column. The rules are checked in this order:

    * column named ``'hex_color'``  -> missing values become ``'000000'``
    * object (text) dtype           -> missing values become ``'Not Available'``
    * integer dtype                 -> cast to ``float64``, then missing
                                       values become ``-1000.0``
    * float dtype                   -> missing values become ``-1000.0``
    * any other dtype               -> returned unchanged

    Args:
        series: A pandas Series (one DataFrame column).

    Returns:
        A Series with missing values replaced according to the rules above.
    """
    # The colour column gets its own sentinel regardless of dtype.
    if series.name == 'hex_color':
        return series.fillna('000000')

    dtype = series.dtype

    # Text columns
    if dtype == 'O':
        return series.fillna('Not Available')

    # Integer columns are promoted to float so they share the float sentinel.
    if pd.api.types.is_integer_dtype(dtype):
        return series.astype('float64').fillna(-1000.0)

    # Float columns
    if pd.api.types.is_float_dtype(dtype):
        return series.fillna(-1000.0)

    # Anything else (bool, datetime, ...) is passed through untouched.
    return series

In [200]:
# Fill missing values column by column, then replace the beer style with
# integer codes (one code per distinct style, in order of first appearance).
data = (
    data
    .apply(fill_nan_based_on_types)
    .assign(beer_style=lambda frame: pd.factorize(frame['beer_style'])[0])
)

In [201]:
# Keep only the columns used for modelling: the numeric descriptors,
# the encoded beer style, and the hex colour target.
clean_data = data[[
    'abv', 'hex_color', 'ABV', 'Min IBU', 'Max IBU',
    'Astringency', 'Body', 'Alcohol', 'Bitter', 'Sweet',
    'Sour', 'Salty', 'Fruits', 'Hoppy', 'Spices',
    'Malty', 'beer_style',
]]

In [202]:
# Filter the DataFrame for rows where no color code available
# Rows whose colour is the '000000' placeholder, i.e. no colour code was available
df_to_apply = clean_data[clean_data['hex_color'].eq('000000')]

# Rows that carry a real colour code
df_complete = clean_data[clean_data['hex_color'].ne('000000')]

#### Observation: we have 39 color classes to train with and 89 beer styles


```python
Colors: ['FFD878', '36080A', 'D77200', 'FFBF42', 'B54C00', 'F8A600',
       '952D00', 'C35900', 'FBB123', 'A13700', 'B04500', 'CB6200',
       'BB5100', '600903', '9B3200', 'DE7C00', '5E0B00', '660D00',
       'CF6900', 'EA8F00', 'E58500', '470606', 'FFE699', '3B0607',
       'F39C00', '7B1A00', '3F0708', 'FFCA5A', '882300', '520907',
       '8E2900', 'A63E00', '5A0A02', '6A0E00', '701400', '440607',
       '771900', '4C0505', '821E00']


print(f'We have {len(df_complete.hex_color.unique())} classes for the training')
df_complete.hex_color.unique()
```

### Now I split the rows with a known color into training and test sets:

## Tree

In [205]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report



# Feature matrix: every modelling column except the target.
X = df_complete[['abv', 'ABV', 'Min IBU', 'Max IBU', 'Astringency', 'Body',
       'Alcohol', 'Bitter', 'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy',
       'Spices', 'Malty', 'beer_style']]

# Target: the hex colour code, used as a class label.
y = df_complete['hex_color']

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
# Some classes in the test split are never predicted, which leaves their
# precision undefined. zero_division=0 reports 0.0 for those entries (the same
# value the default shows) without emitting an UndefinedMetricWarning per average.
classification_report_str = classification_report(y_test, y_pred, zero_division=0)


print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_str)

Accuracy: 0.70
Classification Report:
              precision    recall  f1-score   support

      36080A       0.81      0.96      0.88        23
      3B0607       0.79      1.00      0.88        11
      3F0708       0.33      0.33      0.33         3
      440607       0.00      0.00      0.00         1
      470606       1.00      1.00      1.00        27
      520907       1.00      1.00      1.00         2
      5A0A02       0.00      0.00      0.00         1
      5E0B00       1.00      1.00      1.00         8
      600903       0.08      0.04      0.05        25
      660D00       0.00      0.00      0.00         2
      701400       1.00      0.50      0.67         2
      771900       0.50      1.00      0.67         1
      7B1A00       0.00      0.00      0.00         4
      882300       0.50      0.64      0.56        11
      8E2900       0.71      0.77      0.74        13
      952D00       1.00      0.67      0.80        21
      9B3200       1.00      0.94      0.97

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [206]:
# Predict a colour for every row that has no colour code.
# Dropping the placeholder column leaves exactly the feature columns.
df_final = df_to_apply.drop(columns='hex_color')
pred_data = rf_model.predict(df_final)

# Put the predicted colours back at column position 1.
df_final.insert(1, "hex_color", pred_data)

In [212]:
# Show the encoded beer style column of the imputed rows
df_final['beer_style']

0         0
1         0
2         1
3         2
4         3
         ..
27561     3
27562    18
27563    66
27564    32
27565    17
Name: beer_style, Length: 23862, dtype: int64

In [193]:
# Write the rows with predicted colours to CSV (the DataFrame index is written
# as the first column, which is the to_csv default).
# NOTE(review): df_final is built from df_to_apply only, so rows that already
# had a colour (df_complete) are not part of this file — confirm that is intended.
df_final.to_csv('./data/dataset_with_colors.csv')

## TRASH — discarded neural-network experiment

In [8]:
# Small feed-forward network in PyTorch: Linear -> ReLU -> Linear
class HexColorPredictor(nn.Module):
    """Two-layer fully connected network with a ReLU in between.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        hidden_dim: Number of units in the hidden layer.
        output_dim: Size of the last dimension of the output tensor.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Layers are created in the order they are applied in forward().
        self.fc1 = nn.Linear(input_dim, hidden_dim)    # input  -> hidden
        self.relu = nn.ReLU()                          # element-wise max(0, x)
        self.fc2 = nn.Linear(hidden_dim, output_dim)   # hidden -> output

    def forward(self, x):
        """Run the forward pass.

        The input goes through the first linear layer, then ReLU, then the
        second linear layer. No activation is applied to the final output.

        Args:
            x: Input tensor whose last dimension is ``input_dim``.

        Returns:
            Tensor whose last dimension is ``output_dim``.
        """
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

In [9]:
# Network dimensions
# NOTE(review): X_train comes from the Random Forest cell, whose execution
# count is later than this cell's — confirm this runs on a fresh kernel.
input_dim = X_train.shape[1]  # one input unit per feature column
hidden_dim = 64
# Original intent: a hex color is represented by 7 characters.
# NOTE(review): the color codes listed in this notebook have 6 characters
# (e.g. 'FFD878'), so 7 presumably counts a leading '#' — confirm.
output_dim = 7

# Build the model
model = HexColorPredictor(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)

# Loss function (mean squared error) and optimizer (Adam)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)