In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load the df_encodedset using Pandas
data = pd.read_csv('data/final_data.csv')

In [4]:
def fill_nan_based_on_types(series):
    column_name = series.name  # Get the name of the current column
    if column_name == 'hex_color':
        return series.fillna('000000')  # Fill with hex color code
    # elif series.dtype == 'O':  # Check if dtype is object (text)
    #     return series.fillna('Not Available')  # Placeholder for object dtype
    # elif pd.api.types.is_integer_dtype(series.dtype):  # Check if dtype is integer
    #     return series.astype('float64').fillna(-1000.0)
    # elif pd.api.types.is_float_dtype(series.dtype):  # Check if dtype is integer
    #     return series.fillna(-1000.0)  # Placeholder for integer dtype
    return series

In [5]:
data = data.apply(fill_nan_based_on_types)
data.dropna(inplace=True)
codes, uniques = pd.factorize(data['beer_style'])
data['beer_style'] = codes

In [6]:
# Create translator to decode factors later
translator = dict()
for key in codes:
    translator[key] = uniques[key]
translator

{0: 'Winter Warmer',
 1: 'Belgian Pale Ale',
 2: 'American Wild Ale',
 3: 'Russian Imperial Stout',
 4: 'Pumpkin Ale',
 5: 'American Amber / Red Ale',
 6: 'English India Pale Ale (IPA)',
 7: 'American IPA',
 8: 'Light Lager',
 9: 'American Amber / Red Lager',
 10: 'English Pale Mild Ale',
 11: 'Herbed / Spiced Beer',
 12: 'Scottish Ale',
 13: 'American Brown Ale',
 14: 'Dubbel',
 15: 'Saison / Farmhouse Ale',
 16: 'Weizenbock',
 17: 'American Dark Wheat Ale',
 18: 'Witbier',
 19: 'English Brown Ale',
 20: 'Fruit / Vegetable Beer',
 21: 'American Porter',
 22: 'Baltic Porter',
 23: 'Flanders Oud Bruin',
 24: 'Belgian IPA',
 25: 'American Pale Ale (APA)',
 26: 'English Bitter',
 27: 'Rauchbier',
 28: 'Tripel',
 29: 'American Pale Wheat Ale',
 30: 'American Double / Imperial IPA',
 31: 'Bock',
 32: 'Quadrupel (Quad)',
 33: 'Märzen / Oktoberfest',
 34: 'Vienna Lager',
 35: 'Chile Beer',
 36: 'American Barleywine',
 37: 'American Stout',
 38: 'Belgian Strong Dark Ale',
 39: 'American Black 

In [7]:
clean_data = data[['abv', 'hex_color', 'ABV', 'Min IBU', 'Max IBU', 'Astringency', 'Body', 'Alcohol', 'Bitter', 'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy', 'Spices', 'Malty', 'beer_style']]

In [8]:
# Filter the DataFrame for rows where no color code available
df_to_apply = clean_data[clean_data['hex_color'] == '000000']


# Filter the DataFrame for rows where color code available
df_complete = clean_data[clean_data['hex_color'] != '000000']

#### Observation -> we have 39 Color Classes to train with and 89 beer styles


```python
Colors: ['FFD878', '36080A', 'D77200', 'FFBF42', 'B54C00', 'F8A600',
       '952D00', 'C35900', 'FBB123', 'A13700', 'B04500', 'CB6200',
       'BB5100', '600903', '9B3200', 'DE7C00', '5E0B00', '660D00',
       'CF6900', 'EA8F00', 'E58500', '470606', 'FFE699', '3B0607',
       'F39C00', '7B1A00', '3F0708', 'FFCA5A', '882300', '520907',
       '8E2900', 'A63E00', '5A0A02', '6A0E00', '701400', '440607',
       '771900', '4C0505', '821E00']


print(f'We have {len(df_complete.hex_color.unique())} classes for the training')
df_complete.hex_color.unique()
```

### Now I prepare the training set into training and test data: 

## Tree

In [9]:
X = df_complete[['abv', 'ABV', 'Min IBU', 'Max IBU', 'Astringency', 'Body',
       'Alcohol', 'Bitter', 'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy',
       'Spices', 'Malty', 'beer_style']]

y = df_complete['hex_color']

# Split the data into training and testing sets (e.g., 80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a Random Forest Regressor
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)


print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_str)

Accuracy: 0.69
Classification Report:
              precision    recall  f1-score   support

      36080A       0.85      0.88      0.86        25
      3B0607       0.75      0.86      0.80         7
      3F0708       1.00      1.00      1.00         1
      470606       0.93      0.96      0.95        27
      4C0505       0.00      0.00      0.00         1
      520907       0.75      1.00      0.86         6
      5A0A02       0.00      0.00      0.00         1
      5E0B00       0.75      1.00      0.86         6
      600903       0.07      0.04      0.05        23
      660D00       0.00      0.00      0.00         1
      6A0E00       0.00      0.00      0.00         2
      701400       0.00      0.00      0.00         0
      771900       1.00      1.00      1.00         4
      7B1A00       0.00      0.00      0.00         1
      882300       0.67      0.50      0.57        16
      8E2900       1.00      0.86      0.92        14
      952D00       0.76      0.73      0.74

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# Make predictions on the testing set
pred_data = rf_model.predict(df_to_apply.drop('hex_color', axis=1))

df_final = df_to_apply.drop('hex_color', axis=1)
#df_final = pred_data
df_final.insert(1, "hex_color", pred_data)
df_final.beer_style = df_final.beer_style.map(translator)
df_complete.beer_style = df_complete.beer_style.map(translator)
X_train.beer_style = X_train.beer_style.map(translator)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_complete.beer_style = df_complete.beer_style.map(translator)


In [19]:
df_complete

Unnamed: 0,abv,hex_color,ABV,Min IBU,Max IBU,Astringency,Body,Alcohol,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty,beer_style
6,7.5,FFD878,7.50,5,30,42,29,7,10,53,138,4,79,20,1,20,American Wild Ale
7,7.5,FFD878,7.50,5,30,42,29,7,10,53,138,4,79,20,1,20,Russian Imperial Stout
8,10.5,36080A,10.50,50,80,4,101,27,62,88,10,1,15,29,9,137,Russian Imperial Stout
16,4.1,FFD878,4.10,8,12,16,24,5,9,10,7,2,6,16,1,42,Light Lager
17,4.9,D77200,4.90,18,30,10,30,8,28,117,5,0,5,36,1,109,American Amber / Red Lager
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27453,5.1,E58500,5.10,5,45,22,23,1,31,73,38,0,74,50,2,28,Fruit / Vegetable Beer
27558,5.4,470606,5.15,15,25,10,62,11,41,74,9,0,12,39,2,142,English Brown Ale
27559,9.9,7B1A00,10.00,40,100,3,39,38,50,147,24,0,47,59,5,78,American Strong Ale
27566,4.2,3F0708,5.00,30,40,23,78,8,56,42,10,0,9,35,8,98,Irish Dry Stout


In [20]:

# append the new row to the original DataFrame
df = pd.concat([df_final, df_complete, X_train], ignore_index=True)

In [21]:
df.to_csv('./data/full_data_with_colors.csv')

In [14]:
df_final.to_csv('./data/dataset_with_colors.csv')