In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the df_encodedset using Pandas
data = pd.read_csv('data/final_data.csv').drop('Unnamed: 0', axis=1)

In [3]:
def fill_nan_based_on_types(series):
    column_name = series.name  # Get the name of the current column
    if column_name == 'hex_color':
        return series.fillna('000000')  # Fill with hex color code
    return series

In [4]:
data.beer_style = data.beer_style.str.split(' / ')
data = data.explode('beer_style')
#data.beer_style = data.beer_style.astype(str)
#data.head(15)
data.beer_style = data.beer_style.str.join('')

In [5]:
# Possibility to percentualize values, does not affect ML algo tho

# List of columns to sum
# columns_to_sum = ['Astringency', 'Body', 'Alcohol', 'Bitter', 'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy', 'Spices', 'Malty']

# df = data
# Sum the values across the specified columns
#df['Total'] = df[columns_to_sum].sum(axis=1)

# Calculate the percentage of each column's total
# for col in columns_to_sum:
#     df[col] = (df[col] / df[col].sum(axis=0))
# Drop the 'Total' column if you don't need it anymore
#df.drop('Total', axis=1, inplace=True)
#df.drop('abv', axis=1, inplace=True)
#df.drop('hex_color', axis=1, inplace=True)

#df = pd.concat([df, df1["hex_color"]], axis=1)

# df = df.rename(columns={'name': 'beer_name'})
# df.rename(columns={df.columns[0]: 'id'}, inplace=True)
# df.columns = df.columns.str.replace(' ', '_')

In [6]:
data.keys()

Index(['name', 'description', 'abv', 'labels', 'hex_color', 'Ave Rating',
       'Brewery', 'ABV', 'Min IBU', 'Max IBU', 'Astringency', 'Body',
       'Alcohol', 'Bitter', 'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy',
       'Spices', 'Malty', 'beer_style'],
      dtype='object')

In [7]:
data = data.apply(fill_nan_based_on_types)
data.dropna(inplace=True)
codes, uniques = pd.factorize(data['beer_style'])
data['beer_style'] = codes

In [8]:
# Create translator to decode factors later
translator = dict()
for key in codes:
    translator[key] = uniques[key]
translator

{0: 'Winter Warmer',
 1: 'Belgian Pale Ale',
 2: 'American Wild Ale',
 3: 'Russian Imperial Stout',
 4: 'Pumpkin Ale',
 5: 'American Amber',
 6: 'Red Ale',
 7: 'English India Pale Ale (IPA)',
 8: 'American IPA',
 9: 'Light Lager',
 10: 'Red Lager',
 11: 'English Pale Mild Ale',
 12: 'Herbed',
 13: 'Spiced Beer',
 14: 'Scottish Ale',
 15: 'American Brown Ale',
 16: 'Dubbel',
 17: 'Saison',
 18: 'Farmhouse Ale',
 19: 'Weizenbock',
 20: 'American Dark Wheat Ale',
 21: 'Witbier',
 22: 'English Brown Ale',
 23: 'Fruit',
 24: 'Vegetable Beer',
 25: 'American Porter',
 26: 'Baltic Porter',
 27: 'Flanders Oud Bruin',
 28: 'Belgian IPA',
 29: 'American Pale Ale (APA)',
 30: 'English Bitter',
 31: 'Rauchbier',
 32: 'Tripel',
 33: 'American Pale Wheat Ale',
 34: 'American Double',
 35: 'Imperial IPA',
 36: 'Bock',
 37: 'Quadrupel (Quad)',
 38: 'Märzen',
 39: 'Oktoberfest',
 40: 'Vienna Lager',
 41: 'Chile Beer',
 42: 'American Barleywine',
 43: 'American Stout',
 44: 'Belgian Strong Dark Ale',
 4

In [9]:
# removed 'abv''ABV','Astringency', 'Body', 'Alcohol',
clean_data = data[['ABV', 'hex_color',  'Min IBU', 'Max IBU', 'Bitter', 'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy', 'Spices', 'Malty', 'beer_style']]
clean_data = clean_data.drop_duplicates()

In [10]:
# Filter the DataFrame for rows where no color code available
df_to_apply = clean_data[clean_data['hex_color'] == '000000']


# Filter the DataFrame for rows where color code available
df_complete = clean_data[clean_data['hex_color'] != '000000']

In [11]:
df_complete.drop_duplicates()

Unnamed: 0,ABV,hex_color,Min IBU,Max IBU,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty,beer_style
6,7.50,FFD878,5,30,10,53,138,4,79,20,1,20,2
7,7.50,FFD878,5,30,10,53,138,4,79,20,1,20,3
8,10.50,36080A,50,80,62,88,10,1,15,29,9,137,3
16,4.10,FFD878,8,12,9,10,7,2,6,16,1,42,9
17,4.90,D77200,18,30,28,117,5,0,5,36,1,109,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27453,5.10,E58500,5,45,31,73,38,0,74,50,2,28,24
27558,5.15,470606,15,25,41,74,9,0,12,39,2,142,22
27559,10.00,7B1A00,40,100,50,147,24,0,47,59,5,78,76
27566,5.00,3F0708,30,40,56,42,10,0,9,35,8,98,74


In [12]:
df_to_apply

Unnamed: 0,ABV,hex_color,Min IBU,Max IBU,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty,beer_style
0,5.0,000000,35,50,28,73,19,0,19,32,43,100,0
4,6.2,000000,20,30,25,34,47,2,45,34,41,24,1
10,7.0,000000,5,70,37,70,20,1,91,24,79,93,4
11,6.8,000000,25,45,71,55,34,0,45,94,16,61,5
11,6.8,000000,25,45,71,55,34,0,45,94,16,61,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27534,4.6,000000,5,45,7,79,17,0,67,10,1,24,24
27535,4.6,000000,5,45,7,79,17,0,67,10,1,24,58
27536,4.6,000000,5,45,7,79,17,0,67,10,1,24,29
27537,4.6,000000,5,45,7,79,17,0,67,10,1,24,33


#### Observation -> we have 39 Color Classes to train with and 103 beer styles  3635 rows


```python
Colors: ['FFD878', '36080A', 'D77200', 'FFBF42', 'B54C00', 'F8A600',
       '952D00', 'C35900', 'FBB123', 'A13700', 'B04500', 'CB6200',
       'BB5100', '600903', '9B3200', 'DE7C00', '5E0B00', '660D00',
       'CF6900', 'EA8F00', 'E58500', '470606', 'FFE699', '3B0607',
       'F39C00', '7B1A00', '3F0708', 'FFCA5A', '882300', '520907',
       '8E2900', 'A63E00', '5A0A02', '6A0E00', '701400', '440607',
       '771900', '4C0505', '821E00']


print(f'We have {len(df_complete.hex_color.unique())} classes for the training')
df_complete.hex_color.unique()
```

### Now I prepare the training set into training and test data: 

In [13]:
df_complete

Unnamed: 0,ABV,hex_color,Min IBU,Max IBU,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty,beer_style
6,7.50,FFD878,5,30,10,53,138,4,79,20,1,20,2
7,7.50,FFD878,5,30,10,53,138,4,79,20,1,20,3
8,10.50,36080A,50,80,62,88,10,1,15,29,9,137,3
16,4.10,FFD878,8,12,9,10,7,2,6,16,1,42,9
17,4.90,D77200,18,30,28,117,5,0,5,36,1,109,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27453,5.10,E58500,5,45,31,73,38,0,74,50,2,28,24
27558,5.15,470606,15,25,41,74,9,0,12,39,2,142,22
27559,10.00,7B1A00,40,100,50,147,24,0,47,59,5,78,76
27566,5.00,3F0708,30,40,56,42,10,0,9,35,8,98,74


In [14]:
df_complete.hex_color.value_counts(normalize=True)*100

hex_color
CF6900    9.023384
FFCA5A    7.867950
F8A600    7.757909
B54C00    5.749656
CB6200    5.006878
FFBF42    4.979367
DE7C00    4.924347
470606    4.759285
952D00    4.154058
B04500    3.933975
F39C00    3.851444
BB5100    3.658872
600903    3.218707
FBB123    3.108666
9B3200    3.108666
E58500    2.998624
A13700    2.943604
36080A    2.751032
8E2900    2.310867
D77200    2.035763
882300    1.815681
5E0B00    1.595598
FFD878    1.513067
3B0607    1.210454
EA8F00    1.100413
520907    1.072902
C35900    0.770289
A63E00    0.605227
771900    0.412655
7B1A00    0.385144
FFE699    0.302613
3F0708    0.247593
5A0A02    0.247593
701400    0.165062
660D00    0.137552
6A0E00    0.110041
440607    0.082531
821E00    0.055021
4C0505    0.027510
Name: proportion, dtype: float64

## Tree

In [15]:
# removed 'abv' 'ABV', 'Astringency', 'Body','Alcohol',
X = df_complete[['ABV',  'Min IBU', 'Max IBU', 'Bitter', 'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy',
       'Spices', 'Malty', 'beer_style']]

y = df_complete['hex_color']

# Split the data into training and testing sets (e.g., 80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a Random Forest Regressor
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)


print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_str)

Accuracy: 0.13
Classification Report:
              precision    recall  f1-score   support

      36080A       0.67      0.70      0.68        20
      3B0607       0.12      0.17      0.14         6
      3F0708       0.00      0.00      0.00         4
      440607       0.00      0.00      0.00         1
      470606       0.19      0.24      0.21        33
      4C0505       0.00      0.00      0.00         1
      520907       0.00      0.00      0.00         4
      5A0A02       1.00      0.33      0.50         3
      5E0B00       0.25      0.36      0.30        11
      600903       0.11      0.17      0.13        24
      660D00       1.00      0.50      0.67         2
      6A0E00       0.00      0.00      0.00         1
      701400       0.00      0.00      0.00         1
      771900       0.00      0.00      0.00         1
      7B1A00       0.00      0.00      0.00         1
      882300       0.07      0.12      0.09         8
      8E2900       0.65      0.79      0.71

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Make predictions on the testing set
pred_data = rf_model.predict(df_to_apply.drop('hex_color', axis=1))

df_final = df_to_apply.drop('hex_color', axis=1)
#df_final = pred_data
df_final.insert(1, "hex_color", pred_data)
df_final.beer_style = df_final.beer_style.map(translator)
df_complete.beer_style = df_complete.beer_style.map(translator)
X_train.beer_style = X_train.beer_style.map(translator)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_complete.beer_style = df_complete.beer_style.map(translator)


In [17]:
df_complete

Unnamed: 0,ABV,hex_color,Min IBU,Max IBU,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty,beer_style
6,7.50,FFD878,5,30,10,53,138,4,79,20,1,20,American Wild Ale
7,7.50,FFD878,5,30,10,53,138,4,79,20,1,20,Russian Imperial Stout
8,10.50,36080A,50,80,62,88,10,1,15,29,9,137,Russian Imperial Stout
16,4.10,FFD878,8,12,9,10,7,2,6,16,1,42,Light Lager
17,4.90,D77200,18,30,28,117,5,0,5,36,1,109,American Amber
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27453,5.10,E58500,5,45,31,73,38,0,74,50,2,28,Vegetable Beer
27558,5.15,470606,15,25,41,74,9,0,12,39,2,142,English Brown Ale
27559,10.00,7B1A00,40,100,50,147,24,0,47,59,5,78,American Strong Ale
27566,5.00,3F0708,30,40,56,42,10,0,9,35,8,98,Irish Dry Stout


In [18]:

# append the new row to the original DataFrame
df = pd.concat([df_final, df_complete, X_train], ignore_index=True)

In [19]:
#df.to_csv('./data/full_data_with_colors.csv')

In [20]:
#df_final.to_csv('./data/dataset_with_colors.csv')