In [214]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [215]:
# Load the df_encodedset using Pandas
data = pd.read_csv('data/final_data.csv').drop('Unnamed: 0', axis=1)

In [216]:
def fill_nan_based_on_types(series):
    column_name = series.name  # Get the name of the current column
    if column_name == 'hex_color':
        return series.fillna('000000')  # Fill with hex color code
    # elif series.dtype == 'O':  # Check if dtype is object (text)
    #     return series.fillna('Not Available')  # Placeholder for object dtype
    # elif pd.api.types.is_integer_dtype(series.dtype):  # Check if dtype is integer
    #     return series.astype('float64').fillna(-1000.0)
    # elif pd.api.types.is_float_dtype(series.dtype):  # Check if dtype is integer
    #     return series.fillna(-1000.0)  # Placeholder for integer dtype
    return series

In [217]:
data.beer_style = data.beer_style.str.split(' / ')
data = data.explode('beer_style')
#data.beer_style = data.beer_style.astype(str)
#data.head(15)
data.beer_style = data.beer_style.str.join('')

In [218]:
# Possibility to percentualize values, does not affect ML algo tho

# List of columns to sum
# columns_to_sum = ['Astringency', 'Body', 'Alcohol', 'Bitter', 'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy', 'Spices', 'Malty']

# df = data
# Sum the values across the specified columns
#df['Total'] = df[columns_to_sum].sum(axis=1)

# Calculate the percentage of each column's total
# for col in columns_to_sum:
#     df[col] = (df[col] / df[col].sum(axis=0))
# Drop the 'Total' column if you don't need it anymore
#df.drop('Total', axis=1, inplace=True)
#df.drop('abv', axis=1, inplace=True)
#df.drop('hex_color', axis=1, inplace=True)

#df = pd.concat([df, df1["hex_color"]], axis=1)

# df = df.rename(columns={'name': 'beer_name'})
# df.rename(columns={df.columns[0]: 'id'}, inplace=True)
# df.columns = df.columns.str.replace(' ', '_')

In [219]:
data.keys()

Index(['name', 'description', 'abv', 'labels', 'hex_color', 'Ave Rating',
       'Brewery', 'ABV', 'Min IBU', 'Max IBU', 'Astringency', 'Body',
       'Alcohol', 'Bitter', 'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy',
       'Spices', 'Malty', 'beer_style'],
      dtype='object')

In [220]:
data = data.apply(fill_nan_based_on_types)
data.dropna(inplace=True)
codes, uniques = pd.factorize(data['beer_style'])
data['beer_style'] = codes

In [221]:
# Create translator to decode factors later
translator = dict()
for key in codes:
    translator[key] = uniques[key]
translator

{0: 'Winter Warmer',
 1: 'Belgian Pale Ale',
 2: 'American Wild Ale',
 3: 'Russian Imperial Stout',
 4: 'Pumpkin Ale',
 5: 'American Amber',
 6: 'Red Ale',
 7: 'English India Pale Ale (IPA)',
 8: 'American IPA',
 9: 'Light Lager',
 10: 'Red Lager',
 11: 'English Pale Mild Ale',
 12: 'Herbed',
 13: 'Spiced Beer',
 14: 'Scottish Ale',
 15: 'American Brown Ale',
 16: 'Dubbel',
 17: 'Saison',
 18: 'Farmhouse Ale',
 19: 'Weizenbock',
 20: 'American Dark Wheat Ale',
 21: 'Witbier',
 22: 'English Brown Ale',
 23: 'Fruit',
 24: 'Vegetable Beer',
 25: 'American Porter',
 26: 'Baltic Porter',
 27: 'Flanders Oud Bruin',
 28: 'Belgian IPA',
 29: 'American Pale Ale (APA)',
 30: 'English Bitter',
 31: 'Rauchbier',
 32: 'Tripel',
 33: 'American Pale Wheat Ale',
 34: 'American Double',
 35: 'Imperial IPA',
 36: 'Bock',
 37: 'Quadrupel (Quad)',
 38: 'Märzen',
 39: 'Oktoberfest',
 40: 'Vienna Lager',
 41: 'Chile Beer',
 42: 'American Barleywine',
 43: 'American Stout',
 44: 'Belgian Strong Dark Ale',
 4

In [222]:
# removed 'abv''ABV',
clean_data = data[['abv', 'hex_color',  'Min IBU', 'Max IBU', 'Astringency', 'Body', 'Alcohol', 'Bitter', 'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy', 'Spices', 'Malty', 'beer_style']]
clean_data = clean_data.drop_duplicates()

In [223]:
# Filter the DataFrame for rows where no color code available
df_to_apply = clean_data[clean_data['hex_color'] == '000000']


# Filter the DataFrame for rows where color code available
df_complete = clean_data[clean_data['hex_color'] != '000000']

In [224]:
df_complete

Unnamed: 0,abv,hex_color,Min IBU,Max IBU,Astringency,Body,Alcohol,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty,beer_style
6,7.5,FFD878,5,30,42,29,7,10,53,138,4,79,20,1,20,2
7,7.5,FFD878,5,30,42,29,7,10,53,138,4,79,20,1,20,3
8,10.5,36080A,50,80,4,101,27,62,88,10,1,15,29,9,137,3
16,4.1,FFD878,8,12,16,24,5,9,10,7,2,6,16,1,42,9
17,4.9,D77200,18,30,10,30,8,28,117,5,0,5,36,1,109,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27453,5.1,E58500,5,45,22,23,1,31,73,38,0,74,50,2,28,24
27558,5.4,470606,15,25,10,62,11,41,74,9,0,12,39,2,142,22
27559,9.9,7B1A00,40,100,3,39,38,50,147,24,0,47,59,5,78,76
27566,4.2,3F0708,30,40,23,78,8,56,42,10,0,9,35,8,98,74


#### Observation -> we have 39 Color Classes to train with and 103 beer styles  3635 rows


```python
Colors: ['FFD878', '36080A', 'D77200', 'FFBF42', 'B54C00', 'F8A600',
       '952D00', 'C35900', 'FBB123', 'A13700', 'B04500', 'CB6200',
       'BB5100', '600903', '9B3200', 'DE7C00', '5E0B00', '660D00',
       'CF6900', 'EA8F00', 'E58500', '470606', 'FFE699', '3B0607',
       'F39C00', '7B1A00', '3F0708', 'FFCA5A', '882300', '520907',
       '8E2900', 'A63E00', '5A0A02', '6A0E00', '701400', '440607',
       '771900', '4C0505', '821E00']


print(f'We have {len(df_complete.hex_color.unique())} classes for the training')
df_complete.hex_color.unique()
```

### Now I prepare the training set into training and test data: 

In [225]:
df_complete

Unnamed: 0,abv,hex_color,Min IBU,Max IBU,Astringency,Body,Alcohol,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty,beer_style
6,7.5,FFD878,5,30,42,29,7,10,53,138,4,79,20,1,20,2
7,7.5,FFD878,5,30,42,29,7,10,53,138,4,79,20,1,20,3
8,10.5,36080A,50,80,4,101,27,62,88,10,1,15,29,9,137,3
16,4.1,FFD878,8,12,16,24,5,9,10,7,2,6,16,1,42,9
17,4.9,D77200,18,30,10,30,8,28,117,5,0,5,36,1,109,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27453,5.1,E58500,5,45,22,23,1,31,73,38,0,74,50,2,28,24
27558,5.4,470606,15,25,10,62,11,41,74,9,0,12,39,2,142,22
27559,9.9,7B1A00,40,100,3,39,38,50,147,24,0,47,59,5,78,76
27566,4.2,3F0708,30,40,23,78,8,56,42,10,0,9,35,8,98,74


## Tree

In [226]:
# removed 'abv' 'ABV',
X = df_complete[['abv',  'Min IBU', 'Max IBU', 'Astringency', 'Body',
       'Alcohol', 'Bitter', 'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy',
       'Spices', 'Malty', 'beer_style']]

y = df_complete['hex_color']

# Split the data into training and testing sets (e.g., 80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a Random Forest Regressor
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)


print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_str)

Accuracy: 0.73
Classification Report:
              precision    recall  f1-score   support

      36080A       0.93      1.00      0.96        27
      3B0607       0.89      0.89      0.89         9
      3F0708       0.00      0.00      0.00         1
      470606       1.00      0.98      0.99        42
      520907       1.00      0.80      0.89        10
      5A0A02       1.00      0.50      0.67         2
      5E0B00       0.67      1.00      0.80         8
      600903       0.11      0.13      0.12        23
      660D00       1.00      1.00      1.00         1
      6A0E00       1.00      1.00      1.00         1
      771900       1.00      1.00      1.00         2
      7B1A00       1.00      0.25      0.40         4
      882300       0.77      0.67      0.71        15
      8E2900       0.82      0.78      0.80        18
      952D00       0.81      0.74      0.77        34
      9B3200       1.00      1.00      1.00        21
      A13700       1.00      1.00      1.00

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [227]:
# Make predictions on the testing set
pred_data = rf_model.predict(df_to_apply.drop('hex_color', axis=1))

df_final = df_to_apply.drop('hex_color', axis=1)
#df_final = pred_data
df_final.insert(1, "hex_color", pred_data)
df_final.beer_style = df_final.beer_style.map(translator)
df_complete.beer_style = df_complete.beer_style.map(translator)
X_train.beer_style = X_train.beer_style.map(translator)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_complete.beer_style = df_complete.beer_style.map(translator)


In [228]:
df_complete

Unnamed: 0,abv,hex_color,Min IBU,Max IBU,Astringency,Body,Alcohol,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty,beer_style
6,7.5,FFD878,5,30,42,29,7,10,53,138,4,79,20,1,20,American Wild Ale
7,7.5,FFD878,5,30,42,29,7,10,53,138,4,79,20,1,20,Russian Imperial Stout
8,10.5,36080A,50,80,4,101,27,62,88,10,1,15,29,9,137,Russian Imperial Stout
16,4.1,FFD878,8,12,16,24,5,9,10,7,2,6,16,1,42,Light Lager
17,4.9,D77200,18,30,10,30,8,28,117,5,0,5,36,1,109,American Amber
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27453,5.1,E58500,5,45,22,23,1,31,73,38,0,74,50,2,28,Vegetable Beer
27558,5.4,470606,15,25,10,62,11,41,74,9,0,12,39,2,142,English Brown Ale
27559,9.9,7B1A00,40,100,3,39,38,50,147,24,0,47,59,5,78,American Strong Ale
27566,4.2,3F0708,30,40,23,78,8,56,42,10,0,9,35,8,98,Irish Dry Stout


In [229]:

# append the new row to the original DataFrame
df = pd.concat([df_final, df_complete, X_train], ignore_index=True)

In [230]:
df.to_csv('./data/full_data_with_colors.csv')

In [14]:
df_final.to_csv('./data/dataset_with_colors.csv')