In [1]:
import os
# Make relative paths resolve from the project root: when the current working
# directory is not named 'food-pairing', move up exactly one level.
# NOTE(review): this assumes the notebook sits in a direct sub-folder of the
# project root -- confirm. Re-running the cell from any other location climbs
# one more directory each time.
if os.path.basename(os.getcwd()) != 'food-pairing':
    os.chdir(os.path.dirname(os.getcwd()))

from collections import Counter
from more_itertools import collapse
import plotly.express as px
import plotly.io as pio

from utils.data_loading import read_foods, read_molecules
from utils.ml_utils import find_n_neighbours, molecules2vec

## Molecule histograms
___

In [2]:
# Load the foods table through the project helper. The cells below index it
# like a pandas DataFrame and read its `foodb_ids`, `molecules` and `category`
# columns.
food_df = read_foods()

def calculate_list_length(lst):
    """Return the number of items in ``lst`` (any object that supports ``len``)."""
    item_count = len(lst)
    return item_count

In [3]:
def check_lengths(df):
    """Print the minimum, maximum and mean length of the ``foodb_ids`` entries.

    Args:
        df: DataFrame with a ``foodb_ids`` column whose values support ``len``
            (for example lists of ids).

    Returns:
        None. The three statistics are written to stdout.
    """
    # Keep the lengths in a local Series rather than writing a 'list_length'
    # column into the caller's frame: a reporting helper should not change
    # the data it reports on.
    lengths = df['foodb_ids'].apply(len)

    print("Minimum length:", lengths.min())
    print("Maximum length:", lengths.max())
    print("Average length:", lengths.mean())

In [4]:
# Print the min / max / mean number of entries in `foodb_ids` per food.
check_lengths(food_df)

Minimum length: 11
Maximum length: 382
Average length: 108.77574171029669


In [5]:
# Flatten the per-food `foodb_ids` lists into one stream of ids and count how
# often each id occurs across all foods.
molecules = collapse(food_df['foodb_ids'].values.tolist())
c = Counter(molecules)

In [45]:
# Ids that occur fewer than 5 times overall. The name says "single", but the
# threshold used is `< 5`.
# NOTE(review): the execution counters in this notebook are out of order and
# later cells show this list as empty -- confirm the results with a fresh
# Restart & Run All.
single_elements = [ k for k, v in c.items() if v < 5]
# Number of distinct ids in the dataset.
len(c)

889

In [48]:
def plot_histogram(df, output_path="images/molecules_occurences.png", tick_step=50):
    """Plot molecule occurrence counts as a rank-ordered bar chart and save it.

    Every id found in ``df['foodb_ids']`` is counted, the counts are sorted
    from most to least frequent, and one bar per id is drawn against its rank
    (1 = most frequent) on a log-scaled y axis. The figure is shown and then
    written to ``output_path``.

    Args:
        df: DataFrame with a ``foodb_ids`` column holding collections of ids.
        output_path: File the image is written to. Defaults to the path the
            notebook has always used.
        tick_step: Distance between x-axis ticks, in ranks.

    Returns:
        None.
    """
    counts = Counter(collapse(df['foodb_ids'].values.tolist()))
    # most_common() returns (id, count) pairs ordered by count, highest first.
    occurrences = [count for _, count in counts.most_common()]
    ranks = list(range(1, len(occurrences) + 1))

    fig = px.bar(
        x=ranks,
        y=occurrences,
        labels={
            'x': "Molecules",
            'y': "Number of occurences",
        },
        log_y=True,
    ).update_xaxes(categoryorder='total descending')

    fig.update_layout(
        template='ggplot2',
        font=dict(family="CMU Serif", size=14),
        height=500,
        width=900,
        margin=dict(l=20, r=20, t=20, b=20),
        bargap=0,
        barmode='group',
        bargroupgap=0,
        yaxis_title="Number of occurences",
        # Evenly spaced ticks need tickmode='linear'. The previous
        # tickmode='array' was given ticktext but no tickvals, so the 50-step
        # spacing was never applied and the labels were tied to a fixed
        # count of 780.
        xaxis=dict(tickmode='linear', tick0=0, dtick=tick_step),
    )
    fig.update_traces(
        marker_color='#EF553B',
        marker_line_width=0,
        selector=dict(type="bar"),
    )

    # Options for the "download plot" button of the interactive figure.
    config = {
        'toImageButtonOptions': {
            'format': 'png',  # one of png, svg, jpeg, webp
            'height': 400,
            'width': 600,
            'scale': 6,  # multiply title/legend/axis/canvas sizes by this factor
        }
    }

    fig.show(config=config)
    pio.write_image(fig, output_path, scale=6, width=800, height=500)

In [49]:
# Show the molecule-occurrence chart and save it under images/.
plot_histogram(food_df)

In [9]:
len(single_elements)

0

In [10]:
single_elements

[]

In [11]:
# Keep the foods whose `molecules` entry has at least 5 items.
# NOTE(review): this filters on `molecules`, while the length statistics and
# the histogram above read `foodb_ids` -- confirm the two columns are meant
# to differ here.
filtered_df = food_df[food_df['molecules'].apply(len) >= 5]

# Number of foods that pass the filter.
len(filtered_df)

950

In [12]:
# Distinct `molecules` values across the filtered foods. This rebinds
# `single_elements`, which an earlier cell uses for ids occurring fewer than
# 5 times.
# NOTE(review): displaying the whole list produces a very long output; its
# length or a short slice may be enough.
single_elements = filtered_df['molecules'].explode().unique().tolist()
single_elements

[768,
 8452,
 644104,
 527,
 8723,
 31260,
 798,
 5284639,
 11552,
 15394,
 6184,
 2345,
 439341,
 5365811,
 5315892,
 638011,
 61503,
 8768,
 323,
 638278,
 637511,
 6986,
 26447,
 637775,
 1110,
 5280598,
 61020,
 5280863,
 1889,
 643941,
 637542,
 6251,
 107,
 878,
 7284,
 7288,
 11128,
 637566,
 126,
 650,
 33931,
 441484,
 445070,
 5284503,
 8857,
 5318042,
 1183,
 6561,
 6050,
 6054,
 7847,
 5281708,
 441005,
 7858,
 4788,
 180,
 6072,
 853433,
 5280443,
 5363388,
 5280445,
 79803,
 8130,
 643779,
 107971,
 454,
 18635,
 439246,
 10448,
 439263,
 7654,
 998,
 439533,
 7150,
 240,
 244,
 247,
 89594,
 5280511,
 9064,
 1130,
 8655,
 8468,
 6202,
 8094,
 1224,
 8425,
 2257,
 12743,
 13248,
 474,
 11859,
 19934,
 19148,
 17407,
 16342,
 15473,
 15467,
 14944,
 14898,
 14664,
 14518,
 14503,
 14476,
 13762,
 13258,
 12847,
 12807,
 12681,
 12570,
 12538,
 12253,
 12241,
 12174,
 12086,
 12044,
 12011,
 11907,
 11889,
 11679,
 9021,
 8746,
 8740,
 8305,
 8298,
 8135,
 8125,
 8086,
 806

## Categories 
___

In [13]:
# Keep only the foods that have a category. `subset` is passed as a list so
# the call also works on pandas releases that do not accept a bare label.
# Note that this rebinds `food_df`: cells above see the reduced frame if they
# are re-run after this one.
food_df = food_df.dropna(subset=['category'])

In [14]:
food_df['category'].unique()

array(['seafood', 'fruit', 'seasoning', 'nut', 'vegetable', 'bean',
       'fish', 'herb', 'meat', 'alcohol', 'grain', 'beverage',
       'bakery product', 'dairy', 'sugar', 'mushroom'], dtype=object)

In [15]:
# Fixed colour per food category, so a category keeps the same colour in
# every chart. Keys match the values of the `category` column.
color_dict ={
    'seafood':'rgb(226,154,134)',# salmon / shrimp pink
    'fruit':'rgb(115, 207, 105)', # light green
    'seasoning':'rgb(215, 247, 91)', # lime yellow
    'nut':'rgb(92,61,14)', # dark brown
    'vegetable':'rgb(67, 140, 59)',# green
    'bean':'rgb(54, 66, 17)', # dark olive green
    'sugar':'rgb(222, 213, 191)', # beige
    'fish':'rgb(34, 124, 157)',# blue
    # 'nan':'black',  (disabled: rows without a category are dropped before plotting)
    'herb':'rgb(107, 194, 158)',# turquoise-ish
    'dairy':'rgb(255, 164, 0)', # orange
    'meat':'rgb(177,35,35)', # red
    'alcohol':'rgb(194, 121, 52)', # copper brown
    'grain':'rgb(255, 203, 119)',# light orange
    'beverage':'rgb(203, 203, 212)', # light grey
    'bakery product':'rgb(253, 240, 213) ',# cream; NOTE(review): the string ends with a space inside the quotes
    'mushroom':'rgb(250, 207, 173)' # peach
}

In [16]:
from collections import OrderedDict

In [17]:
import plotly.graph_objects as go

def plotly_pie(features_dict, color_dict, name = "Food categories"):
    """Draw a pie chart of the given counts, show it, and save it as a PNG.

    Args:
        features_dict: Mapping of label -> count; one slice per label.
        color_dict: Mapping of label -> colour string. Every label in
            ``features_dict`` must be present, otherwise ``KeyError`` is raised.
        name: Base name of the image written to ``images/<name>.png``.

    Returns:
        None.
    """
    # Labels in alphabetical order; counts and colours are looked up in that
    # same order so each slice is paired with its own colour.
    labels = sorted(features_dict)
    counts = [features_dict[label] for label in labels]
    slice_colors = [color_dict[label] for label in labels]

    pie_trace = go.Pie(
        labels=labels,
        values=counts,
        direction='clockwise',
        sort=True,
        marker=dict(colors=slice_colors),
        textposition='inside',
        textinfo='percent',
    )

    fig = go.Figure()
    fig.add_trace(pie_trace)
    fig.update_layout(
        font=dict(family="CMU Serif", size=16),
        width=600,
        height=500,
        margin=dict(l=20, r=20, b=0),
        legend=dict(
            orientation='v',
            yanchor='middle',
            xanchor='left',
            y=0.5,
        ),
    )

    # Options for the "download plot" button of the interactive figure.
    export_options = {
        'format': 'png',  # one of png, svg, jpeg, webp
        'height': 600,
        'width': 700,
        'scale': 6,  # multiply title/legend/axis/canvas sizes by this factor
    }

    fig.show(config={'toImageButtonOptions': export_options})
    pio.write_image(fig, f"images/{name}.png", scale=6, width=700, height=600)

In [18]:
# Count how many foods fall into each category, then draw the pie chart.
# The `category` values are already a flat list of strings, so `collapse`
# passes them through unchanged (the counter below is keyed by whole names).
categories = collapse(food_df['category'].values.tolist())
cat_counts = Counter(categories)

plotly_pie(cat_counts, color_dict)

In [19]:
cat_counts

Counter({'fruit': 201,
         'vegetable': 178,
         'fish': 117,
         'seasoning': 106,
         'herb': 82,
         'dairy': 65,
         'meat': 57,
         'bakery product': 49,
         'seafood': 48,
         'grain': 45,
         'alcohol': 42,
         'bean': 37,
         'nut': 30,
         'sugar': 27,
         'beverage': 23,
         'mushroom': 13})

In [20]:
def plotly_from_dict(cat_dict, name, food_no=1146):
    """Plot each category's share of the dataset as a bar chart and save it.

    Categories are ordered from most to least frequent and every count is
    divided by ``food_no``. The figure is shown and written to
    ``images/<name>.png``.

    Args:
        cat_dict: Mapping of category -> number of foods in that category.
        name: Base name of the saved image.
        food_no: Dataset size the shares are measured against. Defaults to
            1146, the value this function previously had hard-coded.

    Returns:
        None.

    Raises:
        ValueError: If ``food_no`` is not positive.
    """
    if food_no <= 0:
        raise ValueError("food_no must be a positive number")

    # Build the share mapping in one pass instead of overwriting dict values
    # while iterating over the same dict.
    ranked = sorted(cat_dict.items(), key=lambda item: item[1], reverse=True)
    shares = {category: count / float(food_no) for category, count in ranked}

    fig = go.Figure()
    fig.add_trace(
        go.Bar(
            x=list(shares.keys()),
            y=list(shares.values()),
        ))

    fig.update_layout(
        template='ggplot2',
        font=dict(family="CMU Serif", size=16),
        autosize=False,
        width=800,
        height=500,
        yaxis_title="<b>\u0025 of dataset</b>",
        yaxis_tickformat="2%",
        bargap=0.3,
        xaxis_title=None,
        xaxis_tickmode='linear',  # show a tick label for every category
        title_x=0.5,
    )
    fig.show()
    pio.write_image(fig, f"images/{name}.png", scale=6, width=800, height=500)

In [21]:
# Bar chart of each category's share of the dataset, saved as images/Categories.png.
plotly_from_dict(cat_counts, 'Categories')

## Similarity plots
___

In [53]:
import pandas as pd
# Foods and similarity methods for the similarity plots. plot_line turns a
# method name into the CSV column "<method> Similarity" and a food name into
# the file results/<food>_pairings.csv.
# NOTE(review): FOODS is not referenced by any cell visible here -- confirm
# whether it is still needed.
FOODS = ['tomato', 'onion', 'cinnamon', 'pepper']
METHODS = ['Cooccurences', 'Nearest neighbors', 'Panther', 'node2vec']

In [54]:
def plot_line(food, method):
    """Show a line plot of one method's similarity scores for one food.

    Reads ``results/<food>_pairings.csv`` and plots the values of its
    ``"<method> Similarity"`` column against their row position. The figure
    is only displayed; nothing is written to disk.

    Args:
        food: Food name used to build the CSV file name.
        method: Method name used to build the column name and the plot title.

    Returns:
        None.
    """
    pairings = pd.read_csv(f"results/{food}_pairings.csv", index_col=None)
    scores = pairings[f"{method} Similarity"].values.tolist()
    positions = range(len(scores))

    axis_labels = {'x': 'entry', 'y': 'similarity'}
    fig = px.line(x=positions, y=scores, labels=axis_labels)
    fig.update_layout(
        title=f"{method}",
        height=400,
        width=700,
        showlegend=False,
        font=dict(family="CMU Serif", size=14),
    )

    # Options for the "download plot" button of the interactive figure.
    export_options = {
        'format': 'png',  # one of png, svg, jpeg, webp
        'height': 600,
        'width': 900,
        'scale': 6,
    }
    fig.show(config={'toImageButtonOptions': export_options})

In [55]:
# One similarity plot per method, for tomato only.
# NOTE(review): 'tomato' is hard-coded although FOODS lists four foods --
# confirm whether the other foods should be plotted as well.
for method in METHODS:
    plot_line('tomato', method)