In [30]:
# import public things

# general / random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipynb
import re # for string parsing / editing
import string # for string parsing / editing
from datetime import datetime
import time
import random
from pathlib import Path
import os
import ast

# for html
import requests # for getting html off the web
from bs4 import BeautifulSoup # for parsing html
import json

# for ML
from wordcloud import WordCloud, STOPWORDS
import snowballstemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

# import functions from my functions file
import ipynb.fs.full.functions as funcs

# update a module if it's been edited
# (this works around Jupyter's behaviour where simply re-importing an already-imported module has no effect)
# https://support.enthought.com/hc/en-us/articles/204469240-Jupyter-IPython-After-editing-a-module-changes-are-not-effective-without-kernel-restart
import importlib
importlib.reload(funcs)

<module 'ipynb.fs.full.functions' (/home/bkotryna/ML_practice/allrecipes_project/functions.ipynb)>

## directly from Kenny

In [15]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF

# Toy corpus: each row is [rating, recipe text].
# NOTE: a NumPy array has one dtype, so mixing floats and strings stores the
# ratings as strings too; only the text column (index 1) is used in this cell.
data = np.array([
        [4.5, 'Mix eggs with sugar'],
        [3.0, 'Pour water over the broccoli'],
        [4.8, 'Sugar goes well with eggs'],
        [2.8, 'Broccoli soup is made with water and broccoli']
        ])

STOP_WORDS = ['with', 'the', 'and', 'is']

# Bag-of-words counts, one row per recipe and one column per vocabulary word.
# .toarray() gives a plain ndarray; .todense() would give the legacy np.matrix
# type, which recent scikit-learn releases refuse as estimator input.
vectorizer = CountVectorizer(stop_words=STOP_WORDS)
vectors = vectorizer.fit_transform(data[:,1]).toarray()

# Print vocab items with their frequencies, sorted in descending order by frequency
word_counts = vectors.sum(axis=0)  # total occurrences of each vocabulary column
word_and_frequency_tuples = [
    # int() keeps the printed tuples free of NumPy scalar reprs
    (word, int(word_counts[index]))
    for word, index in vectorizer.vocabulary_.items()
]
by_freq = sorted(word_and_frequency_tuples, key=lambda x: x[1], reverse=True)
print(by_freq[0:5])

vocab_size = len(word_and_frequency_tuples)
print('Number of words in vocab = {}'.format(vocab_size))

# Reverse lookup: column index -> word
indices_to_words = {index : word for word, index in vectorizer.vocabulary_.items()}

# Factorise the count matrix into 2 components ("topics")
nmf = NMF(n_components=2)
nmf_projections = nmf.fit_transform(vectors)


# Inspect what each component is about (i.e. what the topic is)
for component_id in range(len(nmf.components_)):
    print('\nComponent {}'.format(component_id))
    word_indices_in_descending_order_by_importance_for_component = sorted(
        range(vocab_size),
        key=(lambda word_index: nmf.components_[component_id, word_index]),
        reverse=True
    )
    for index in word_indices_in_descending_order_by_importance_for_component[:5]:
        word = indices_to_words[index]
        importance = nmf.components_[component_id, index]
        print('{} : {:.3f}'.format(word, importance))

# For each component, find the sentence that has the largest projection along that component
for component_id in range(len(nmf.components_)):
    id_of_recipe_with_greatest_projection_along_component = max(
        range(len(data)),
        key=(lambda recipe_index: nmf_projections[recipe_index, component_id])
    )
    print('\nComponent {}:'.format(component_id))
    print('The recipe that most strongly embodies this component is:')
    print(data[id_of_recipe_with_greatest_projection_along_component, 1])

[('broccoli', 3), ('eggs', 2), ('sugar', 2), ('water', 2), ('mix', 1)]
Number of words in vocab = 11

Component 0
broccoli : 1.291
water : 0.798
made : 0.493
soup : 0.493
over : 0.305

Component 1
eggs : 0.914
sugar : 0.914
goes : 0.513
well : 0.513
mix : 0.401

Component 0:
The recipe that most strongly embodies this component is:
Broccoli soup is made with water and broccoli

Component 1:
The recipe that most strongly embodies this component is:
Sugar goes well with eggs


In [19]:
# Show the whole row (rating + text) for the recipe picked in the LAST iteration
# of the loop in the previous cell -- this relies on that leaked loop variable.
# The rating comes back as a string because `data` is a single-dtype string array.
data[id_of_recipe_with_greatest_projection_along_component]

array(['4.8', 'Sugar goes well with eggs'], dtype='<U45')

In [3]:
# Print every word of the fitted vocabulary, one per line
for word in vectorizer.vocabulary_.keys():
    print(word)

mix
eggs
sugar
pour
water
over
broccoli
goes
well
soup
made


In [21]:
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

# Toy regression problem: three hand-made features per recipe, one rating per recipe.
feature_names = ['Cakeyness', 'Veganness', 'Frenchness']

# Training set: 4 recipes x 3 features, plus the rating of each recipe.
train_features = np.array([[4.5, 0.1, 5.0],
                           [8.2, 0.2, 0.0],
                           [0.3, 5.6, 0.0],
                           [0.4, 8.9, 5.0]])
train_ratings = np.array([4.9, 4.9, 0.1, 0.1])

# Validation set: 2 recipes x 3 features, plus the rating of each recipe.
validation_features = np.array([[3.5, 0.1, 2.5],
                                [0.4, 6.9, 2.5]])
validation_ratings = np.array([4.2, 0.3])

# Aliases under the conventional X / y names (same array objects, not copies)
X_train, y_train = train_features, train_ratings

def train_and_evaluate_regressor(regressor):
    """Fit ``regressor`` on the training set and report importances and errors.

    Reads the notebook-level globals ``X_train``, ``y_train``,
    ``validation_features``, ``validation_ratings`` and ``feature_names``.

    Parameters
    ----------
    regressor : estimator
        Object exposing ``fit``, ``predict`` and, after fitting,
        ``feature_importances_`` (e.g. ``GradientBoostingRegressor``).
        It is fitted in place.

    Returns
    -------
    tuple of (float, float)
        Mean absolute error on the training set and on the validation set,
        in that order. The same numbers are also printed.
    """
    regressor.fit(X_train, y_train)

    # Pair each importance with its feature name, most important first
    features_and_importances = sorted(
        (
            (feature_names[feature_id], feature_importance)
            for feature_id, feature_importance in enumerate(regressor.feature_importances_)
        ),
        key=(lambda pair: pair[1]),
        reverse=True,
    )
    for feature_name, feature_importance in features_and_importances[:5]:
        print('Feature {}: importance = {:.3f}'.format(feature_name, feature_importance))

    # Score on exactly the data the model was fitted on (X_train / y_train),
    # rather than on separately named aliases that could drift apart.
    train_predictions = regressor.predict(X_train)
    validation_predictions = regressor.predict(validation_features)
    mean_abs_error_on_train = mean_absolute_error(y_train, train_predictions)
    mean_abs_error_on_validation = mean_absolute_error(validation_ratings, validation_predictions)
    print('Mean abs error on train = {:.3f}'.format(mean_abs_error_on_train))
    print('Mean abs error on validation = {:.3f}'.format(mean_abs_error_on_validation))

    return mean_abs_error_on_train, mean_abs_error_on_validation
        
    


# Sweep the learning rate while n_estimators stays fixed at 100.
# High learning rates => overfitted
# Low learning rates => underfitted
for learning_rate in [0.1, 0.05, 0.025, 0.010, 0.005, 0.001]:
    print(f'\nLearning rate = {learning_rate:.3f}')
    regressor = GradientBoostingRegressor(learning_rate=learning_rate, n_estimators=100)
    train_and_evaluate_regressor(regressor)


Learning rate = 0.100
Feature Cakeyness: importance = 0.524
Feature Veganness: importance = 0.476
Feature Frenchness: importance = 0.000
Mean abs error on train = 0.000
Mean abs error on validation = 0.450

Learning rate = 0.050
Feature Veganness: importance = 0.622
Feature Cakeyness: importance = 0.378
Feature Frenchness: importance = 0.000
Mean abs error on train = 0.014
Mean abs error on validation = 0.436

Learning rate = 0.025
Feature Veganness: importance = 0.524
Feature Cakeyness: importance = 0.476
Feature Frenchness: importance = 0.000
Mean abs error on train = 0.191
Mean abs error on validation = 0.259

Learning rate = 0.010
Feature Veganness: importance = 0.520
Feature Cakeyness: importance = 0.480
Feature Frenchness: importance = 0.000
Mean abs error on train = 0.878
Mean abs error on validation = 0.428

Learning rate = 0.005
Feature Veganness: importance = 0.506
Feature Cakeyness: importance = 0.494
Feature Frenchness: importance = 0.000
Mean abs error on train = 1.454
Me

# Test NMF stemmer

In [69]:
# Three toy recipes keyed by string recipe ids, built in a single constructor call
df = pd.DataFrame(
    {
        'title': ['apple pie', 'beef stew', 'chicken nuggets'],
        'ingredients': ['apples, pastry', 'beef, potatoes', 'chicken, breadcrumbs'],
    },
    index=['1334', '1456', '1998'],
)
df

Unnamed: 0,title,ingredients
1334,apple pie,"apples, pastry"
1456,beef stew,"beef, potatoes"
1998,chicken nuggets,"chicken, breadcrumbs"


In [70]:
# Label-based lookup of a single cell: row label '1334', column 'title'
df.loc['1334', 'title']

'apple pie'

In [71]:
# Read the same cell (row label '1334', column 'title') through the .at accessor
df.at['1334','title']

'apple pie'

In [72]:
# Walk over the row labels of df, printing each one on its own line
for index in df.index:
    print(index)

1334
1456
1998


In [75]:
# Display the toy frame again; it still holds only the 'title' and 'ingredients' columns
df

Unnamed: 0,title,ingredients
1334,apple pie,"apples, pastry"
1456,beef stew,"beef, potatoes"
1998,chicken nuggets,"chicken, breadcrumbs"


In [76]:
def _stem_text(stemmer, text):
    """Stem each whitespace-separated word of ``text`` and re-join them into one quote-free string."""
    stemmed_words = stemmer.stemWords(text.split())
    # remove strange quotation marks
    # (ideally we would know how not to generate them in the first place)
    return ' '.join(stemmed_words).replace("'", "").replace('"', '')


def make_nmf_S(col, df, nmf_df, n=4, random_state=None):
    """Stem one text column, bag-of-words it, and project it onto ``n`` NMF components.

    Progress and diagnostic information (top words per component, the entry
    that most strongly embodies each component) is printed / displayed along
    the way. Relies on IPython's ``display`` being available, as it is inside
    a notebook.

    Parameters
    ----------
    col : str
        Name of the column of ``df`` to process; every cell must be a string.
    df : pandas.DataFrame
        Source frame, indexed by recipe id. It is NOT modified: the stemmed
        text is kept in a local variable instead of being written back as a
        ``'stemmed'`` column on the caller's frame.
    nmf_df : pandas.DataFrame
        Unused; kept only so existing calls keep working.
    n : int, default 4
        Number of NMF components to keep.
    random_state : int or None, default None
        Forwarded to ``NMF``. scikit-learn's default initialisation can be
        random (e.g. when ``n`` exceeds the number of rows or the vocabulary
        size), so pass an int to get reproducible components.

    Returns
    -------
    pandas.DataFrame
        A NEW frame (not an augmented copy of ``df``) with one row per row of
        ``df``, index named ``'recipe_id'`` carrying ``df``'s index labels,
        and columns ``f"{col}_nmf_1"`` ... ``f"{col}_nmf_{n}"`` holding the
        NMF projections.
    """
    print(f"\n************\nNow working on column '{col}':")

    display(df)

    # let's stem: one string of stemmed words per row
    stemmer = snowballstemmer.stemmer('english')
    stemmed_strings = []
    # iterating the column directly (rather than .loc per label) also copes
    # with duplicate index labels
    for index, item in df[col].items():
        print(index)
        print(item)
        stemmed_strings.append(_stem_text(stemmer, item))
    print('Stemming done')

    # object-dtype array of stemmed strings, aligned with df's rows
    data = pd.Series(stemmed_strings, index=df.index, dtype=object).to_numpy()
    display(data)

    # tokenise (make into a bag of words)
    # .toarray() gives a plain ndarray; .todense() would give the legacy
    # np.matrix type, which recent scikit-learn releases refuse as input.
    vectorizer = CountVectorizer(stop_words='english')
    vectors = vectorizer.fit_transform(data).toarray()
    print(f"vocabulary size: {vectors.shape[1]}")
    print('Tokenising done.')

    # Print vocab items with their frequencies, sorted in descending order by frequency
    word_counts = vectors.sum(axis=0)
    word_and_frequency_tuples = [
        # int() keeps the printed tuples free of NumPy scalar reprs
        (word, int(word_counts[index]))
        for word, index in vectorizer.vocabulary_.items()
    ]
    by_freq = sorted(word_and_frequency_tuples, key=lambda x: x[1], reverse=True)
    print(f'Most freqeunt words are:\n{by_freq[0:5]}')
    vocab_size = len(word_and_frequency_tuples)

    # reverse lookup: column index -> word
    indices_to_words = {index: word for word, index in vectorizer.vocabulary_.items()}

    # do NMF
    nmf = NMF(n_components=n, random_state=random_state)
    nmf_projections = nmf.fit_transform(vectors)
    print('NMF transforming done.')

    # Inspect individual components (nmf.components_ is an np.array, one row per component)
    for component_id, component in enumerate(nmf.components_):
        print(f'\n***\nComponent {component_id} for {col}')

        ### Inspect what each component is about (i.e. what the topic is)
        print('\nMost important words are:')
        top_word_indices = sorted(
            range(vocab_size),
            key=(lambda word_index: component[word_index]),
            reverse=True
        )[:5]
        for word_index in top_word_indices:
            print('{} : {:.3f}'.format(indices_to_words[word_index], component[word_index]))

        ### find the entry that has the largest projection along that component
        best_row = max(
            range(len(data)),
            key=(lambda recipe_index: nmf_projections[recipe_index, component_id])
        )
        print('\nThe entry that most strongly embodies this component is:')
        print(data[best_row])

    # generate column names for nmf df (1-based component numbering)
    col_names_list = [
        f"{col}_nmf_{num}" for num in range(1, nmf_projections.shape[1] + 1)
    ]

    # generate nmf df, indexed by the original row labels under the name 'recipe_id'
    our_col_nmf_df = pd.DataFrame(
        nmf_projections,
        columns=col_names_list,
        index=df.index.rename('recipe_id'),
    )
    print('\n***\nnp.array made into pd.df')
    print("Index now reset back to recipe_id.")

    return our_col_nmf_df

In [77]:
# generate a master nmf_df
nmf_df = pd.DataFrame(index=df.index)

# try NMF
col = 'title'
our_col_nmf_df = make_nmf_S(col, df, nmf_df, n=4)


************
Now working on column 'title':


Unnamed: 0,title,ingredients
1334,apple pie,"apples, pastry"
1456,beef stew,"beef, potatoes"
1998,chicken nuggets,"chicken, breadcrumbs"


1334
apple pie
1456
beef stew
1998
chicken nuggets
Stemming done


array(['appl pie', 'beef stew', 'chicken nugget'], dtype=object)

vocabulary size: 6
Tokenising done.
Most freqeunt words are:
[('appl', 1), ('pie', 1), ('beef', 1), ('stew', 1), ('chicken', 1)]
NMF transforming done.

***
Component 0 for title

Most important words are:
appl : 0.487
pie : 0.487
beef : 0.000
chicken : 0.000
nugget : 0.000

The entry that most strongly embodies this component is:
appl pie

***
Component 1 for title

Most important words are:
stew : 0.887
beef : 0.812
appl : 0.000
chicken : 0.000
nugget : 0.000

The entry that most strongly embodies this component is:
beef stew

***
Component 2 for title

Most important words are:
nugget : 1.831
chicken : 1.831
appl : 0.000
beef : 0.000
pie : 0.000

The entry that most strongly embodies this component is:
chicken nugget

***
Component 3 for title

Most important words are:
beef : 0.313
stew : 0.182
appl : 0.000
chicken : 0.000
nugget : 0.000

The entry that most strongly embodies this component is:
beef stew

***
np.array made into pd.df
Index now reset back to recipe_id.


In [79]:
# Inspect the frame returned by make_nmf_S for 'title':
# one row per recipe_id, one column per NMF component
our_col_nmf_df

Unnamed: 0_level_0,title_nmf_1,title_nmf_2,title_nmf_3,title_nmf_4
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1334,2.051446,0.0,0.0,0.0
1456,0.0,1.007735,0.0,0.581224
1998,0.0,0.0,0.546009,0.0
