In [None]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
import unicodedata

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

from scipy.sparse.linalg import svds

import matplotlib.pyplot as plt

In [2]:
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Input, Conv2D, Concatenate, MaxPool2D, Reshape, Flatten, Dropout
from keras.layers import ReLU
from keras.layers import BatchNormalization, LSTM, Bidirectional

from keras.callbacks import CSVLogger, EarlyStopping, ReduceLROnPlateau
from keras.layers import Add

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [3]:
from keras.optimizers import Adam, SGD
from keras.callbacks import LearningRateScheduler
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from sklearn.manifold import TSNE
from keras import backend as K

In [4]:
# !python3 -m pip install gensim

In [5]:
from gensim.models import Word2Vec

In [6]:
%matplotlib inline

In [7]:
# !python3 -m pip install -U nltk
# !python3 -m nltk.downloader punkt

In [8]:
import nltk

In [9]:
from nltk.corpus import stopwords

In [10]:
path = 'recipes_with_calories.csv'

In [11]:
df = pd.read_csv(path, sep="|", index_col=0)

In [12]:
all_columns = list(df.columns)
len(all_columns)

680

In [13]:
df.dropna(axis=0).shape, df.shape

((15864, 680), (20052, 680))

In [14]:
for column in df.columns:
    if df[column].isnull().any():
        print(column)
#         df[column] = df[column].fillna(df[column].mean())

protein
fat
sodium


In [15]:
def get_small_dict(df, keep_columns, index):
    my_dict = df.iloc[index][keep_columns].to_dict()
    my_dict['tags'] = [
        key for key, value in df.iloc[index].to_dict().items() if value != 0 and key not in keep_columns]
    my_dict['title'] = my_dict['title'].strip()
    my_dict['tags'] = [
        str(unicodedata.normalize('NFKD', t).encode('ascii','ignore'), encoding='ascii') for t in my_dict['tags']
    ]
    return my_dict

In [16]:
keep_columns = ['title', 'rating', 'calories', 'protein', 'fat','sodium']

In [17]:
def remove_outliers_with_tukey(df):
    print('There were {} samples before outlier removal.'.format(len(df)))

    # Remove calories outliers by the Tukey rule
    Q3 = np.percentile(df[regr_column], 75)
    Q1 = np.percentile(df[regr_column], 25)
    IQR = Q3 - Q1

    outliers = df[regr_column].loc[
        (df[regr_column] < (Q1 - 1.5 * IQR)) | 
        (df[regr_column] > (Q3 + 1.5 * IQR))
    ].index
    
    df = df.drop(outliers)
    
    print('There are now {} samples.'.format(len(cut_df)))

    return df

In [24]:
# pick regression column
regr_column = 'protein'
cut_df = df[df[regr_column] == df[regr_column]]

cut_df = remove_outliers_with_tukey(df=cut_df)

# transform df in list of dictionaries for processing
regr_data = []
for i in range(len(cut_df)):
    regr_data.append(get_small_dict(cut_df, keep_columns, i))

There were 15890 samples before outlier removal.
There are now 15890 samples.


In [25]:
def remove_punctuation_signs(regr_data):
    for d in regr_data:
        if 'tags' not in d:
            raise ValueError('Expected to have <tags> key in each of the regression input data dictionary!')
    corpus = ['|'.join(
        d['tags'] + 
        [w.lower() for w in nltk.word_tokenize(d['title']) if
             w not in '''!()-[]{};:'"\,<>./?@#$%^&*_~''']
    ) for d in regr_data]
    return corpus

In [26]:
# make sure the data doesn't have any punctuation signs
corpus = remove_punctuation_signs(regr_data)

In [27]:
# pick tokenization method
def tokenize(word_list):
    return word_list.split('|')

In [28]:
def get_y_labels(regr_column, regr_data):
    return [d[regr_column] for d in regr_data]

In [29]:
y_label = get_y_labels(regr_column, regr_data)

In [30]:
if len(corpus) != len(y_label):
    print('[ERROR] There is a mismatch between the size of the input data ' +
          ' ({} elements) and the size of labels ({} elements)!'.format(len(corpus), len(y_label)))

## Try a vanilla regression (CountVectorizer + LinearRegression from sklearn)

In [34]:
# fit vectorizer
vectorizer = CountVectorizer(tokenizer=tokenize)
X = vectorizer.fit_transform(corpus)

# train, test data
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y_label, test_size=0.2, random_state=42)

# sklearn LinearRegression model
regr_sklearn = LinearRegression()
regr_sklearn.fit(X_train, y_train)

# predict data
y_pred = regr_sklearn.predict(X_test)

# how far off is the result? metric = mean squared error
y_test = np.array(y_test)
y_pred = np.array(y_pred)
print("Mean squared error: ", np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred)))
print("Std: ", np.std(y_test))
print("Mean: ", np.mean(y_test))

# show first k differences
first_k = 10
for yp, yt in zip(y_pred[:first_k], y_test[:first_k]):
    print(yp, yt)

Mean squared error:  119807508358.9083
Std:  15.518228806405345
Mean:  13.820304568527918
18.89750363295439 12.0
11.564207810810837 8.0
8.126576203755173 13.0
11.729228753498825 4.0
-49846010989.26628 2.0
7.23050381605986 3.0
2.9005344099319306 3.0
0.5286853498733368 4.0
18.47845723097685 20.0
38.56466141947109 35.0


## Try Word2Vec embeddings + Keras NN

### Word2Vec embedding

In [32]:
vocab = set()
for word_list in corpus:
    vocab |= set(tokenize(word_list))

print("Choose embedding size between {} and {}".format(np.log2(len(vocab)), np.power(len(vocab), 1/4) * 10))

w2v_corpus = []
for word_list in corpus:
    w2v_corpus.append(tokenize(word_list))

Choose embedding size between 12.523316912312747 and 87.59382856164761


In [35]:
EMBEDDING_SIZE = 50

In [36]:
w2v_model = Word2Vec(w2v_corpus, size=EMBEDDING_SIZE, window=5, min_count=1, workers=-1)

print("Embedding size is {}".format(w2v_model.wv.vector_size))

Embedding size is 50


In [37]:
sequence_lengths = [len(w) for w in w2v_corpus]

print("Sequence length should be {} ".format(pd.Series(sequence_lengths).describe()['mean']))

Sequence length should be 17.03228863467136 


In [38]:
FIXED_LENGTH = 17

In [39]:
def generate_embeddings_matrix(sample, model, fixed_length=17):
    embedding_size = model.wv.vector_size
    embeddings = []
    for sen in sample:
        for word in sen:
            if word not in model.wv.vocab:
                continue
            embeddings.append(model.wv[word])
            if len(embeddings) >= fixed_length:
                break

    embeddings = np.array(embeddings)

    while len(embeddings) < fixed_length:
        padding_length = fixed_length - len(embeddings)
        embeddings = np.concatenate((embeddings, embeddings[:padding_length]))
    if len(embeddings) > fixed_length:
        embeddings = embeddings[:fixed_length]

    return embeddings

In [40]:
def generate_input_matrix(sample, fixed_length=17):
    
    inputs = np.array(sample)

    while len(inputs) < fixed_length:
        padding_length = fixed_length - len(inputs)
        inputs = np.concatenate((inputs, inputs[:padding_length]))
    if len(inputs) > fixed_length:
        inputs = inputs[:fixed_length]

    return inputs

In [41]:
# Generate list of input data (embedding matrix) and make sure every input has FIXED_LENGTH elements
X = []
for sample in w2v_corpus:
    embedded_sample = generate_embeddings_matrix(sample, w2v_model, FIXED_LENGTH)
    if embedded_sample.shape != (FIXED_LENGTH, EMBEDDING_SIZE):
        print(embedded_sample.shape)
    X.append(embedded_sample)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y_label, test_size=0.2, random_state=42)

### Keras NN

In [43]:
FIXED_LENGTH = 17
EMBEDDING_SIZE = 50

BATCH_SIZE = 16
EPOCHS=200

In [44]:
def baseline_model():
    # Create model
    input_layer = Input(shape=(FIXED_LENGTH, EMBEDDING_SIZE), name='input')
    flat = Flatten()(input_layer)
    z = Dense(32, activation='relu', name='fc1', kernel_initializer='normal')(flat)
    output = Dense(1, kernel_initializer='normal')(z)
    model = Model(inputs=[input_layer], outputs=[output])
    # Compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [45]:
model = baseline_model()

In [46]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 17, 50)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 850)               0         
_________________________________________________________________
fc1 (Dense)                  (None, 32)                27232     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 27,265
Trainable params: 27,265
Non-trainable params: 0
_________________________________________________________________


In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y_label, test_size=0.2, random_state=42)

In [48]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [49]:
model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200


Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x7fb60c387748>

In [50]:
y_pred = model.predict(X_test)

y_pred = y_pred.flatten()

# how far off is the result? metric = mean squared error
y_test = np.array(y_test)
y_pred = np.array(y_pred)
print("Mean squared error: ", np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred)))
print("Std: ", np.std(y_test))
print("Mean: ", np.mean(y_test))

# show first k differences
first_k = 10
print("Showing the first {} differences between predicted and real.".format(first_k))
for yp, yt in zip(y_pred[:first_k], y_test[:first_k]):
    print(yp, yt)

Mean squared error:  14.582119125951243
Std:  15.518228806405345
Mean:  13.820304568527918
Showing the first 10 differences between predicted and real.
15.908257 12.0
7.369284 8.0
15.097214 13.0
24.635046 4.0
11.819252 2.0
11.392851 3.0
8.336748 3.0
13.75466 4.0
14.047829 20.0
18.395899 35.0


### Save model

In [None]:
model.save('models/predict_protein_model')

### Get the dataframe that has NaN values for our regression column (calories)

In [None]:
predict_df = df[df[regr_column] != df[regr_column]]

### Get the input data we need to predict the needed values

In [None]:
predict_data = []
for i in range(len(predict_df)):
    predict_data.append(get_small_dict(predict_df, keep_columns, i))
corpus = ['|'.join(
    d['tags'] + 
    [w.lower() for w in nltk.word_tokenize(d['title']) if w not in '''!()-[]{};:'"\,<>./?@#$%^&*_~''']
) for d in predict_data]

### Split input data using the tokenizer

In [None]:
predict_corpus = []
for word_list in corpus:
    predict_corpus.append(tokenize(word_list))

### Make sure input data has a fixed shape

In [None]:
X = []
for sample in predict_corpus:
    sample = [w for w in sample if w in w2v_model.wv]
    input_sample = generate_embeddings_matrix(sample, w2v_model, FIXED_LENGTH)
    if input_sample.shape != (FIXED_LENGTH, EMBEDDING_SIZE):
        print(input_sample.shape)
    X.append(input_sample)

### Predict missing data 

In [None]:
y_pred = model.predict(np.array(X))

y_pred = y_pred.flatten()

### Fill the dataframe with predicted values

In [None]:
predict_df[regr_column] = y_pred

### Update initial dataframe with missing values

In [None]:
update_regr_column = [np.nan for _ in range(len(df))]
for idx, value in zip(predict_df.index, y_pred):
    update_regr_column[idx] = value

In [None]:
df.update({regr_column: update_regr_column})

### Save updated dataframe

In [None]:
df.to_csv('recipes_with_calories_and_protein.csv', sep='|')