In [None]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
import pymongo
import certifi
import re
import string

from os import path, getcwd
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [None]:
def read_from_food_metadata(collection): # collection is string
    """Load every document of one ``food_metadata`` collection into the global ``df_dicts``.

    Parameters
    ----------
    collection : str
        Name of the collection inside the ``food_metadata`` database.

    Side effects
    ------------
    Rebinds the module-level ``df_dicts`` to a DataFrame holding one row per
    document returned by an unfiltered ``find()``. Returns ``None``.
    """
    global df_dicts

    # `tlsCAFile` replaces the `ssl_ca_certs` keyword, which PyMongo 4 no longer accepts.
    client = pymongo.MongoClient('your mongodb uri', tlsCAFile=certifi.where())
    try:
        cursor = client.food_metadata.get_collection(collection).find()
        # Drain the cursor while the connection is still open.
        documents = list(cursor)
    finally:
        # Release the connection pool even if the query fails.
        client.close()

    df_dicts = pd.DataFrame.from_dict(documents)


In [None]:
def preprocessing(collection):
    """Clean the raw recipe frame in ``df_dicts`` and build the per-review frame ``df_new``.

    Parameters
    ----------
    collection : str
        Name of the collection being processed. It selects the collection-specific
        parsing branches below and is used as the file stem of the word-cloud PNG.

    Side effects
    ------------
    * Rebinds the module-level ``df`` (one row per recipe, cleaned numeric columns,
      ``ingredients_new`` text, split nutrition columns).
    * Rebinds the module-level ``df_new`` (``df`` exploded on ``chain`` into one row
      per review, with ``users``, ``stars``, ``reviews`` and ``userid`` columns).
    * Writes ``<collection>.png`` (ingredient word cloud) to the working directory.
    * Prints which ``chain`` parsing branch was taken.

    Returns ``None``.

    Notes
    -----
    Requires the NLTK ``stopwords`` corpus and the WordNet data used by
    ``textblob.Word.lemmatize`` to be available locally.
    """
    global df_dicts, df_new, df

    df = df_dicts.copy()
    df['food_id'] = df.groupby(['food_name']).ngroup()

    def replace_columns(column, context):
        # Remove every occurrence of `context` from the string form of each cell.
        df[column] = [str(value).replace(context, "") for value in df[column]]

    def new_column(column):
        # Drop thousands separators and convert each cell to float.
        df[column] = [float(str(value).replace(",", "")) for value in df[column]]

    # rating / review_count / photo_count: strip the plural label first, then the
    # singular one, then convert what is left to a number.
    for column, label in (("rating", "Rating"), ("review_count", "Review"), ("photo_count", "Photo")):
        replace_columns(column, label + "s")
        replace_columns(column, label)
        new_column(column)

    # total_time: "<n> <unit> <n> <unit>" -> minutes
    df[['sayi1', 'mins_hr', 'sayi2', 'mins']] = df['total_time'].str.split(' ', expand=True, n=3).fillna(0)
    df['sayi1'] = df['sayi1'].astype(int)
    df['sayi2'] = df['sayi2'].astype(int)
    # NOTE(review): only the exact unit token 'hr' triggers the hour conversion; if the
    # data also contains other spellings (e.g. 'hrs'), those rows keep just the leading
    # number - confirm against the source data.
    df['sayi1'] = np.where(df['mins_hr'] == 'hr', df['sayi1'] * 60 + df['sayi2'], df['sayi1'])
    df = df.drop(['mins_hr', 'sayi2', 'mins'], axis=1)
    df = df.rename(columns={'sayi1': 'total_time_new'})
    # A total of 0 means the time was missing. Assign the result back instead of using
    # a chained `inplace=True`, which does nothing under pandas copy-on-write.
    df["total_time_new"] = df["total_time_new"].replace(0, np.nan)

    # for lowcholesterol collection
    # Uses the `collection` argument rather than a module-level loop variable, so the
    # function also works when called outside the driver loop.
    # NOTE(review): the notebook's driver list spells this collection 'low_cholesterol',
    # which does not match the literal below - confirm which spelling is intended.
    if collection == 'lowcholesterol':
        chainlist = df.iloc[0:353, :]
        # Copy the slice so the assignment below does not write into a view of `df`.
        chainstr = df.iloc[353:943, :].copy()
        # NOTE(security): eval() executes whatever text is stored in the database;
        # only safe while that content is fully trusted.
        chainstr["chain"] = chainstr["chain"].apply(eval)
        df = pd.concat([chainlist, chainstr], axis=0)

    # for regex funcs (raw string so `\w` / `\d` are not treated as string escapes)
    token_with_digit = re.compile(r'\w*\d\w*')
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))

    def alphanumeric(text):
        # Blank out every token that contains a digit.
        return token_with_digit.sub(' ', text)

    def punc_lower(text):
        # Lower-case the text and blank out ASCII punctuation.
        return punctuation.sub(' ', text.lower())

    # ingredients
    if collection in ('gluten_free', 'keto', 'vegan'):
        # These collections are parsed from text first.
        # NOTE(security): eval() on database content - see the note above.
        df['ingredients_new'] = df['ingredients'].apply(eval)
        df["ingredients_new"] = [','.join(map(str, items)) for items in df['ingredients_new']]
    else:
        # The cells are lists here, so join them into one string for the text steps below.
        df["ingredients_new"] = [','.join(map(str, items)) for items in df['ingredients']]

    df['ingredients_new'] = df['ingredients_new'].map(alphanumeric).map(punc_lower).str.strip()
    # Units, preparation words and very common ingredients that are removed before the
    # word cloud is built. Matching is per whitespace-separated token, so the
    # two-word entry "fluid ounce" can never match.
    # NOTE(review): 'sprey' looks like a misspelling of 'spray' - confirm before changing.
    words = {"teaspoon","teaspoons", "tablespoon","tablespoons", "fluid ounce","gill","cup","cups", "pint","quart",
             "gallon","ml","liter", "pound","ounce","ounces", "mg","gram","kg","length","mm","cm","meter","inch",
             "chopped", "taste", "water", "ground", "large", "sliced", "diced", 'cut', 'into', 'black', 'pepper',
             "salt", 'white', 'sugar', 'olive', 'oil', 'clove', 'garlic', 'onion', 'minced', 'tomato', 'optional',
             'fresh', 'crushed', 'drained', 'rinsed', 'purpose', 'flour', 'lemon', 'juice', 'cooking', 'sprey',
             'finely', 'dried', 'small', 'baking', 'powder', 'vanilla', 'extract', 'heavy', 'whipping', 'pinch', 'thinly',
             'peeled', 'extra', 'virgin', "½", "¼", "⅓", "¾", "⅛", "⅔"}
    df['ingredients_new'] = df['ingredients_new'].apply(
        lambda text: " ".join(token for token in text.split() if token not in words))

    # for WordCloud
    # The image is written straight to disk; no matplotlib figure is opened, which
    # avoids leaving an empty, never-closed figure behind on every call.
    text = " ".join(df['ingredients_new'])
    wordcloud = WordCloud(width=600, height=250, max_font_size=50, max_words=2000, background_color="white").generate(text)
    wordcloud.to_file(collection + '.png')

    # for stopwords in ingredients
    from nltk.corpus import stopwords
    sw = set(stopwords.words('english'))
    df['ingredients_new'] = df['ingredients_new'].apply(
        lambda text: " ".join(token for token in text.split() if token not in sw))

    # for lemmatization in ingredients
    from textblob import Word
    df['ingredients_new'] = df['ingredients_new'].apply(
        lambda text: " ".join([Word(token).lemmatize() for token in text.split()]))

    # nutrition
    replace_columns("nutrition", ". Full Nutrition")
    nutrition_list = ["protein", "carbohydrates", "fat", "cholesterol", "sodium"]
    df['nutrition_new'] = df['nutrition'].apply(
        lambda text: " ".join(token for token in text.split() if token not in nutrition_list))
    replace_columns("nutrition_new", " calories")
    replace_columns("nutrition_new", "g")
    replace_columns("nutrition_new", "m")
    df[["calories", "protein_gr", "carbohydrates_gr", "fat_gr", "cholesterol_mg_sodium_mg"]] = df['nutrition_new'].str.split(';', expand=True, n=4)
    for column in ('calories', 'protein_gr', 'carbohydrates_gr', 'fat_gr'):
        df[column] = df[column].str.strip()
    df[['cho', 'so']] = df["cholesterol_mg_sodium_mg"].str.split(';', expand=True)
    df['cho'] = df['cho'].str.strip()
    df['so'] = df['so'].str.strip()
    # When only one value follows the fat field it is taken as sodium, and cholesterol
    # is set to 0 (0 rather than "unknown", to keep the column usable for analysis).
    # The mask is captured before filling so that rows where cholesterol and sodium
    # are both present and happen to be equal keep their cholesterol value.
    sodium_missing = df['so'].isna()
    df[['cho', 'so']] = df[['cho', 'so']].fillna(0)
    df['so'] = np.where(sodium_missing, df['cho'], df['so'])
    df['cho'] = np.where(sodium_missing, 0, df['cho'])
    df = df.drop(["nutrition_new", 'cholesterol_mg_sodium_mg'], axis=1)
    df = df.rename(columns={'cho': 'cholesterol_mg', 'so': 'sodium_mg'})
    numeric_columns = ['rating', 'review_count', 'photo_count', 'total_time_new', 'calories', 'protein_gr',
                       'carbohydrates_gr', 'fat_gr', 'cholesterol_mg', 'sodium_mg']
    df[numeric_columns] = df[numeric_columns].astype(float)

    df_new = df.copy()

    # chain
    if collection in ('gluten_free', 'keto', 'vegan'):
        # NOTE(security): eval() on database content - see the note above.
        df_new["chain"] = df_new["chain"].apply(eval)
        print('bu collection', collection)
    else:
        print('gluten,keto,vegan degil')

    # One row per entry of the `chain` list.
    df_new = df_new.explode('chain').reset_index(drop=True)

    if collection in ('vegan', 'vegetarian'):
        # For these two collections the split is assigned to three columns.
        df_new[["users", "stars", 'Nonecolumn']] = df_new['chain'].str.split(pat="Rating: ", expand=True)
    else:
        df_new[["users", "stars"]] = df_new['chain'].str.split(pat="Rating: ", expand=True)

    df_new[['stars', "reviews"]] = df_new['stars'].str.split(pat=" stars       ", expand=True)
    # Only the text after the second "/" is kept as the review.
    df_new[["redundant1", "redudant2", 'reviews']] = df_new['reviews'].str.split(pat="/", n=2, expand=True)
    df_new = df_new.drop(["chain", "redundant1", "redudant2"], axis=1)
    df_new["stars"] = df_new["stars"].astype(float)
    df_new["users"] = df_new["users"].str.strip()

    # reviews
    df_new["reviews"] = df_new["reviews"].str.replace(" Read More     ", "", regex=False)
    # `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern as a
    # literal by default, which would leave the digits in place.
    df_new["reviews"] = df_new["reviews"].str.replace(r'(\d+)', '', regex=True).str.strip()
    df_new = df_new.drop_duplicates(subset="reviews")
    df_new = df_new.drop(['_id', 'index'], axis=1)

    # for filling NaNs
    df_new['total_time_new'] = df_new['total_time_new'].fillna(df_new['total_time_new'].median())
    fill_columns = ['photo_count', 'calories', 'carbohydrates_gr', 'fat_gr', 'cholesterol_mg', 'sodium_mg']
    df_new[fill_columns] = df_new[fill_columns].fillna(0)

    # for recommendation: one integer id per distinct user name
    df_new['userid'] = df_new.groupby(['users']).ngroup()
    

In [None]:
def load_to_food_recommendation(collection_name): # collection_name is string
    """Insert every row of the global ``df_new`` into a ``food_recommendation`` collection.

    Parameters
    ----------
    collection_name : str
        Name of the target collection inside the ``food_recommendation`` database.

    Side effects
    ------------
    * Resets the index of the module-level ``df_new`` in place, so the previous index
      becomes a regular column of that frame and of the inserted documents.
    * Inserts one document per row with ``insert_many``. Nothing is inserted when the
      frame has no rows.

    Returns ``None``.
    """
    global df_new

    # Kept in place on the shared frame, as before: the frame that is left behind
    # matches what was written to the database.
    df_new.reset_index(inplace=True)
    data_dict = df_new.to_dict("records")

    # insert_many() rejects an empty document list, so there is nothing to do.
    if not data_dict:
        return

    # `tlsCAFile` replaces the `ssl_ca_certs` keyword, which PyMongo 4 no longer accepts.
    client = pymongo.MongoClient('your mongodb uri', tlsCAFile=certifi.where())
    try:
        db = client.food_recommendation
        db[collection_name].insert_many(data_dict)
    finally:
        # Release the connection pool even if the insert fails.
        client.close()

In [None]:
# Names of the MongoDB collections to run through the pipeline, in processing order.
collection = [
    'diabetic',
    'gluten_free',
    'keto',
    'lowsodium',
    'low_cholesterol',
    'vegan',
    'vegetarian',
]

In [None]:
# Run the whole pipeline once per collection name: read the raw documents into
# `df_dicts`, preprocess them into `df_new`, insert `df_new` into the
# `food_recommendation` database, then print the name that was just processed.
# NOTE(review): keep the loop variable named `i` unless `preprocessing` is confirmed
# not to read a module-level `i`.
for i in collection:
    read_from_food_metadata(i)
    preprocessing(i)
    load_to_food_recommendation(i)
    print(i)

bu collection vegan
vegan
gluten,keto,vegan degil
vegetarian


<Figure size 3600x3600 with 0 Axes>

<Figure size 3600x3600 with 0 Axes>