This Jupyter notebook processes the data from the Google survey into a dataset that the machine learning model (in RBM.ipynb) can use.
Food and drink names are converted to alphanumeric form.

In [None]:
import pandas as pd
import re
import math

In [None]:
#path of the excel file that holds the raw survey answers
file_dir = 'survey data final.xlsx'

In [None]:
#load the raw survey answers from the excel file into a dataframe
data = pd.read_excel(file_dir)
#preview the first rows to check that the file was read
data.head()

In [None]:
#lookup tables for cases that are easier to handle by exact match than by regex
#raw answer -> canonical name; 'soto' must map to the same canonical name ("berkuah")
#that the regex cell gives to the soto/sop option, otherwise it becomes a separate category
specialcasefood={'soto':'berkuah'}
specialcasedrink={}
#answers that are ignored entirely (junk or unusable entries)
stupidcase=['etc','random','','wedang','cincau']

In [None]:
#a respondent must have picked at least this many distinct foods / drinks;
#answers with fewer picks are left out of the dataset
food_threshold, drink_threshold = 3, 2

Data Processing for food

In [None]:
#get the food answer column from the dataframe, lowercased and with surrounding spaces stripped
#blank or non-text answers come out of the .str accessor as NaN; turn them into '' so the
#split-based loops in the later cells do not fail on a float. An empty answer splits into a
#single '' choice, so it is dropped by the threshold check (with the thresholds set above).
foodarr=(
    data['Kalau jajan di pedagang kecil, Kamu sukanya jajanan makanan apa? (Bisa isi lebih dari satu)']
    .str.lower()
    .str.strip(' ')
    .fillna('')
)

In [None]:
#regex clean-up to make the food answers more uniform
#the (pattern, replacement) pairs are applied strictly in this order; several later rules
#rewrite the output of earlier ones (e.g. kebab -> luar negeri, lotek -> bakso/batagor/siomay -> bakso)
#NOTE(review): "ci[a-z]*" is unanchored, so it also matches "ci" inside longer words - confirm this is intended
food_rules=[
    (r"ci[a-z]*","gorengan kering"),
    (r"(cakwe|basreng)","gorengan kering"),
    (r"sate[a-z ]*","sate"),
    (r"kebab[a-z ]*","kebab"),
    (r"soto$","soto/sop/makanan berkuah lainnya"),
    (r"bala bala","gorengan kering"),
    (r"lotek","bakso/batagor/siomay"),
    (r"(burger|dimsum|kebab|macaroni|sosis|takoyaki)","luar negeri"),
    (r"(lumpia basah|seblak)","gorengan basah"),
    (r"ayam bakar/goreng/fried chicken","ayam"),
    (r"bakso/batagor/siomay","bakso"),
    (r"buah potong/rujak","buah"),
    (r"gorengan lain","gorengan kering"),
    (r"(kue-kue|martabak)","kue"),
    (r"mie-miean","mie"),
    (r"nasi-nasian","nasi"),
    (r"roti bakar/kukus","roti"),
    (r"asinan","buah"),
    (r"soto/sop/makanan berkuah lainnya","berkuah"),
]
for food_pattern,food_replacement in food_rules:
    foodarr=foodarr.replace(food_pattern,food_replacement,regex=True)

In [None]:
#count how many respondents picked each food in the survey
#keys are the canonical food names with spaces removed, values are the number of picks
fooddict={}
for answer in foodarr:
    #use dict keys to drop duplicate choices while keeping their order
    choices=dict.fromkeys(answer.split(', '))
    #skip respondents who picked fewer foods than the threshold
    if len(choices) < food_threshold:
        continue
    #map every remaining choice to its stored key (special-case lookup, then remove spaces);
    #dict keys again so two raw choices that end up as the same key count once per respondent
    keys=dict.fromkeys(
        specialcasefood.get(choice,choice).replace(' ',"")
        for choice in choices
        if choice not in stupidcase
    )
    for key in keys:
        #look up the same key that is stored; checking the raw name here would reset
        #the count to 1 for every name that contains a space
        fooddict[key]=fooddict.get(key,0)+1

In [None]:
#show the food counts ordered from least to most picked
{name: picks for name, picks in sorted(fooddict.items(), key=lambda pair: pair[1])}

In [None]:
#use pandas to turn the counted food names into a categorical index
food_cat=pd.CategoricalIndex(fooddict)
#build one hot encoded row per respondent: 1 if the food was picked, 0 otherwise
#respondents below the threshold get an empty list so the row positions stay aligned
hotencoded_foodarr=[]
for answer in foodarr:
    choices=dict.fromkeys(answer.split(', '))
    if len(choices) < food_threshold:
        hotencoded_foodarr.append([])
        continue
    #normalise the choices the same way the counting cell builds its keys
    #(special-case lookup, then remove spaces); comparing the raw choices against
    #food_cat would never match names that contain a space
    picked={
        specialcasefood.get(choice,choice).replace(' ',"")
        for choice in choices
        if choice not in stupidcase
    }
    hotencoded_foodarr.append([1 if category in picked else 0 for category in food_cat])

Data Processing for drink

In [None]:
#get the drink answer column from the dataframe, lowercased and with surrounding spaces stripped
#blank or non-text answers come out of the .str accessor as NaN; turn them into '' so the
#split-based loops in the later cells do not fail on a float. An empty answer splits into a
#single '' choice, so it is dropped by the threshold check (with the thresholds set above).
drinkarr=(
    data['Kalau jajan di pedagang kecil, Kamu sukanya minuman apa? (Bisa isi lebih dari satu)']
    .str.lower()
    .str.strip(' ')
    .fillna('')
)

In [None]:
#regex clean-up to make the drink answers more uniform
#the (pattern, replacement) pairs are applied strictly in this order; the last rule
#shortens the "soda/soft drink" name produced by the survey option and by an earlier rule
drink_rules=[
    (r"wedang[a-z ]*","wedang"),
    (r"(air[a-z ]*|mineral[a-z ]*|aqua[a-z ]*)","air mineral"),
    (r"([a-z][a-z ]*latte[a-z ]*|capucino[a-z ]*)","kopi"),
    (r"[a-z][a-z ]*jeruk","jus buah"),
    (r"teh[a-z ]*","teh"),
    (r"(pop ice|minuman saset)","soda/soft drink"),
    (r"es boba","bubble tea"),
    (r"susu[a-z ]*","susu"),
    (r"(es degan[a-z ]*|es kelapa[a-z ]*)","air kelapa"),
    (r"random",""),
    #remove the junk answer "hi" only when it is a whole word; without the word
    #boundaries the two letters are also cut out of any longer word that contains them
    (r"\bhi\b",""),
    (r"soda/soft drink","soft drink"),
]
for drink_pattern,drink_replacement in drink_rules:
    drinkarr=drinkarr.replace(drink_pattern,drink_replacement,regex=True)

In [None]:
#count how many respondents picked each drink in the survey
#keys are the canonical drink names with spaces removed, values are the number of picks
drinkdict={}
for answer in drinkarr:
    #use dict keys to drop duplicate choices while keeping their order
    choices=dict.fromkeys(answer.split(', '))
    #skip respondents who picked fewer drinks than the threshold
    if len(choices) < drink_threshold:
        continue
    #map every remaining choice to its stored key (special-case lookup, then remove spaces);
    #dict keys again so two raw choices that end up as the same key count once per respondent
    keys=dict.fromkeys(
        specialcasedrink.get(choice,choice).replace(' ',"")
        for choice in choices
        if choice not in stupidcase
    )
    for key in keys:
        #look up the same key that is stored; checking the raw name here would reset
        #the count to 1 for every name that contains a space
        drinkdict[key]=drinkdict.get(key,0)+1

In [None]:
#show the drink counts ordered from least to most picked
{name: picks for name, picks in sorted(drinkdict.items(), key=lambda pair: pair[1])}

In [None]:
#use pandas to turn the counted drink names into a categorical index
drink_cat = pd.CategoricalIndex(data=drinkdict)

In [None]:
#build one hot encoded row per respondent: 1 if the drink was picked, 0 otherwise
#respondents below the threshold get an empty list so the row positions stay aligned
hotencoded_drinkarr=[]
for answer in drinkarr:
    choices=dict.fromkeys(answer.split(', '))
    if len(choices) < drink_threshold:
        hotencoded_drinkarr.append([])
        continue
    #normalise the choices the same way the counting cell builds its keys
    #(special-case lookup, then remove spaces); comparing the raw choices against
    #drink_cat would never match names that contain a space
    picked={
        specialcasedrink.get(choice,choice).replace(' ',"")
        for choice in choices
        if choice not in stupidcase
    }
    hotencoded_drinkarr.append([1 if category in picked else 0 for category in drink_cat])

Final data processing

In [None]:
#Turn the hot encoded food and drink rows into ratings: a 0 becomes rating 1, a 1 becomes rating 5.
#Respondents whose food or drink row is empty (below threshold) are left out.
#counter tracks the position of the respondent in the survey and is used as the user id.
label=[]
rating=[]
userarr=[]

counter=0
for food_row in hotencoded_foodarr:
    drink_row=hotencoded_drinkarr[counter]
    if food_row and drink_row:
        #food ratings first, then drink ratings, each in category order
        for categories,encoded_row in ((food_cat,food_row),(drink_cat,drink_row)):
            for position,picked in enumerate(encoded_row):
                label.append(categories[position])
                rating.append(5 if picked else 1)
        userarr.append(counter)
    #count every respondent, whether kept or skipped
    counter+=1

In [None]:
#generate the user id column: every kept respondent contributes total_cat consecutive
#entries to label, so entry number n belongs to kept respondent n // total_cat
total_cat=len(food_cat)+len(drink_cat)
userid=[userarr[entry//total_cat] for entry in range(len(label))]

In [None]:
#Assemble the three columns into one dataframe and export it as csv
column_names={0:'userid',1:'foodanddrinkname',2:'rating'}
#the lists are passed as rows, so transpose to get one column per list
output=pd.DataFrame([userid,label,rating]).transpose().rename(columns=column_names)
output.to_csv('RetrievalV5.csv',index=False)

The part below was added to send the list of possible foods and drinks to Firebase.

In [None]:
import firebase_admin as fb
from firebase_admin import firestore

In [None]:
#load the service account key and connect to Firebase
cred = fb.credentials.Certificate('credential.json')
#initialize_app raises ValueError if the default app already exists, which happens
#whenever this cell is run a second time in the same kernel; reuse the app in that case
try:
    fb.get_app()
except ValueError:
    fb.initialize_app(cred, {'databaseURL': 'https://keenam-cap0428-default-rtdb.asia-southeast1.firebasedatabase.app'})
#Firestore client and a write batch created from it
db = firestore.client()
batch = db.batch()

In [None]:
#store the known food and drink names in the DefaultVal/FoodDrink document
Fooddoc = db.collection('DefaultVal').document('FoodDrink')
default_values = {
    'Food': [*fooddict.keys()],
    'Drink': [*drinkdict.keys()],
}
Fooddoc.set(default_values)
    