## Segment recipe text for fitting word2vec model, and save recipe_vector json file locally

In [1]:
import re
import json
import jieba
import time
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

## Define a function to run in cluster, which segment all chinese characters

In [2]:
spark.sparkContext.addFile('mydict_3.txt')
def wordToSeg(x):
    if not jieba.dt.initialized:
        jieba.load_userdict('mydict_3.txt')
    
    try:
        interstate = re.sub(r'\W', '', x)
    except:
        interstate = x
        pass
    try:
        secondstate = interstate.replace('\n','')
    except:
        secondstate = interstate
        pass
    try:
        thirdstate = secondstate.replace('\n\n','')
    except:
        thirdstate = secondstate
        pass
    try:
        finalstate = re.sub(r'[a-zA-Z0-9]', '', thirdstate)
    except:
        finalstate = thirdstate
        pass
    try:
        seg = jieba.cut(finalstate, cut_all = False)
    except:
        output = finalstate
        pass
    try:
        output = ' '.join(seg)
    except:
        output = ''
        pass
    return output
    

### Register the UDF

In [3]:
spark.udf.register("word2seg", wordToSeg, StringType())
word2Seg = udf(wordToSeg, StringType())

### Load in all recipes information from HDFS

In [4]:
recipe = spark.read.json("hdfs://master/user/spark/spark101/recipe_com/recipe_to_spark.json")

### Creating temp view for following SQL query

In [5]:
recipe.createOrReplaceTempView("recipes")

### SQL query with pre-defined function

In [6]:
# Apply segment function on columns of ingredient, steps, comment and category
recipes_seg = spark.sql('''select url, img_url, title, time, author, word2Seg(ingredient) ingredient, 
                        word2Seg(steps) steps, word2Seg(comment) comment,
                        word2Seg(category) category from recipes''')

In [7]:
recipes_seg.createOrReplaceTempView("recipes_seg")

In [8]:
recipes_wordbag = spark.sql('''SELECT concat(ingredient, steps, comment, category) as text from recipes_seg''')

### Define another function for spliting words in each row

In [9]:
def wordToList(x):
    return x.split(' ')

### Register UDF for spliting words to lists

In [10]:
spark.udf.register("word2list", wordToList, ArrayType(StringType()))
word2list = udf(wordToList, ArrayType(StringType()))

### Getting ready to split the big string into list, then fit into word2vec training model

In [11]:
recipes_wordbag.createOrReplaceTempView("recipes_wordlist")

In [12]:
for_word2vec = spark.sql('''SELECT word2list(text) text from recipes_wordlist''')

### Spark MLlib word2vec training

In [13]:
from pyspark.ml.feature import Word2Vec

In [14]:
word2Vec = Word2Vec(vectorSize=150, minCount=3, inputCol="text", outputCol="result")
model = word2Vec.fit(for_word2vec)


## Save the model

In [None]:
model.save("hdfs://master/user/spark/spark101/recipe_com/recipe_word2vec_model")

## Transform all recipes into vector by using the word2vec model

In [15]:
recipes_wordseg = spark.sql('''SELECT url, img_url, title, author, concat(ingredient, steps, comment, category) as text from recipes_seg''')

In [16]:
recipes_wordseg.createOrReplaceTempView("recipes_wordseg")

In [17]:
recipes_wordseglist = spark.sql('''SELECT url, img_url, title, author, word2list(text) as text from recipes_wordseg''')

In [18]:
recipe_vector = model.transform(recipes_wordseglist)

In [20]:
# Transfer into Pandas DataFrame for reviewing
recipe_vector.toPandas()

Unnamed: 0,url,img_url,title,author,text,result
0,https://icook.tw/recipes/350499,https://imageproxy.icook.network/resize?height...,【減脂便當】鮪魚薯泥&甜豆蝦仁,R.L. 料理研究室,"[馬鈴薯, 蒜頭, 小瓣, 牛奶, 水煮鮪魚罐頭, 罐, 鹽適量, 甜豆, 蝦仁, 洋蔥半顆...","[-0.05569262317360633, -0.08170467016117818, -..."
1,https://icook.tw/recipes/350438,https://imageproxy.icook.network/resize?height...,【紙包鮭魚料理】烤箱｜爐連烤｜鮭魚｜健身,玫籽の健身/寶寶料理,"[洋蔥半顆, 小, 番茄顆, 酸豆, 適量, 蒜頭, 把, 檸檬顆, 蘆筍, 把, 鮭魚, ...","[0.0576996104284032, 0.000891230241944566, 0.0..."
2,https://icook.tw/recipes/350180,https://imageproxy.icook.network/resize?height...,【台鹽料理教室】雞肉溫沙拉,台鹽鹽選生活家,"[羅, 曼, 生菜, 片, 紅甜椒, 顆, 黃甜椒, 顆, 洋蔥半顆, 去骨雞腿排, 片鮮菇...","[-0.028010145874540735, -0.031009690147895317,..."
3,https://icook.tw/recipes/349771,https://imageproxy.icook.network/resize?height...,【日式蕎麥麵】｜爐連烤｜蕎麥麵｜健身,玫籽の健身/寶寶料理,"[昆布, 醬油少許, 蕎麥麵, 把, 雞胸肉塊, 雞蛋顆, 海苔, 數片炎熱, 的, 夏天,...","[0.0670748020027712, 0.04386279430899838, -0.0..."
4,https://icook.tw/recipes/349710,https://imageproxy.icook.network/resize?height...,墨西哥酪梨雞肉捲餅(夏日輕食料理),挑嘴小食堂,"[雞里肌肉, 克, 番茄顆, 墨西哥餅皮, 張, 雞蛋顆, 酪梨顆, 蘿蔓生菜, 把, 黑胡...","[0.023386845547252167, -0.0028286017947651157,..."
...,...,...,...,...,...,...
73863,https://cookpad.com/tw/%E9%A3%9F%E8%AD%9C/1223...,https://img-global.cpcdn.com/recipes/b684db144...,千層豬肉鍋,Lucia,"[白菜, 片, 金針菇, 少許, 豬梅花肉, 片盒, 烹大師匙白菜, 洗淨, 一葉, 一頁,...","[-0.015230400239312047, -0.0001222899586740714..."
73864,https://cookpad.com/tw/%E9%A3%9F%E8%AD%9C/1280...,https://img-global.cpcdn.com/recipes/b59bc3293...,沙茶培根豬,Jennifer 李,"[食材, 培根, 豬肉片, 公克, 半, 把, 空心菜, 瓣, 蒜頭, 根, 辣椒, 調味料...","[-0.0958413707602449, -0.10275696497410536, -0..."
73865,https://cookpad.com/tw/%E9%A3%9F%E8%AD%9C/1279...,https://img-global.cpcdn.com/recipes/2c83d0601...,醬燒蓮藕豬,Kit.CHEN 小廚娘 Chloe,"[豬肉, 盒切, 塊, 蓮藕, 棵, 切片, 台灣, 玉米筍, 紅蘿蔔, 熟, 毛豆豬肉, ...","[-0.11176704472460437, -0.053678994413976576, ..."
73866,https://cookpad.com/tw/%E9%A3%9F%E8%AD%9C/1272...,https://img-global.cpcdn.com/recipes/db2040b58...,豬肉腸煎蛋,麗麗,"[好市, 多, 豬肉, 腸條切, 丁, 蔥花, 根, 蛋顆, 鹽適量, 白胡椒粉適量所有, ...","[-0.06235234325994616, -0.06607790712429129, -..."


### Save recipe_vector to json on Hadoop hdfs

In [None]:
recipe_vector.write.json("hdfs://master/user/spark/spark101/recipe_com/recipe_vector.json")

### Or we can save recipe_vector json text file to local file system

In [19]:
recipe_vector.coalesce(1).write.format('json').save("/home/spark/Desktop/recipe_com/recipe_vector.json")

* Use the following command to download file from hdfs locally

hadoop fs -get /home/spark/Desktop/recipe_com/recipe_vector.json ~/Desktop/recipe_com/recipe_vector.json