In [44]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)

In [5]:
shakespeare_RDD = sc.textFile('pg100.txt')

In [6]:
import re

def remove_punctuation(text):
    return re.sub(r'[^a-zA-Z0-9_ ]', '', text.lower()).strip()

In [47]:
shakespeare_pair = shakespeare_RDD.filter(lambda l: l!="").zipWithIndex().map(lambda (l, num): (num, remove_punctuation(l)))
shakespeare_pair.take(10)
# shakespeare = sqlContext.createDataFrame(shakespeare_pair, ["label", "sentence"])
# shakespeare.take(10)

[(0,
  u'the project gutenberg ebook of the complete works of william shakespeare by'),
 (1, u'william shakespeare'),
 (2, u'this ebook is for the use of anyone anywhere at no cost and with'),
 (3, u'almost no restrictions whatsoever  you may copy it give it away or'),
 (4, u'reuse it under the terms of the project gutenberg license included'),
 (5, u'with this ebook or online at wwwgutenbergorg'),
 (6, u'this is a copyrighted project gutenberg ebook details below'),
 (7, u'please follow the copyright guidelines in this file'),
 (8, u'title the complete works of william shakespeare'),
 (9, u'author william shakespeare')]

## TF-IDF 

In [14]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [15]:
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(shakespeare)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
for features_label in rescaledData.select("features", "label").take(3):
  print(features_label)

Row(features=SparseVector(20, {1: 1.4962, 3: 1.8524, 5: 0.8358, 6: 1.4844, 9: 1.6455, 11: 1.1623, 14: 1.186, 15: 1.3522, 18: 1.5727, 19: 1.0818}), label=0)
Row(features=SparseVector(20, {11: 1.1623, 14: 1.186}), label=1)
Row(features=SparseVector(20, {1: 2.2443, 3: 2.7786, 7: 1.3045, 10: 2.066, 14: 2.372, 17: 0.9354, 18: 1.5727}), label=2)


## Word2Vec

In [59]:
from pyspark.ml.feature import Word2Vec

In [60]:
shakespeare_text = shakespeare_RDD.filter(lambda l: l!="").map(lambda l: (remove_punctuation(l).split(' '),))
shakespeare_text.take(3)

[([u'the',
   u'project',
   u'gutenberg',
   u'ebook',
   u'of',
   u'the',
   u'complete',
   u'works',
   u'of',
   u'william',
   u'shakespeare',
   u'by'],),
 ([u'william', u'shakespeare'],),
 ([u'this',
   u'ebook',
   u'is',
   u'for',
   u'the',
   u'use',
   u'of',
   u'anyone',
   u'anywhere',
   u'at',
   u'no',
   u'cost',
   u'and',
   u'with'],)]

In [62]:
sentenceDataFrame = sqlContext.createDataFrame(shakespeare_text, ["text"])
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(sentenceDataFrame)
result = model.transform(sentenceDataFrame)
for feature in result.select("result").take(3):
  print(feature)

Row(result=DenseVector([-0.2539, -0.8469, 1.5684]))
Row(result=DenseVector([-0.2704, -1.4223, 2.4582]))
Row(result=DenseVector([0.1871, -0.043, 0.1224]))


## Tokenizer

In [63]:
from pyspark.ml.feature import Tokenizer

In [64]:
sentenceDataFrame = sqlContext.createDataFrame(shakespeare_pair, ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsDataFrame = tokenizer.transform(sentenceDataFrame)
for words_label in wordsDataFrame.select("words", "label").take(3):
  print(words_label)

Row(words=[u'the', u'project', u'gutenberg', u'ebook', u'of', u'the', u'complete', u'works', u'of', u'william', u'shakespeare', u'by'], label=0)
Row(words=[u'william', u'shakespeare'], label=1)
Row(words=[u'this', u'ebook', u'is', u'for', u'the', u'use', u'of', u'anyone', u'anywhere', u'at', u'no', u'cost', u'and', u'with'], label=2)
