In [1]:
import os
import pandas as pd
import numpy as np
import pyspark as ps
sc = ps.SparkContext('local[4]')
sqlContext = ps.SQLContext(sc)

In [2]:
ACCESS_KEY = os.environ['AWS_ACCESS_KEY']
SECRET_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")

In [3]:
DSI_CSTON_BUCKET = "galvanize.dsi.capstone.alex"
OUT_BUCKET = "s3a://{}:{}@{}".format(ACCESS_KEY, SECRET_KEY, DSI_CSTON_BUCKET)

In [4]:
s3file = 's3a://mybucket.alex.seong/data/cancer_rates1.csv'

In [5]:
rdd1 = sc.textFile(s3file)

In [6]:
def casting_function((no, cancer_rate)):
  return (int(no), float(cancer_rate))

In [7]:
rdd2 = rdd1.map(lambda rowstr: rowstr.split(','))\
           .filter(lambda row: not row[0].startswith('0'))\
           .map(casting_function)

In [8]:
rdd1.first()

u'0,0.00224719101124'

In [9]:
wet_path_file_name = 'wet_2016_list/2016-Dec-wet.path'

In [10]:
dec_2016_wet_list = sc.textFile("s3a://%s/%s" % (DSI_CSTON_BUCKET, wet_path_file_name))

In [11]:
first_file = dec_2016_wet_list.first()

In [12]:
import warc
import gzip
import boto
from boto.s3.key import Key
from gzipstream import GzipStreamFile
from mrjob.job import MRJob

In [13]:
#conn = boto.connect_s3(anon=True)
conn = boto.connect_s3(ACCESS_KEY, SECRET_KEY, host='s3.amazonaws.com')
pds = conn.get_bucket('commoncrawl')
k = Key(pds, first_file)
f = warc.WARCFile(fileobj=GzipStreamFile(k))

In [14]:
recfilepath = "s3a://%s/recfiles/%s.%s" % (DSI_CSTON_BUCKET, first_file[:-3], 'rec')
urlfilepath = "s3a://%s/recfiles/%s.%s" % (DSI_CSTON_BUCKET, first_file[:-3], 'url')

data = []
url = []

for i, document in enumerate(f):
  if document['Content-Type'] != 'text/plain':
    continue
  dat = document.payload.read()
  data.append(str(dat))
  url.append(str(document.url)) 

In [15]:
rec_df = pd.DataFrame(
    {'contents': data,
     'url': url
    })

In [70]:
rec_rdd_df = sqlContext.createDataFrame(rec_df)

In [71]:
rec_rdd_df.printSchema()

root
 |-- contents: string (nullable = true)
 |-- url: string (nullable = true)



In [72]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *

def get_site(url):
   if len(url) <= 3: return ''
   site = url.split('/')[2]
   return site

udf_url_to_website = udf(get_site, StringType())

In [73]:
rec_rdd_df = rec_rdd_df.withColumn("website", udf_url_to_website("url"))

In [68]:
#rec_rdd_df = rec_rdd_df.rdd.zipWithIndex().toDF()
# filter(lambda (key,index) : key == [1,2]).\
# map(lambda (key,index) : index).collect()

In [69]:
rec_rdd_df.show()

+--------------------+---+
|                  _1| _2|
+--------------------+---+
|[çç
çç	ç...|  0|
|[æ¿æ©ç´ é£â§ç...|  1|
|[ÐÐ¾Ð¿ÑÐ¾ÑÑ Ð...|  2|
|[Ð Ð²Ð¸Ð±ÑÐ°ÑÐ...|  3|
|[ÐÐµÑÐµÐ»Ð¾Ð¼ Ð...|  4|
|[ÐÐ¾ÑÐµÐ¼Ñ Ð½Ð...|  5|
|[ÐÐ¾ÑÐµÐ¼Ñ Ð½Ð...|  6|
|[Ð ÐµÐ±ÐµÐ½Ð¾Ðº Ñ...|  7|
|[Ð¨ÑÐ¼ Ð² Ð¿ÑÐ°...|  8|
|[Ð§Ð¸ÑÑÑÐ¹ ÐºÐ...|  9|
|[Ð£ÐºÑÐ¾Ð¿ÑÑÐ¾...| 10|
|[ÐÐ° Ð¾Ð»ÑÐ³Ð¸Ð...| 11|
|[Orcead, Ð¾Ð¾Ð¾ Ð...| 12|
|[X-trim, Ð¼Ð°Ð³Ð°...| 13|
|[Ð¡Ð²Ð°Ð´ÑÐ±Ð° Ð...| 14|
|[Nude woman with ...| 15|
|[Belladona Font S...| 16|
|[New Gems Layer S...| 17|
|[ÐÐ°Ð»ÐµÐ½Ð´Ð°Ñ...| 18|
|[Image - Dale 2.P...| 19|
+--------------------+---+
only showing top 20 rows



## Feature Extraction

In [74]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [23]:
tokenizer = Tokenizer(inputCol="contents", outputCol="words")

In [29]:
rtokenizer = RegexTokenizer(inputCol="contents", outputCol="tokens")

In [86]:
rec_rdd_df.printSchema()

root
 |-- contents: string (nullable = true)
 |-- url: string (nullable = true)
 |-- website: string (nullable = true)



In [30]:
words_rdd = tokenizer.transform(rec_rdd_df)

In [31]:
tokens_rdd  = rtokenizer.transform(rec_rdd_df)

In [32]:
tokens_rdd.show()

+--------------------+--------------------+--------------------+--------------------+
|            contents|                 url|             website|              tokens|
+--------------------+--------------------+--------------------+--------------------+
|çç
çç	ç...|http://007zhenren...|007zhenrenyuleche...|[çç, çç, ...|
|æ¿æ©ç´ é£â§ç¶...|http://0289646723...|0289646723.tranew...|[æ¿æ©ç´ é£â§ç...|
|ÐÐ¾Ð¿ÑÐ¾ÑÑ Ð¿...|http://03online.c...|        03online.com|[ðð¾ð¿ñð¾ññ, ...|
|Ð Ð²Ð¸Ð±ÑÐ°ÑÐ¸...|http://03online.c...|        03online.com|[ð, ð²ð¸ð±ñð°ñ...|
|ÐÐµÑÐµÐ»Ð¾Ð¼ Ð³...|http://03online.c...|        03online.com|[ððµñðµð»ð¾ð¼, ...|
|ÐÐ¾ÑÐµÐ¼Ñ Ð½Ðµ...|http://03online.c...|        03online.com|[ðð¾ñðµð¼ñ, ð½...|
|ÐÐ¾ÑÐµÐ¼Ñ Ð½Ðµ...|http://03online.c...|        03online.com|[ðð¾ñðµð¼ñ, ð½...|
|Ð ÐµÐ±ÐµÐ½Ð¾Ðº Ñ...|http://03online.c...|        03online.com|[ð ðµð±ðµð½ð¾ðº, ...|
|Ð¨ÑÐ¼ Ð² Ð¿ÑÐ°Ð...|http://03online.c...|        03o

In [33]:
hashing_tf = HashingTF(inputCol="words", outputCol="rawfeatures")#, numFeatures=30)

In [34]:
featurized_data = hashing_tf.transform(words_rdd)

In [35]:
idf = IDF(inputCol="rawfeatures", outputCol="features")

In [36]:
idf_model = idf.fit(featurized_data)

In [37]:
rescaled_data = idf_model.transform(featurized_data)

In [38]:
rescaled_data.select("features", "website").take(10)

[Row(features=SparseVector(262144, {1837: 7.3191, 7706: 7.5354, 10625: 6.8083, 14623: 7.5354, 15889: 1.5426, 23560: 5.3812, 33953: 8.0123, 44473: 7.4062, 51598: 6.9708, 52296: 7.0525, 54961: 1.8405, 61285: 6.9906, 65569: 7.2917, 81948: 3.7606, 84509: 7.4369, 89219: 6.5332, 98778: 4.6185, 107290: 7.4062, 113458: 0.6991, 118631: 7.1891, 119044: 7.8581, 119708: 6.5332, 133143: 0.6583, 139477: 7.1185, 143550: 7.8581, 153116: 8.0695, 153727: 4.2879, 158186: 6.5719, 158425: 2.0742, 167122: 0.9608, 167222: 5.9783, 175912: 5.342, 176235: 38.6231, 179505: 6.9906, 188105: 6.5206, 197123: 7.6838, 197274: 6.546, 199407: 8.1946, 204762: 7.4369, 210566: 5.6662, 214294: 7.6838, 216159: 7.3191, 222453: 0.4722, 225541: 7.8116, 227334: 8.0695, 232934: 7.5354, 255885: 3.4863, 259831: 7.2391}), website=u'007zhenrenyulecheng.bcwebwang2.com'),
 Row(features=SparseVector(262144, {10267: 7.5015, 17584: 7.0108, 75196: 7.7246, 82106: 7.074, 89110: 7.6446, 100602: 7.0108, 113458: 0.6991, 122969: 1.3127, 229557: 

In [39]:
rescaled_data.select("website", "url").take(10)

[Row(website=u'007zhenrenyulecheng.bcwebwang2.com', url=u'http://007zhenrenyulecheng.bcwebwang2.com/nvmbr3/qiupan.html'),
 Row(website=u'0289646723.tranews.com', url=u'http://0289646723.tranews.com/'),
 Row(website=u'03online.com', url=u'http://03online.com/news/9453'),
 Row(website=u'03online.com', url=u'http://03online.com/news/o_vibratsii/2016-5-15-164829'),
 Row(website=u'03online.com', url=u'http://03online.com/news/perelom_goleni/2015-1-4-55342'),
 Row(website=u'03online.com', url=u'http://03online.com/news/pochemu_ne_proishodit_beremennost_kak_produvayut_truby/2016-10-18-208719'),
 Row(website=u'03online.com', url=u'http://03online.com/news/pochemu_nemeyut_konechnosti/2013-1-30-4013'),
 Row(website=u'03online.com', url=u'http://03online.com/news/rebenok_sel_kusochek_torta_s_alkogolnoy_propitkoy/2015-11-8-116889'),
 Row(website=u'03online.com', url=u'http://03online.com/news/shum_v_pravom_uhe/2014-11-25-48516'),
 Row(website=u'08.od.ua', url=u'http://08.od.ua/bytovye_uslugi/himch

In [40]:
rescaled_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|            contents|                 url|             website|               words|         rawfeatures|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|çç
çç	ç...|http://007zhenren...|007zhenrenyuleche...|[çç, çç, ...|(262144,[1837,770...|(262144,[1837,770...|
|æ¿æ©ç´ é£â§ç¶...|http://0289646723...|0289646723.tranew...|[æ¿æ©ç´ é£â§ç...|(262144,[10267,17...|(262144,[10267,17...|
|ÐÐ¾Ð¿ÑÐ¾ÑÑ Ð¿...|http://03online.c...|        03online.com|[ðð¾ð¿ñð¾ññ, ...|(262144,[1438,203...|(262144,[1438,203...|
|Ð Ð²Ð¸Ð±ÑÐ°ÑÐ¸...|http://03online.c...|        03online.com|[ð, ð²ð¸ð±ñð°ñ...|(262144,[310,1325...|(262144,[310,1325...|
|ÐÐµÑÐµÐ»Ð¾Ð¼ Ð³...|http://03online.c...|        03online.com|[ððµñðµð»ð¾ð¼, ...|(262144,[310,1325..

## LDA on Rec_Data

In [141]:
from __future__ import print_function
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

In [142]:
features_rdd = rescaled_data.select("features").rdd
data_rdd = sc.parallelize(data)

TypeError: 'RDD' object is not iterable

In [143]:
data = sc.textFile("s3a://galvanize.dsi.capstone.alex/lda_data/sample_lda_data.txt")

In [144]:
data.collect()

[u'1 2 6 0 2 3 1 1 0 0 3',
 u'1 3 0 1 3 0 0 2 0 0 1',
 u'1 4 1 0 0 4 9 0 1 2 0',
 u'2 1 0 3 0 0 5 0 2 3 9',
 u'3 1 1 9 3 0 2 0 0 1 3',
 u'4 2 0 3 4 5 1 1 1 4 0',
 u'2 1 0 3 0 0 5 0 2 2 9',
 u'1 1 1 9 2 1 2 0 0 1 3',
 u'4 4 0 3 4 2 1 3 0 0 0',
 u'2 8 2 0 3 0 2 0 2 7 2',
 u'1 1 1 9 0 2 2 0 0 3 3',
 u'4 1 0 0 4 5 1 3 0 1 0']

In [145]:
parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
#parsed_data = features_rdd.map(lambda line: Vectors.sparse(line.asDict()['features']).)

In [146]:
parsedData.collect()

[DenseVector([1.0, 2.0, 6.0, 0.0, 2.0, 3.0, 1.0, 1.0, 0.0, 0.0, 3.0]),
 DenseVector([1.0, 3.0, 0.0, 1.0, 3.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0]),
 DenseVector([1.0, 4.0, 1.0, 0.0, 0.0, 4.0, 9.0, 0.0, 1.0, 2.0, 0.0]),
 DenseVector([2.0, 1.0, 0.0, 3.0, 0.0, 0.0, 5.0, 0.0, 2.0, 3.0, 9.0]),
 DenseVector([3.0, 1.0, 1.0, 9.0, 3.0, 0.0, 2.0, 0.0, 0.0, 1.0, 3.0]),
 DenseVector([4.0, 2.0, 0.0, 3.0, 4.0, 5.0, 1.0, 1.0, 1.0, 4.0, 0.0]),
 DenseVector([2.0, 1.0, 0.0, 3.0, 0.0, 0.0, 5.0, 0.0, 2.0, 2.0, 9.0]),
 DenseVector([1.0, 1.0, 1.0, 9.0, 2.0, 1.0, 2.0, 0.0, 0.0, 1.0, 3.0]),
 DenseVector([4.0, 4.0, 0.0, 3.0, 4.0, 2.0, 1.0, 3.0, 0.0, 0.0, 0.0]),
 DenseVector([2.0, 8.0, 2.0, 0.0, 3.0, 0.0, 2.0, 0.0, 2.0, 7.0, 2.0]),
 DenseVector([1.0, 1.0, 1.0, 9.0, 0.0, 2.0, 2.0, 0.0, 0.0, 3.0, 3.0]),
 DenseVector([4.0, 1.0, 0.0, 0.0, 4.0, 5.0, 1.0, 3.0, 0.0, 1.0, 0.0])]

In [147]:
corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

In [148]:
corpus.take(5)

[[0, DenseVector([1.0, 2.0, 6.0, 0.0, 2.0, 3.0, 1.0, 1.0, 0.0, 0.0, 3.0])],
 [1, DenseVector([1.0, 3.0, 0.0, 1.0, 3.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0])],
 [2, DenseVector([1.0, 4.0, 1.0, 0.0, 0.0, 4.0, 9.0, 0.0, 1.0, 2.0, 0.0])],
 [3, DenseVector([2.0, 1.0, 0.0, 3.0, 0.0, 0.0, 5.0, 0.0, 2.0, 3.0, 9.0])],
 [4, DenseVector([3.0, 1.0, 1.0, 9.0, 3.0, 0.0, 2.0, 0.0, 0.0, 1.0, 3.0])]]

In [149]:
ldaModel = LDA.train(corpus, k=3)

In [150]:
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize())
          + " words):")

Learned topics (as distributions over vocab of 11 words):


In [151]:
topics = ldaModel.topicsMatrix()

In [152]:
#for topic in range(3):
#    print("Topic " )
topics

array([[  5.84849479,   8.45065465,  11.70085056],
       [  4.66848411,   9.97011102,  14.36140487],
       [  1.65321216,   3.56300958,   6.78377827],
       [ 14.71136737,  18.26365142,   7.02498121],
       [  2.22534537,   6.55193724,  16.22271739],
       [  2.95750587,  10.66059222,   8.38190191],
       [ 20.87603536,   7.30042069,   2.82354394],
       [  0.65449116,   1.79452399,   7.55098485],
       [  4.54202639,   2.08175421,   1.37621939],
       [  8.65444989,   7.46097881,   7.8845713 ],
       [ 21.8192584 ,   8.3726073 ,   2.8081343 ]])

In [54]:
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from gensim import matutils
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import nltk
from nltk import tokenize
from nltk.corpus import stopwords
import pandas as pd
from gensim import corpora, models, similarities

In [108]:
# dictionary = corpora.Dictionary(parsed)
# corpus = [dictionary.doc2bow(text) for text in parsed]
# tfidf = models.TfidfModel(corpus)
# corpus_tfidf = tfidf[corpus]

# %time lda=LdaModel(corpus_tfidf, id2word=dictionary, num_topics=15, update_every=0, passes=200)


## Spark LDA

In [94]:
from pyspark.ml.feature import CountVectorizer, RegexTokenizer, StopWordsRemover
#from pyspark.ml.clustering import LDA
from pyspark.mllib.clustering import LDA
from pyspark.mllib.linalg import Vector
from pyspark.sql.functions import monotonically_increasing_id

In [76]:
num_topics = 100
max_iterations = 100
vocab_size = 10000

In [None]:
from pyspark.sql.functions import lit

In [77]:
doc_idx = np.arange(rec_rdd_df.count())

In [95]:
rec_rdd_df.select(monotonically_increasing_id().alias("rowId"),"*")

DataFrame[rowId: bigint, contents: string, url: string, website: string]

In [101]:
rec_rdd_df = rec_rdd_df.withColumn("doc_id", lit(monotonically_increasing_id().alias("rowId")))

In [102]:
rec_rdd_df.show()

+--------------------+--------------------+--------------------+------+
|            contents|                 url|             website|doc_id|
+--------------------+--------------------+--------------------+------+
|çç
çç	ç...|http://007zhenren...|007zhenrenyuleche...|     0|
|æ¿æ©ç´ é£â§ç¶...|http://0289646723...|0289646723.tranew...|     1|
|ÐÐ¾Ð¿ÑÐ¾ÑÑ Ð¿...|http://03online.c...|        03online.com|     2|
|Ð Ð²Ð¸Ð±ÑÐ°ÑÐ¸...|http://03online.c...|        03online.com|     3|
|ÐÐµÑÐµÐ»Ð¾Ð¼ Ð³...|http://03online.c...|        03online.com|     4|
|ÐÐ¾ÑÐµÐ¼Ñ Ð½Ðµ...|http://03online.c...|        03online.com|     5|
|ÐÐ¾ÑÐµÐ¼Ñ Ð½Ðµ...|http://03online.c...|        03online.com|     6|
|Ð ÐµÐ±ÐµÐ½Ð¾Ðº Ñ...|http://03online.c...|        03online.com|     7|
|Ð¨ÑÐ¼ Ð² Ð¿ÑÐ°Ð...|http://03online.c...|        03online.com|     8|
|Ð§Ð¸ÑÑÑÐ¹ ÐºÐ»...|http://08.od.ua/b...|            08.od.ua|     9|
|Ð£ÐºÑÐ¾Ð¿ÑÑÐ¾Ñ...|http://08.od.ua/k...|            08.od.ua|

In [105]:
tokenizer = RegexTokenizer(inputCol="contents", outputCol="tokens")
tokens_df = tokenizer.transform(rec_rdd_df)
#vocab_rdd.take(10).flatMap(lambda x: x).distinct().collect()

In [107]:
tokens_df.show()

+--------------------+--------------------+--------------------+------+--------------------+
|            contents|                 url|             website|doc_id|              tokens|
+--------------------+--------------------+--------------------+------+--------------------+
|çç
çç	ç...|http://007zhenren...|007zhenrenyuleche...|     0|[çç, çç, ...|
|æ¿æ©ç´ é£â§ç¶...|http://0289646723...|0289646723.tranew...|     1|[æ¿æ©ç´ é£â§ç...|
|ÐÐ¾Ð¿ÑÐ¾ÑÑ Ð¿...|http://03online.c...|        03online.com|     2|[ðð¾ð¿ñð¾ññ, ...|
|Ð Ð²Ð¸Ð±ÑÐ°ÑÐ¸...|http://03online.c...|        03online.com|     3|[ð, ð²ð¸ð±ñð°ñ...|
|ÐÐµÑÐµÐ»Ð¾Ð¼ Ð³...|http://03online.c...|        03online.com|     4|[ððµñðµð»ð¾ð¼, ...|
|ÐÐ¾ÑÐµÐ¼Ñ Ð½Ðµ...|http://03online.c...|        03online.com|     5|[ðð¾ñðµð¼ñ, ð½...|
|ÐÐ¾ÑÐµÐ¼Ñ Ð½Ðµ...|http://03online.c...|        03online.com|     6|[ðð¾ñðµð¼ñ, ð½...|
|Ð ÐµÐ±ÐµÐ½Ð¾Ðº Ñ...|http://03online.c...|        03online.com|     7

## CountVectorizing

In [109]:
cv_model = CountVectorizer(inputCol="tokens", outputCol="cvectors")

In [111]:
cv_df = cv_model.fit(tokens_df) ## takes long time

In [119]:
mbf = 2.0 / max_iterations + 1.0 / tokens_df.count() #MiniBatchFraction

In [124]:
cvectors = cv_df.transform(tokens_df)

In [126]:
cvectors.printSchema()

root
 |-- contents: string (nullable = true)
 |-- url: string (nullable = true)
 |-- website: string (nullable = true)
 |-- doc_id: long (nullable = false)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- cvectors: vector (nullable = true)



In [130]:
cvectors.show()

+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|            contents|                 url|             website|doc_id|              tokens|            cvectors|
+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|çç
çç	ç...|http://007zhenren...|007zhenrenyuleche...|     0|[çç, çç, ...|(262144,[6,7,10,1...|
|æ¿æ©ç´ é£â§ç¶...|http://0289646723...|0289646723.tranew...|     1|[æ¿æ©ç´ é£â§ç...|(262144,[7,206,13...|
|ÐÐ¾Ð¿ÑÐ¾ÑÑ Ð¿...|http://03online.c...|        03online.com|     2|[ðð¾ð¿ñð¾ññ, ...|(262144,[3,22,24,...|
|Ð Ð²Ð¸Ð±ÑÐ°ÑÐ¸...|http://03online.c...|        03online.com|     3|[ð, ð²ð¸ð±ñð°ñ...|(262144,[3,22,24,...|
|ÐÐµÑÐµÐ»Ð¾Ð¼ Ð³...|http://03online.c...|        03online.com|     4|[ððµñðµð»ð¾ð¼, ...|(262144,[3,24,33,...|
|ÐÐ¾ÑÐµÐ¼Ñ Ð½Ðµ...|http://03online.c...|        03online.com|     5|[ðð¾ñðµð¼ñ, ð½

In [138]:
corpus = cvectors.select("cvectors").rdd.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

In [139]:
corpus.first()

[0,
 Row(cvectors=SparseVector(262144, {6: 1.0, 7: 1.0, 10: 2.0, 15: 1.0, 28: 1.0, 237: 1.0, 250: 1.0, 475: 2.0, 2515: 1.0, 3006: 1.0, 10165: 1.0, 20925: 1.0, 21349: 1.0, 60623: 1.0, 98332: 1.0}))]

In [140]:
lda_model = LDA.train(corpus, k = 10)

Py4JJavaError: An error occurred while calling o720.trainLDAModel.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 853.0 failed 1 times, most recent failure: Lost task 0.0 in stage 853.0 (TID 286, localhost, executor driver): net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for pyspark.sql.types._create_row)
	at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
	at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:707)
	at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:175)
	at net.razorvine.pickle.Unpickler.load(Unpickler.java:99)
	at net.razorvine.pickle.Unpickler.loads(Unpickler.java:112)
	at org.apache.spark.mllib.api.python.SerDeBase$$anonfun$pythonToJava$1$$anonfun$apply$2.apply(PythonMLLibAPI.scala:1349)
	at org.apache.spark.mllib.api.python.SerDeBase$$anonfun$pythonToJava$1$$anonfun$apply$2.apply(PythonMLLibAPI.scala:1348)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:389)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
	at org.apache.spark.rdd.RDD$$anonfun$take$1$$anonfun$29.apply(RDD.scala:1353)
	at org.apache.spark.rdd.RDD$$anonfun$take$1$$anonfun$29.apply(RDD.scala:1353)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
	at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1353)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.take(RDD.scala:1326)
	at org.apache.spark.mllib.clustering.EMLDAOptimizer.initialize(LDAOptimizer.scala:166)
	at org.apache.spark.mllib.clustering.EMLDAOptimizer.initialize(LDAOptimizer.scala:80)
	at org.apache.spark.mllib.clustering.LDA.run(LDA.scala:329)
	at org.apache.spark.mllib.api.python.PythonMLLibAPI.trainLDAModel(PythonMLLibAPI.scala:552)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
Caused by: net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for pyspark.sql.types._create_row)
	at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
	at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:707)
	at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:175)
	at net.razorvine.pickle.Unpickler.load(Unpickler.java:99)
	at net.razorvine.pickle.Unpickler.loads(Unpickler.java:112)
	at org.apache.spark.mllib.api.python.SerDeBase$$anonfun$pythonToJava$1$$anonfun$apply$2.apply(PythonMLLibAPI.scala:1349)
	at org.apache.spark.mllib.api.python.SerDeBase$$anonfun$pythonToJava$1$$anonfun$apply$2.apply(PythonMLLibAPI.scala:1348)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:389)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at scala.collection.AbstractIterator.to(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
	at org.apache.spark.rdd.RDD$$anonfun$take$1$$anonfun$29.apply(RDD.scala:1353)
	at org.apache.spark.rdd.RDD$$anonfun$take$1$$anonfun$29.apply(RDD.scala:1353)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [114]:
cv_df.vocabulary()

TypeError: 'list' object is not callable