In [1]:
import os
import pandas as pd
import numpy as np
import warc
import gzip
import boto
from boto.s3.key import Key
from gzipstream import GzipStreamFile
from mrjob.job import MRJob
import pyspark as ps
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from __future__ import print_function
from pyspark.mllib.clustering import LDAModel#, LDA
from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector
from pyspark.ml.clustering import LDA, DistributedLDAModel, LocalLDAModel
from pyspark.ml.feature import CountVectorizer, RegexTokenizer, StopWordsRemover
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import lit


In [2]:
sc = ps.SparkContext('local[4]')
sqlContext = ps.SQLContext(sc)

In [3]:
ACCESS_KEY = os.environ['AWS_ACCESS_KEY']
SECRET_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
DSI_CSTON_BUCKET = "galvanize.dsi.capstone.alex"
OUT_BUCKET = "s3a://{}:{}@{}".format(ACCESS_KEY, SECRET_KEY, DSI_CSTON_BUCKET)

In [4]:
wet_path_file_name = 'wet_2016_list/2016-Dec-wet.path'
dec_2016_wet_list = sc.textFile("s3a://%s/%s" % (DSI_CSTON_BUCKET, wet_path_file_name))
first_file = dec_2016_wet_list.first()

In [5]:
conn = boto.connect_s3(ACCESS_KEY, SECRET_KEY, host='s3.amazonaws.com')
pds = conn.get_bucket('commoncrawl')
k = Key(pds, first_file)
f = warc.WARCFile(fileobj=GzipStreamFile(k))

In [6]:
recfilepath = "s3a://%s/recfiles/%s.%s" % (DSI_CSTON_BUCKET, first_file[:-3], 'rec')
urlfilepath = "s3a://%s/recfiles/%s.%s" % (DSI_CSTON_BUCKET, first_file[:-3], 'url')

data = []
url = []

for i, document in enumerate(f):
  if document['Content-Type'] != 'text/plain':
    continue
  dat = document.payload.read()
  data.append(str(dat))
  url.append(str(document.url)) 

In [7]:
rec_df = pd.DataFrame(
    {'contents': data,
     'url': url
    })

In [8]:
rec_df = sqlContext.createDataFrame(rec_df)
rec_df.printSchema()

root
 |-- contents: string (nullable = true)
 |-- url: string (nullable = true)



In [9]:
def get_site(url):
   if len(url) <= 3: return ''
   site = url.split('/')[2]
   return site
udf_url_to_website = udf(get_site, StringType())
rec_df = rec_df.withColumn("website", udf_url_to_website("url"))
rec_df.show()

+--------------------+--------------------+--------------------+
|            contents|                 url|             website|
+--------------------+--------------------+--------------------+
|çç
çç	ç...|http://007zhenren...|007zhenrenyuleche...|
|æ¿æ©ç´ é£â§ç¶...|http://0289646723...|0289646723.tranew...|
|ÐÐ¾Ð¿ÑÐ¾ÑÑ Ð¿...|http://03online.c...|        03online.com|
|Ð Ð²Ð¸Ð±ÑÐ°ÑÐ¸...|http://03online.c...|        03online.com|
|ÐÐµÑÐµÐ»Ð¾Ð¼ Ð³...|http://03online.c...|        03online.com|
|ÐÐ¾ÑÐµÐ¼Ñ Ð½Ðµ...|http://03online.c...|        03online.com|
|ÐÐ¾ÑÐµÐ¼Ñ Ð½Ðµ...|http://03online.c...|        03online.com|
|Ð ÐµÐ±ÐµÐ½Ð¾Ðº Ñ...|http://03online.c...|        03online.com|
|Ð¨ÑÐ¼ Ð² Ð¿ÑÐ°Ð...|http://03online.c...|        03online.com|
|Ð§Ð¸ÑÑÑÐ¹ ÐºÐ»...|http://08.od.ua/b...|            08.od.ua|
|Ð£ÐºÑÐ¾Ð¿ÑÑÐ¾Ñ...|http://08.od.ua/k...|            08.od.ua|
|ÐÐ° Ð¾Ð»ÑÐ³Ð¸Ðµ...|http://08.od.ua/n...|            08.od.ua|
|Orcead, Ð¾Ð¾Ð¾ Ð²...|htt

In [14]:
no_of_website = rec_df.select("website").rdd.flatMap(lambda x: x).map(lambda site: (site,1)).reduceByKey(lambda v1,v2: v1+v2)
no_of_website.count()

28982

## Crawl LDA

In [10]:
num_topics = 100
max_iterations = 100
vocab_size = 1000

In [11]:
# rec_rdd_df.select(monotonically_increasing_id().alias("rowId"),"*")
# rec_rdd_df = rec_rdd_df.withColumn("doc_id", lit(monotonically_increasing_id().alias("rowId")))
# rec_rdd_df.show()

In [12]:
tokenizer = RegexTokenizer(inputCol="contents", outputCol="words")
rec_df = tokenizer.transform(rec_df)
rec_df.show()

+--------------------+--------------------+--------------------+--------------------+
|            contents|                 url|             website|               words|
+--------------------+--------------------+--------------------+--------------------+
|çç
çç	ç...|http://007zhenren...|007zhenrenyuleche...|[çç, çç, ...|
|æ¿æ©ç´ é£â§ç¶...|http://0289646723...|0289646723.tranew...|[æ¿æ©ç´ é£â§ç...|
|ÐÐ¾Ð¿ÑÐ¾ÑÑ Ð¿...|http://03online.c...|        03online.com|[ðð¾ð¿ñð¾ññ, ...|
|Ð Ð²Ð¸Ð±ÑÐ°ÑÐ¸...|http://03online.c...|        03online.com|[ð, ð²ð¸ð±ñð°ñ...|
|ÐÐµÑÐµÐ»Ð¾Ð¼ Ð³...|http://03online.c...|        03online.com|[ððµñðµð»ð¾ð¼, ...|
|ÐÐ¾ÑÐµÐ¼Ñ Ð½Ðµ...|http://03online.c...|        03online.com|[ðð¾ñðµð¼ñ, ð½...|
|ÐÐ¾ÑÐµÐ¼Ñ Ð½Ðµ...|http://03online.c...|        03online.com|[ðð¾ñðµð¼ñ, ð½...|
|Ð ÐµÐ±ÐµÐ½Ð¾Ðº Ñ...|http://03online.c...|        03online.com|[ð ðµð±ðµð½ð¾ðº, ...|
|Ð¨ÑÐ¼ Ð² Ð¿ÑÐ°Ð...|http://03online.c...|        03o

## CountVectorizing

In [13]:
cvectorizer = CountVectorizer(inputCol="words", outputCol="features", vocabSize=vocab_size)
cv_model = cvectorizer.fit(rec_df) ## takes long time
#mbf = 2.0 / max_iterations + 1.0 / tokens_df.count() #MiniBatchFraction
rec_df = cv_model.transform(rec_df)

In [14]:
rec_df.printSchema()
rec_df.show()

root
 |-- contents: string (nullable = true)
 |-- url: string (nullable = true)
 |-- website: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|            contents|                 url|             website|               words|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|çç
çç	ç...|http://007zhenren...|007zhenrenyuleche...|[çç, çç, ...|(1000,[6,7,10,15,...|
|æ¿æ©ç´ é£â§ç¶...|http://0289646723...|0289646723.tranew...|[æ¿æ©ç´ é£â§ç...|(1000,[7,206],[1....|
|ÐÐ¾Ð¿ÑÐ¾ÑÑ Ð¿...|http://03online.c...|        03online.com|[ðð¾ð¿ñð¾ññ, ...|(1000,[3,22,24,33...|
|Ð Ð²Ð¸Ð±ÑÐ°ÑÐ¸...|http://03online.c...|        03online.com|[ð, ð²ð¸ð±ñð°ñ...|(1000,[3,22,24,33...|
|ÐÐµÑ

## For mllib
### pyspark.mllib: pyspark.rdd.RDD
### pyspark.ml: pyspark.sql.DataFrame

In [None]:
from pyspark.sql.types import Row
from pyspark.sql.functions import *

sv_df = rec_df.select("cvectors")
corpus = sv_df.rdd.map(lambda x: Row(cvectors=DenseVector(x[0].toArray())))\
                  .map(lambda x: x[0])\
                  .zipWithIndex()\
                  .map(lambda x: [x[1], x[0]])
#                 .cache()

In [40]:
corpus_df = corpus.toDF()

In [20]:
type(corpus)

pyspark.rdd.PipelinedRDD

In [None]:
## Java Error
crawl_lda_model_10_topic = LDA.train(corpus, k=10, maxIterations=10, optimizer="em")
## EMR Resize

In [15]:
# Save and load model
#crawl_lda_model_10_topic.save(sc, "crawl_lda_model_10")
crawl_lda_model_10_topic = LDAModel.load(sc, "crawl_lda_model_10")
crawl_lda_model_5_topic = LDAModel.load(sc, "crawl_lda_model")

In [18]:
print("Learned topics (as distributions over vocab of " + str(crawl_lda_model_10_topic.vocabSize())
      + " words):")
topics = crawl_lda_model_10_topic.topicsMatrix()

Learned topics (as distributions over vocab of 1000 words):


In [17]:
for topic in range(510):
    print("Topic " + str(topic) + ":")
    for word in range(0, crawl_lda_model_10_topic.vocabSize()):
        print(" " + str(topics[word][topic]))

#avg_log_likelihood = crawl_lda_model.ldaModel.logLikelihood / documents.count()


Topic 0:
 67395.4895797
 46397.3208226
 55926.3904784
 130654.655224
 37624.1222236
 38215.886328
 31877.8550866
 23796.8057749
 8343.86809151
 21024.3462638
 17621.2854536
 15460.1399226
 15992.4721622
 14581.2954557
 13114.0891733
 13320.2846905
 11635.9482807
 11963.1601379
 11179.6078395
 10666.7763538
 9208.25588182
 22397.9175293
 10721.8151816
 10301.7420972
 8754.21291237
 7512.56611231
 8133.41033363
 87223.4163423
 8130.31935674
 8128.75883536
 8134.22510281
 7871.04588757
 2472.74218516
 5390.57565143
 7032.28011899
 8004.67472136
 6898.49265706
 1589.97324333
 11980.7213231
 6349.84986925
 6381.26591112
 7193.99235397
 2454.35842893
 5812.674946
 5735.84840329
 5627.74948994
 6741.58205
 5717.80673788
 5095.64556179
 19621.6716343
 4878.75590559
 4436.32073981
 5190.19885395
 4654.00966212
 4732.17164204
 4196.44501308
 4940.63995612
 4173.91469391
 5572.38625797
 3602.42597294
 1592.48056448
 4621.5399127
 4472.50598315
 8436.83971341
 4059.03794767
 3955.661714
 3972.0414

IndexError: index 10 is out of bounds for axis 0 with size 10

In [19]:
# Print topics, showing top-weighted 10 terms for each topic.
vocab_list = cv_model.vocabulary
topic_indices = crawl_lda_model_10_topic.describeTopics(maxTermsPerTopic=100)

for terms_idx, term_weights in topic_indices:
    terms_lst = [vocab_list[idx] for idx in terms_idx]
    print ('TOPIC: {}'.format(zip(terms_lst, term_weights)))
    
# topicIndices.foreach { case (terms, termWeights) =>
#   println("TOPIC:")
#   terms.zip(termWeights).foreach { case (term, weight) =>
#     println(s"${vocabArray(term.toInt)}\t$weight")
#   }
#   println()
# }

TOPIC: [(u'-', 0.0630054187517242), (u'\xe2\x96\xba', 0.04206163080982868), (u'the', 0.03250003634142811), (u'and', 0.026969308099534985), (u'to', 0.02237411764913712), (u'a', 0.01842879549098448), (u'of', 0.01814342988242096), (u'in', 0.015372415205578562), (u'&', 0.011475501659212404), (u'(1)', 0.01080091766102067), (u'for', 0.010138542236128627), (u'\xf0\xb4\xf0\xbb\xf1\x8f', 0.009462132335112481), (u'is', 0.008497488796285162), (u'on', 0.007712028352389875), (u'|', 0.0074553224920506245), (u'page', 0.007405565297236213), (u'(2)', 0.007318199263894411), (u'i', 0.007031518506206225), (u'only', 0.006435917512604095), (u'by', 0.0064234229800774045), (u'you', 0.006323989592997402), (u'(3)', 0.005809541281750981), (u'0', 0.005777447138157023), (u'your', 0.005768978631415141), (u'with', 0.005611187697372693), (u'this', 0.005391127259896055), (u'1', 0.005170366521861598), (u'all', 0.00514382522194762), (u'(4)', 0.005134302881625816), (u'at', 0.0049677952430839025), (u'screen', 0.0045584413

## For ml
### pyspark.mllib: pyspark.rdd.RDD
### pyspark.ml: pyspark.sql.DataFrame

In [44]:
dlda = LDA(k=10, seed=1)

In [45]:
crawl_ml_lda_model_10_topics = dlda.fit(rec_df)

In [25]:
model_path = "crawl_ml_lda_model_10_topics"
#crawl_ml_lda_model_10_topics.save(model_path)
if crawl_ml_lda_model_10_topics.isDistributed():
    crawl_ml_lda_model_10_topics = DistributedLDAModel.load(model_path)
else:
    crawl_ml_lda_model_10_topics = LocalLDAModel.load(model_path)

False

In [22]:
#crawl_ml_lda_model_10_topics.save("crawl_ml_lda_model_10_topics")
crawl_ml_lda_model_10_topics = LocalLDAModel.load("crawl_ml_lda_model_10_topics")

In [23]:
avg_log_likelihood = crawl_ml_lda_model_10_topics.logLikelihood(rec_df) / rec_df.count()

In [24]:
avg_log_likelihood

-2011.9285832343262

In [28]:
crawl_ml_lda_model_10_topics.topicsMatrix()

DenseMatrix(1000, 10, [236606.2612, 110506.9378, 110057.927, 10687.7156, 113694.8787, 92914.5845, 73856.0682, 6687.9971, ..., 0.6679, 91.7154, 98.6573, 514.8735, 711.011, 540.6104, 603.2262, 19.3826], 0)

### Distributed LDA

In [29]:
dlda = LDA(k=10, seed=1).setTopicDistributionCol("topics")

In [30]:
crawl_ml_dlda_model_10_topics = dlda.fit(rec_df)

In [51]:
crawl_ml_dlda_model_10_topics.save("crawl_ml_dlda_model_10_topics")

In [52]:
crawl_ml_dlda_model_10_topics = LocalLDAModel.load("crawl_ml_dlda_model_10_topics")

In [34]:
transformed = crawl_ml_dlda_model_10_topics.transform(rec_df)

In [36]:
transformed.printSchema()

root
 |-- contents: string (nullable = true)
 |-- url: string (nullable = true)
 |-- website: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- topics: vector (nullable = true)



In [49]:
doc_topics = transformed.select("topics").collect()

In [57]:
np.array(doc_topics).T.shape

(10, 1, 54320)