In [1]:
import os
import re
import string
import pandas as pd
import numpy as np
import warc
import gzip
import boto
from boto.s3.key import Key
from gzipstream import GzipStreamFile
from mrjob.job import MRJob
import pyspark as ps
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from __future__ import print_function
from pyspark.ml import Pipeline
from pyspark.mllib.clustering import LDAModel#, LDA
from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector
from pyspark.ml.clustering import LDA, DistributedLDAModel, LocalLDAModel
from pyspark.ml.feature import CountVectorizer, RegexTokenizer, StopWordsRemover
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import lit
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from pyspark.mllib.clustering import KMeans
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
sc = ps.SparkContext('local[4]')
sqlContext = ps.SQLContext(sc)

In [3]:
ACCESS_KEY = os.environ['AWS_ACCESS_KEY']
SECRET_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
DSI_CSTON_BUCKET = "galvanize.dsi.capstone.alex"
OUT_BUCKET = "s3a://{}:{}@{}".format(ACCESS_KEY, SECRET_KEY, DSI_CSTON_BUCKET)

In [4]:
wet_path_file_name = 'wet_2016_list/2016-Dec-wet.path'
no_of_doc_files = 1
dec_2016_wet_list = sc.textFile("s3a://%s/%s" % (DSI_CSTON_BUCKET, wet_path_file_name))
doc_files = dec_2016_wet_list.take(no_of_doc_files)

In [5]:
conn = boto.connect_s3(ACCESS_KEY, SECRET_KEY, host='s3.amazonaws.com')
pds = conn.get_bucket('commoncrawl')

In [6]:
data = []
url = []

for file in doc_files:
    k = Key(pds, file)
    f = warc.WARCFile(fileobj=GzipStreamFile(k))

    for i, document in enumerate(f):
        if document['Content-Type'] != 'text/plain':
            continue
        
        dat = document.payload.read()
        data.append(str(dat))
        url.append(str(document.url))

In [7]:
def no_punct(words):
    PUNCTUATION = set(string.punctuation)
    no_punctuation = []
    for word in words:
        punct_removed = ''.join([letter for letter in word if not letter in PUNCTUATION])
        no_punctuation.append(punct_removed)
        
    return no_punctuation

In [8]:
rec_df = pd.DataFrame(
    {'contents': data,
     'url': url
    })

In [9]:
rec_df = sqlContext.createDataFrame(rec_df)
rec_df.printSchema()

root
 |-- contents: string (nullable = true)
 |-- url: string (nullable = true)



In [10]:
rec_df.count()

54320

In [11]:
def get_site(url):
   if len(url) <= 3: return ''
   site = url.split('/')[2]
   return site
udf_url_to_website = udf(get_site, StringType())
rec_df = rec_df.withColumn("website", udf_url_to_website("url"))
rec_df.show()

+--------------------+--------------------+--------------------+
|            contents|                 url|             website|
+--------------------+--------------------+--------------------+
|çç
çç	ç...|http://007zhenren...|007zhenrenyuleche...|
|æ¿æ©ç´ é£â§ç¶...|http://0289646723...|0289646723.tranew...|
|ÐÐ¾Ð¿ÑÐ¾ÑÑ Ð¿...|http://03online.c...|        03online.com|
|Ð Ð²Ð¸Ð±ÑÐ°ÑÐ¸...|http://03online.c...|        03online.com|
|ÐÐµÑÐµÐ»Ð¾Ð¼ Ð³...|http://03online.c...|        03online.com|
|ÐÐ¾ÑÐµÐ¼Ñ Ð½Ðµ...|http://03online.c...|        03online.com|
|ÐÐ¾ÑÐµÐ¼Ñ Ð½Ðµ...|http://03online.c...|        03online.com|
|Ð ÐµÐ±ÐµÐ½Ð¾Ðº Ñ...|http://03online.c...|        03online.com|
|Ð¨ÑÐ¼ Ð² Ð¿ÑÐ°Ð...|http://03online.c...|        03online.com|
|Ð§Ð¸ÑÑÑÐ¹ ÐºÐ»...|http://08.od.ua/b...|            08.od.ua|
|Ð£ÐºÑÐ¾Ð¿ÑÑÐ¾Ñ...|http://08.od.ua/k...|            08.od.ua|
|ÐÐ° Ð¾Ð»ÑÐ³Ð¸Ðµ...|http://08.od.ua/n...|            08.od.ua|
|Orcead, Ð¾Ð¾Ð¾ Ð²...|htt

In [12]:
no_of_website = rec_df.select("website").rdd.flatMap(lambda x: x).map(lambda site: (site,1)).reduceByKey(lambda v1,v2: v1+v2)
no_of_website.count()

28982

In [13]:
#rec_df.show()

## Crawl LDA

In [14]:
num_topics = 10
max_iterations = 10
vocab_size = 1000

In [15]:
tokenizer = RegexTokenizer(inputCol="contents", outputCol="words")
rec_df = tokenizer.transform(rec_df)
rec_df.show()

+--------------------+--------------------+--------------------+--------------------+
|            contents|                 url|             website|               words|
+--------------------+--------------------+--------------------+--------------------+
|çç
çç	ç...|http://007zhenren...|007zhenrenyuleche...|[çç, çç, ...|
|æ¿æ©ç´ é£â§ç¶...|http://0289646723...|0289646723.tranew...|[æ¿æ©ç´ é£â§ç...|
|ÐÐ¾Ð¿ÑÐ¾ÑÑ Ð¿...|http://03online.c...|        03online.com|[ðð¾ð¿ñð¾ññ, ...|
|Ð Ð²Ð¸Ð±ÑÐ°ÑÐ¸...|http://03online.c...|        03online.com|[ð, ð²ð¸ð±ñð°ñ...|
|ÐÐµÑÐµÐ»Ð¾Ð¼ Ð³...|http://03online.c...|        03online.com|[ððµñðµð»ð¾ð¼, ...|
|ÐÐ¾ÑÐµÐ¼Ñ Ð½Ðµ...|http://03online.c...|        03online.com|[ðð¾ñðµð¼ñ, ð½...|
|ÐÐ¾ÑÐµÐ¼Ñ Ð½Ðµ...|http://03online.c...|        03online.com|[ðð¾ñðµð¼ñ, ð½...|
|Ð ÐµÐ±ÐµÐ½Ð¾Ðº Ñ...|http://03online.c...|        03online.com|[ð ðµð±ðµð½ð¾ðº, ...|
|Ð¨ÑÐ¼ Ð² Ð¿ÑÐ°Ð...|http://03online.c...|        03o

In [16]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
rec_df = remover.transform(rec_df)

In [20]:
filtered = rec_df.select("filtered").rdd.map(no_punct).toDF().selectExpr("_")

In [26]:
filtered.printSchema()

root
 |-- filtered: string (nullable = true)



## CountVectorizing

In [23]:
cvectorizer = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=vocab_size)
cv_model = cvectorizer.fit(filtered) ## takes long time
#mbf = 2.0 / max_iterations + 1.0 / tokens_df.count() #MiniBatchFraction
#rec_df = cv_model.transform(rec_df)

IllegalArgumentException: u'requirement failed: Column filtered must be of type equal to one of the following types: [ArrayType(StringType,true), ArrayType(StringType,false)] but was actually of type StringType.'

In [24]:
rec_df.printSchema()
rec_df.show()

root
 |-- contents: string (nullable = true)
 |-- url: string (nullable = true)
 |-- website: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|            contents|                 url|             website|               words|            filtered|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|çç
çç	ç...|http://007zhenren...|007zhenrenyuleche...|[çç, çç, ...|[çç, çç, ...|
|æ¿æ©ç´ é£â§ç¶...|http://0289646723...|0289646723.tranew...|[æ¿æ©ç´ é£â§ç...|[æ¿æ©ç´ é£â§ç...|
|ÐÐ¾Ð¿ÑÐ¾ÑÑ Ð¿...|http://03online.c...|        03online.com|[ðð¾ð¿ñð¾ññ, ...|[ðð¾ð¿ñð¾ññ, ...|
|Ð Ð²Ð¸Ð±ÑÐ°ÑÐ¸...|http://03online.c...|        03online.com|[ð

## For mllib
### pyspark.mllib: pyspark.rdd.RDD
### pyspark.ml: pyspark.sql.DataFrame

In [21]:
from pyspark.sql.types import Row
from pyspark.sql.functions import *

sv_df = rec_df.select("cvectors")
corpus = sv_df.rdd.map(lambda x: Row(cvectors=DenseVector(x[0].toArray())))\
                  .map(lambda x: x[0])\
                  .zipWithIndex()\
                  .map(lambda x: [x[1], x[0]])
#                 .cache()

AnalysisException: u"cannot resolve '`cvectors`' given input columns: [url, website, words, features, contents, filtered];;\n'Project ['cvectors]\n+- Project [contents#0, url#1, website#14, words#28, filtered#41, UDF(filtered#41) AS features#59]\n   +- Project [contents#0, url#1, website#14, words#28, UDF(words#28) AS filtered#41]\n      +- Project [contents#0, url#1, website#14, UDF(contents#0) AS words#28]\n         +- Project [contents#0, url#1, get_site(url#1) AS website#14]\n            +- LogicalRDD [contents#0, url#1]\n"

In [None]:
corpus_df = corpus.toDF()

In [None]:
type(corpus)

In [None]:
## Java Error
crawl_lda_model_10_topic = LDA.train(corpus, k=10, maxIterations=10, optimizer="em")
## EMR Resize

In [19]:
# Save and load model
#crawl_lda_model_10_topic.save(sc, "crawl_lda_model_10")
crawl_lda_model_10_topic = LDAModel.load(sc, "crawl_lda_model_10")
crawl_lda_model_5_topic = LDAModel.load(sc, "crawl_lda_model")

In [23]:
crawl_lda_model_10_topic.topicsMatrix()
# topicIndices.foreach { case (terms, termWeights) =>
#   println("TOPIC:")
#   terms.zip(termWeights).foreach { case (term, weight) =>
#     println(s"${vocabArray(term.toInt)}\t$weight")
#   }
#   println()
# }

array([[  67395.48957972,   85050.97410262,   63557.40660837, ...,
          93898.32436502,  107965.64700228,   49359.36430729],
       [  46397.32082255,   56531.99588281,   40777.25830006, ...,
          58800.33322591,   68970.32486657,   32652.64896381],
       [  55926.39047845,   52929.59742882,   41273.53191615, ...,
          55989.97140447,   65007.11185714,   34367.10368376],
       ..., 
       [    464.53081807,     575.39336184,     395.06668204, ...,
            553.8007751 ,     569.81278027,     358.68164792],
       [    471.075264  ,     579.36657261,     408.82284165, ...,
            507.79039554,     545.47801871,     350.16652695],
       [   1414.3327473 ,    1101.3082602 ,     557.47578519, ...,
            211.35386986,     252.48829074,     147.04003941]])

In [25]:
topic_indices_mllib = crawl_lda_model_10_topic.describeTopics(maxTermsPerTopic=100)

In [24]:
#opic_indices_mllib[0]

## For ml
### pyspark.mllib: pyspark.rdd.RDD
### pyspark.ml: pyspark.sql.DataFrame

In [24]:
#num_topic_list = [5, 10, 20, 40, 60, 80, 100, 120, 140, 160]
lda_5 = LDA(k=5, seed=1)
lda_10 = LDA(k=10, seed=1)
lda_20 = LDA(k=20, seed=1)
lda_40 = LDA(k=40, seed=1)
lda_60 = LDA(k=60, seed=1)
lda_80 = LDA(k=80, seed=1)
lda_100 = LDA(k=100, seed=1)
lda_120 = LDA(k=120, seed=1)
lda_140 = LDA(k=140, seed=1)
lda_160 = LDA(k=160, seed=1)
lda_180 = LDA(k=180, seed=1)
lda_200 = LDA(k=200, seed=1)

In [26]:
crawl_ml_lda_model_10_topics = dlda.fit(rec_df)

In [20]:
crawl_ml_lda_model_10_topics = LocalLDAModel.load(model_path)

In [21]:
model_path = "crawl_ml_lda_model_10_topics_without_stopwords"
#crawl_ml_lda_model_10_topics.save(model_path)
# if crawl_ml_lda_model_10_topics.isDistributed():
#     crawl_ml_lda_model_10_topics = DistributedLDAModel.load(model_path)
# else:
#     crawl_ml_lda_model_10_topics = LocalLDAModel.load(model_path)

In [23]:
#crawl_ml_lda_model_10_topics.save("crawl_ml_lda_model_10_topics")
crawl_ml_lda_model_10_topics = LocalLDAModel.load("crawl_ml_lda_model_10_topics")

In [27]:
avg_log_likelihood = crawl_ml_lda_model_10_topics.logLikelihood(rec_df) / rec_df.count()

In [None]:
LocalLDAModel.

In [28]:
avg_log_likelihood

-1527.5392176344706

In [29]:
crawl_ml_lda_model_10_topics.topicsMatrix()

DenseMatrix(1000, 10, [236606.2612, 110506.9378, 110057.927, 10687.7156, 113694.8787, 92914.5845, 73856.0682, 6687.9971, ..., 0.6679, 91.7154, 98.6573, 514.8735, 711.011, 540.6104, 603.2262, 19.3826], 0)

In [30]:
print("Learned topics (as distributions over vocab of " + str(crawl_ml_lda_model_10_topics.vocabSize())
      + " words):")
topics = crawl_ml_lda_model_10_topics.topicsMatrix()

Learned topics (as distributions over vocab of 1000 words):


In [31]:
# for topic in range(10):
#     print("Topic " + str(topic) + ":")
#     for word in range(0, crawl_ml_lda_model_10_topics.vocabSize()):
#         print ("{}, {}".format(topic, word))
# #        print(" " + str(topics[word][topic]))

In [32]:
topics.toArray().T.shape

(10, 1000)

In [30]:
topic_indices = crawl_ml_lda_model_10_topics.describeTopics(maxTermsPerTopic=100)
term_indices = topic_indices.select("termIndices").rdd.flatMap(lambda x: x).collect()
term_weights = topic_indices.select("termWeights").rdd.flatMap(lambda x: x).collect()
vocab_indices = [[vocab_list[idx] for idx in term_idx] for term_idx in term_indices]

In [41]:
topic_indices.show()

+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[0, 4, 1, 2, 5, 6...|[0.09166286711625...|
|    1|[8, 5, 32, 60, 37...|[0.13337092125148...|
|    2|[3, 21, 27, 79, 1...|[0.18970561146114...|
|    3|[2, 38, 93, 4, 0,...|[0.05933315974317...|
|    4|[8, 3, 48, 91, 85...|[0.07092225907509...|
|    5|[24, 33, 55, 49, ...|[0.11553424413929...|
|    6|[7, 19, 76, 3, 13...|[0.10452884112717...|
|    7|[13, 5, 22, 3, 15...|[0.02502876337396...|
|    8|[11, 0, 2, 6, 1, ...|[0.04967629447772...|
|    9|[1, 0, 2, 4, 7, 9...|[0.03734072257673...|
+-----+--------------------+--------------------+



In [31]:
vocab_indices

[[u'-',
  u'(1)',
  u'&',
  u'de',
  u'1',
  u'\xf0\xb8',
  u'la',
  u'new',
  u'en',
  u'2',
  u'home',
  u'us',
  u'4',
  u'\xe2\x80\x93',
  u'0',
  u':',
  u'one',
  u'2016',
  u'+',
  u'\xe2\xbb',
  u'pm',
  u'\xf1\x81',
  u'e',
  u'top',
  u'search',
  u'accessories',
  u'\xe2\xa9',
  u'et',
  u'...',
  u'3',
  u'(3)',
  u'un',
  u'would',
  u'\xf0\xb4\xf0\xbb\xf1\x8f',
  u'shop',
  u'help',
  u'el',
  u'|',
  u'page',
  u'10',
  u'9',
  u'{',
  u'terms',
  u'2014',
  u'contact',
  u'december',
  u'november',
  u'see',
  u'per',
  u'w',
  u'first',
  u'11',
  u'add',
  u'full',
  u'black',
  u'2010',
  u'years',
  u'/',
  u'\xf0\x92',
  u'care',
  u'il',
  u'part',
  u'le',
  u'\xe3\xa0',
  u'time',
  u'member',
  u'use',
  u'business',
  u'comments',
  u'want',
  u'hotel',
  u'day',
  u'}',
  u'website',
  u'january',
  u'30',
  u'public',
  u'rights',
  u'que',
  u'\xf0\xbf\xf0\xbe',
  u'\xf0\xb2',
  u'two',
  u'sale',
  u'\xf0\xb7\xf0\xb0',
  u'var',
  u'training',
  u'list',
 

In [29]:
# Print topics, showing top-weighted 10 terms for each topic.
vocab_list = cv_model.vocabulary

for topic_no in xrange(10):
    print (topic_no, {vocab_indices[topic_no][i]: term_weights[topic_no][i] for i in range(100)})


NameError: global name 'vocab_indices' is not defined

### Distributed LDA

In [44]:
transformed = crawl_ml_lda_model_10_topics.transform(rec_df)

In [45]:
transformed.printSchema()

root
 |-- contents: string (nullable = true)
 |-- url: string (nullable = true)
 |-- website: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- topicDistribution: vector (nullable = true)



In [64]:
doc_topics = transformed.select("topicDistribution").collect()

In [62]:
doc_topics.count()

TypeError: count() takes exactly one argument (0 given)

In [53]:
doc_topics_arr = np.array(doc_topics)

In [None]:
doc_topics_arr

In [48]:
type(doc_topics)

list

In [51]:
doc_topics_df = transformed.select("topicDistribution")

In [50]:
type(doc_topics)

pyspark.sql.dataframe.DataFrame

In [54]:
doc_topics_df.coalesce(1).write.save("doc_topics_10.csv")

In [55]:
pwd

u'/home/hadoop/DSI_Project/scripts'

In [56]:
dt_topics_10 = sc.textFile("doc_topics_10.csv")

In [59]:
dt_topics_10.count()

25567

In [None]:
distributed_crawl_ml_lda_model_10_topics = DistributedLDAModel