In [1]:
import os
import pandas as pd
import numpy as np
import warc
import gzip
import boto
from boto.s3.key import Key
from gzipstream import GzipStreamFile
from mrjob.job import MRJob
import pyspark as ps
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from __future__ import print_function
from pyspark.ml import Pipeline
from pyspark.mllib.clustering import LDAModel#, LDA
from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector
from pyspark.ml.clustering import LDA, DistributedLDAModel, LocalLDAModel
from pyspark.ml.feature import CountVectorizer, RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import lit

In [2]:
sc = ps.SparkContext('local[4]')
sqlContext = ps.SQLContext(sc)

In [3]:
ACCESS_KEY = os.environ['AWS_ACCESS_KEY']
SECRET_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
DSI_CSTON_BUCKET = "galvanize.dsi.capstone.alex"
OUT_BUCKET = "s3a://{}:{}@{}".format(ACCESS_KEY, SECRET_KEY, DSI_CSTON_BUCKET)

In [4]:
wet_path_file_name = 'wet_2016_list/2016-Dec-wet.path'
no_of_doc_files = 1
dec_2016_wet_list = sc.textFile("s3a://%s/%s" % (DSI_CSTON_BUCKET, wet_path_file_name))
doc_files = dec_2016_wet_list.take(no_of_doc_files)

In [5]:
conn = boto.connect_s3(ACCESS_KEY, SECRET_KEY, host='s3.amazonaws.com')
pds = conn.get_bucket('commoncrawl')

In [6]:
data = []
url = []

for file in doc_files:
    k = Key(pds, file)
    f = warc.WARCFile(fileobj=GzipStreamFile(k))

    for i, document in enumerate(f):
        if document['Content-Type'] != 'text/plain':
            continue
        
        dat = document.payload.read()
        data.append(str(dat))
        url.append(str(document.url))

In [7]:
rec_df = pd.DataFrame(
    {'contents': data,
     'url': url
    })
rec_df = sqlContext.createDataFrame(rec_df)

In [8]:
tokenizer = RegexTokenizer(inputCol="contents", outputCol="words")
rec_df = tokenizer.transform(rec_df)

In [9]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
rec_df = remover.transform(rec_df)

In [10]:
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures")

In [11]:
rec_df = hashingTF.transform(rec_df)

In [12]:
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [13]:
tfidfModel = idf.fit(rec_df)

In [14]:
rec_df = tfidfModel.transform(rec_df)

In [15]:
rec_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|            contents|                 url|               words|            filtered|         rawFeatures|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|çç
çç	ç...|http://007zhenren...|[çç, çç, ...|[çç, çç, ...|(262144,[1837,770...|(262144,[1837,770...|
|æ¿æ©ç´ é£â§ç¶...|http://0289646723...|[æ¿æ©ç´ é£â§ç...|[æ¿æ©ç´ é£â§ç...|(262144,[10267,17...|(262144,[10267,17...|
|ÐÐ¾Ð¿ÑÐ¾ÑÑ Ð¿...|http://03online.c...|[ðð¾ð¿ñð¾ññ, ...|[ðð¾ð¿ñð¾ññ, ...|(262144,[1438,203...|(262144,[1438,203...|
|Ð Ð²Ð¸Ð±ÑÐ°ÑÐ¸...|http://03online.c...|[ð, ð²ð¸ð±ñð°ñ...|[ð, ð²ð¸ð±ñð°ñ...|(262144,[310,1325...|(262144,[310,1325...|
|ÐÐµÑÐµÐ»Ð¾Ð¼ Ð³...|http://03online.c...|[ððµñðµð»ð¾ð¼, ...|[ððµñðµð»ð¾ð¼, ...|(262144,[310,1325..

In [26]:
rec_df.select("features").collect()

[Row(features=SparseVector(262144, {1837: 7.3191, 7706: 7.5354, 10625: 6.8083, 14623: 7.5354, 23560: 5.3812, 33953: 8.0123, 44473: 7.4062, 51598: 6.9708, 52296: 7.0525, 54961: 1.8405, 61285: 6.9906, 65569: 7.2917, 81948: 3.7606, 84509: 7.4369, 89219: 6.5332, 98778: 4.6185, 107290: 7.4062, 113458: 0.6991, 118631: 7.1891, 119044: 7.8581, 119708: 6.5332, 139477: 7.1185, 143550: 7.8581, 153116: 8.0695, 153727: 4.2879, 158186: 6.5719, 158425: 2.0742, 167222: 5.9783, 175912: 5.342, 176235: 38.6231, 179505: 6.9906, 188105: 6.5206, 197123: 7.6838, 197274: 6.546, 199407: 8.1946, 204762: 7.4369, 210566: 5.6662, 214294: 7.6838, 216159: 7.3191, 225541: 7.8116, 227334: 8.0695, 232934: 7.5354, 255885: 3.4863, 259831: 7.2391})),
 Row(features=SparseVector(262144, {10267: 7.5015, 17584: 7.0108, 75196: 7.7246, 82106: 7.074, 89110: 7.6446, 100602: 7.0108, 113458: 0.6991, 122969: 1.3127, 229557: 4.825, 231824: 4.7001, 244713: 5.4137, 248974: 7.6446, 258565: 5.4866})),
 Row(features=SparseVector(262144, {