In [1]:
import os
import pandas as pd
import numpy as np
import warc
import gzip
import boto
from boto.s3.key import Key
from gzipstream import GzipStreamFile
from mrjob.job import MRJob
import pyspark as ps
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from __future__ import print_function
from pyspark.ml import Pipeline
from pyspark.mllib.clustering import LDAModel#, LDA
from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector
from pyspark.ml.clustering import LDA, DistributedLDAModel, LocalLDAModel
from pyspark.ml.feature import CountVectorizer, RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import lit
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Start Spark in local mode ('local[4]' = four local worker threads) and wrap the
# context in a SQLContext, which the cells below use to build DataFrames.
sc = ps.SparkContext('local[4]')
sqlContext = ps.SQLContext(sc)

In [3]:
# AWS credentials are read from the environment so they never live in the notebook.
# A missing variable raises KeyError here, which is the desired fail-fast behaviour.
ACCESS_KEY = os.environ['AWS_ACCESS_KEY']
SECRET_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
# A "/" inside the secret would be read as a path separator once the secret is
# embedded in a URL, so it is percent-encoded for that purpose.
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
DSI_CSTON_BUCKET = "galvanize.dsi.capstone.alex"
# Output location with inline credentials. The *encoded* secret must be used here:
# the raw SECRET_KEY yields a malformed URL whenever the secret contains "/".
# NOTE(review): credentials embedded in a URL can leak through logs and error
# messages -- prefer configuring them on the Hadoop/Spark side if possible.
OUT_BUCKET = "s3a://{}:{}@{}".format(ACCESS_KEY, ENCODED_SECRET_KEY, DSI_CSTON_BUCKET)

In [4]:
# Number of entries to take from the path listing (kept tiny for a local run).
no_of_doc_files = 1
# Key, inside the capstone bucket, of the text file that lists the WET paths.
wet_path_file_name = 'wet_2016_list/2016-Dec-wet.path'

# Read the listing through Spark (one RDD element per line) and pull only the
# first `no_of_doc_files` entries back to the driver as a plain Python list.
dec_2016_wet_list = sc.textFile(
    "s3a://%s/%s" % (DSI_CSTON_BUCKET, wet_path_file_name)
)
doc_files = dec_2016_wet_list.take(no_of_doc_files)

In [5]:
# Open an S3 connection with boto using the credentials read above, then get a
# handle on the 'commoncrawl' bucket; `pds` is used below to build S3 keys.
conn = boto.connect_s3(ACCESS_KEY, SECRET_KEY, host='s3.amazonaws.com')
pds = conn.get_bucket('commoncrawl')

In [6]:
# Parallel accumulators: data[n] is the text payload of the n-th kept record and
# url[n] is that same record's URL. Both are consumed when building rec_df.
data = []
url = []

# The loop variable used to be called `file`, which shadowed the Python 2
# builtin of the same name for the rest of the kernel session.
for wet_key_name in doc_files:
    k = Key(pds, wet_key_name)
    # Stream-decompress the gzipped archive straight from S3 and parse it as WARC.
    f = warc.WARCFile(fileobj=GzipStreamFile(k))

    # The record index from enumerate() was never used, so iterate directly.
    for document in f:
        # Keep only plain-text records; everything else is skipped.
        if document['Content-Type'] != 'text/plain':
            continue

        # NOTE(review): the rec_df.show() output further down displays mojibake
        # for non-ASCII pages, which suggests these bytes are never decoded as
        # UTF-8. Confirm before changing, since the saved LDA models may have
        # been built from text produced by this exact code path.
        dat = document.payload.read()
        data.append(str(dat))
        url.append(str(document.url))

In [7]:
# Pair every document body with its URL in a two-column pandas frame and hand it
# directly to Spark, so `rec_df` only ever refers to the Spark DataFrame.
rec_df = sqlContext.createDataFrame(
    pd.DataFrame({'contents': data, 'url': url})
)

In [8]:
# Tokenize the raw text in "contents" into a "words" column. No pattern is
# passed, so RegexTokenizer runs with its default settings.
# NOTE(review): rec_df is rebound to the transformed frame, so this cell is
# presumably not safe to run twice on the same kernel (the "words" column would
# already exist) -- confirm, or re-run from the cell that creates rec_df.
tokenizer = RegexTokenizer(inputCol="contents", outputCol="words")
rec_df = tokenizer.transform(rec_df)

In [9]:
# Remove stop words from "words", writing the result to "filtered". No custom
# stop-word list is supplied, so the transformer's default list applies.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
rec_df = remover.transform(rec_df)

In [10]:
# Term-frequency stage: hashes each token of "filtered" into one of 1000 buckets
# and writes the resulting vectors to "rawFeatures".
# NOTE(review): 1000 presumably has to match the feature size used when the
# saved LDA models were trained -- confirm before changing it.
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000 )

In [11]:
# Apply the hashing stage; rec_df gains the "rawFeatures" column.
rec_df = hashingTF.transform(rec_df)

In [12]:
# Inverse-document-frequency estimator: reads "rawFeatures", writes "features".
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [13]:
# Fit the IDF weights on the documents currently held in rec_df.
tfidfModel = idf.fit(rec_df)

In [14]:
# Rescale the term frequencies with the fitted IDF model; adds "features".
rec_df = tfidfModel.transform(rec_df)

In [15]:
# Preview the frame as a text table: all six columns built so far.
rec_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|            contents|                 url|               words|            filtered|         rawFeatures|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|çç
çç	ç...|http://007zhenren...|[çç, çç, ...|[çç, çç, ...|(1000,[31,57,100,...|(1000,[31,57,100,...|
|æ¿æ©ç´ é£â§ç¶...|http://0289646723...|[æ¿æ©ç´ é£â§ç...|[æ¿æ©ç´ é£â§ç...|(1000,[74,97,252,...|(1000,[74,97,252,...|
|ÐÐ¾Ð¿ÑÐ¾ÑÑ Ð¿...|http://03online.c...|[ðð¾ð¿ñð¾ññ, ...|[ðð¾ð¿ñð¾ññ, ...|(1000,[0,1,3,7,8,...|(1000,[0,1,3,7,8,...|
|Ð Ð²Ð¸Ð±ÑÐ°ÑÐ¸...|http://03online.c...|[ð, ð²ð¸ð±ñð°ñ...|[ð, ð²ð¸ð±ñð°ñ...|(1000,[4,12,13,19...|(1000,[4,12,13,19...|
|ÐÐµÑÐµÐ»Ð¾Ð¼ Ð³...|http://03online.c...|[ððµñðµð»ð¾ð¼, ...|[ððµñðµð»ð¾ð¼, ...|(1000,[0,7,12,13,..

In [16]:
# Pull the "features" column back to the driver as a Python list of Row objects
# (each Row wraps one SparseVector, as the output of the `tfidf_list` cell shows).
tfidf_list = rec_df.select("features").collect()

In [17]:
# Topic counts for which a saved model directory is loaded from the working dir.
LDA_TOPIC_COUNTS = (5, 10, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200)

# Map each topic count to its loaded LocalLDAModel. Every model lives in a
# directory named "crawl_ml_lda_model_<count>_topics"; building the path from the
# count replaces twelve copy-pasted load lines and keeps key and path in sync.
crawl_ml_lda_models = {
    n_topics: LocalLDAModel.load("crawl_ml_lda_model_{}_topics".format(n_topics))
    for n_topics in LDA_TOPIC_COUNTS
}

In [18]:
# Load a previously saved NumPy array. Despite the ".csv" in its name the file
# is read with np.load, i.e. as NumPy's binary .npy format.
# presumably the per-document topic distribution of the 20-topic model -- confirm
doc_topics_20 = np.load("doc_topics_20.csv.npy")

In [19]:
# Check the layout of the loaded array (the recorded output is (54320, 1, 20)).
doc_topics_20.shape

(54320, 1, 20)

In [20]:
# Reverse the axis order so the size-20 axis comes first and the size-54320 axis
# last; .T on a 3-D array turns (54320, 1, 20) into (20, 1, 54320).
topic_per_docs = doc_topics_20.T

In [21]:
# Confirm the transposed layout (the recorded output is (20, 1, 54320)).
topic_per_docs.shape

(20, 1, 54320)

In [22]:
# Sort along the last axis (np.argsort's default): for each row of the first axis
# this yields the last-axis indices ordered from smallest to largest value.
arr = np.argsort(topic_per_docs)

In [23]:
# Inspect the index array produced by argsort.
arr

array([[[42702, 41091,  2271, ..., 52081, 32588, 36433]],

       [[12238,  1235, 40142, ..., 44470, 44471, 44468]],

       [[28447, 12719, 43292, ..., 38138,  5196, 25336]],

       ..., 
       [[ 8559, 10418, 10419, ..., 50497, 50499,  4649]],

       [[27163, 28440,  1845, ..., 10523, 10521, 10522]],

       [[42688, 33404,  1235, ..., 24963, 11938, 44572]]])

In [24]:
# For every index along the size-20 axis, reverse the ascending argsort row so the
# largest values come first, then keep the leading 4999 indices.
# NOTE(review): 4999 is a magic number and may have been meant as 5000 -- confirm.
filtered_docs_per_topics = {
    topic_idx: arr[topic_idx, 0, ::-1][:4999]
    for topic_idx in xrange(doc_topics_20.shape[2])
}

In [None]:
# Display the whole mapping of first-axis index -> retained indices.
filtered_docs_per_topics

{0: array([36433, 32588, 52081, ..., 21810, 14938, 53787]),
 1: array([44468, 44471, 44470, ..., 50631, 51334,   782]),
 2: array([25336,  5196, 38138, ..., 25525, 30862, 22870]),
 3: array([43132, 16957, 41534, ..., 42295,  8117,  5561]),
 4: array([ 9912, 52434, 47177, ...,    68, 51440, 14730]),
 5: array([37241, 28802, 28803, ..., 49473, 13063, 32207]),
 6: array([22423, 38593, 30008, ...,   700,  8329,  1465]),
 7: array([51863, 15374, 32438, ...,  4712, 17241, 34350]),
 8: array([ 3908, 12872, 33063, ...,  9263,  9269,  9266]),
 9: array([ 5021, 45754,  4573, ..., 45598, 20698, 36554]),
 10: array([  389, 24800,  1474, ..., 53631,  9177,  9176]),
 11: array([51662, 28378,  4478, ..., 53212, 13753,  7145]),
 12: array([14321,  1668, 14319, ..., 31926,  4434, 31957]),
 13: array([23588, 47189, 47190, ..., 34873, 17794, 32956]),
 14: array([32578, 29341, 29337, ..., 16105,  4561,  4562]),
 15: array([ 9707,  9741, 47576, ..., 12416,  1053, 48697]),
 16: array([ 3927, 22245, 50469, .

In [114]:
# Display the collected TF-IDF rows.
# NOTE(review): this prints every row; slicing (e.g. the first few) would keep
# the saved notebook output small.
tfidf_list

[Row(features=SparseVector(1000, {31: 1.3101, 57: 1.2363, 100: 1.2983, 102: 1.4073, 224: 1.0785, 263: 1.1017, 273: 0.9448, 317: 1.3672, 324: 1.3468, 330: 1.0557, 389: 1.2819, 399: 1.1157, 444: 1.4642, 445: 1.3044, 450: 1.3465, 470: 1.8618, 487: 1.4681, 496: 1.3367, 502: 1.2286, 513: 1.2021, 517: 1.1823, 525: 0.7862, 562: 0.5035, 569: 1.397, 577: 0.9447, 602: 1.2892, 676: 1.3092, 721: 0.9336, 738: 1.3985, 741: 1.285, 755: 1.0616, 779: 1.0483, 787: 6.0612, 798: 1.1975, 809: 1.176, 854: 1.4848, 864: 1.4719, 866: 1.3298, 870: 2.4982, 882: 0.9272, 913: 1.4791, 919: 1.0991, 959: 0.8842})),
 Row(features=SparseVector(1000, {74: 1.4187, 97: 1.4086, 252: 1.3403, 314: 1.4557, 534: 1.2454, 562: 0.5035, 685: 1.0213, 752: 1.3223, 795: 1.3993, 817: 0.6919, 886: 1.1509, 920: 1.3469, 941: 1.2537})),
 Row(features=SparseVector(1000, {0: 0.8032, 1: 1.2011, 3: 0.7165, 7: 1.2882, 8: 2.4537, 9: 2.8042, 12: 1.204, 13: 1.274, 14: 1.6038, 21: 1.359, 23: 1.12, 29: 2.8081, 30: 1.3657, 31: 2.6201, 32: 2.5234, 34

In [None]:
# Wrap the collected rows in a NumPy array.
# NOTE(review): each element is a Row holding a SparseVector, so the resulting
# array layout/dtype may not be a plain dense float matrix -- verify the shape
# before using it for numeric work.
tfidf_list_arr = np.array(tfidf_list)

In [None]:
# Show the indices retained under key 0 as a plain Python list.
list(filtered_docs_per_topics[0])