In [1]:
from pipeline import Pipeline
import csv
from pipeline import build_csv
import json
import io

import string
import re

from functools import partial

import datetime

from stop_words import stop_words

import warnings

warnings.filterwarnings('ignore')

In [2]:
pipeline=Pipeline()

## Goal is to filter out stories that are most Popular and do a word count of the most popular words used:
* Criteria: 50 points, more than 1 comment, and titles that do not begin with Ask HN

In [3]:
@pipeline.task()
def file_to_json():
    
    json_file=open('hn_stories_2014.json')
    
    output=json.load(json_file)
    
    return output['stories']

@pipeline.task(depends_on=file_to_json)
def filter_stories(li):
    
    def inner(date):
        return datetime.datetime.strptime(date,'%Y-%m-%dT%H:%M:%SZ')
    
    generator=((a['title'],a['objectID'],inner(a['created_at']),a['url'],a['points']) for a in li if ((a['title']!='Ask HN')&(a['points']>=50)&(a['num_comments']>1)))
    
    
    return generator

@pipeline.task(depends_on=filter_stories)
def json_to_csv(filtered_dat):
    output=build_csv(filtered_dat,header=['title','objectID', 'created_at', 'url', 'points'],file=io.StringIO())
    return output


@pipeline.task(depends_on=json_to_csv)
def extract_titles(output):
    
    titles_gen=(a.split(',')[0] for a in output)
    
    return titles_gen

@pipeline.task(depends_on=extract_titles)
def clean_titles(input_titles):
    
    punct=string.punctuation
    punct=punct+'‘'+'’'
    re_exp='['+punct+']'
    
    clean_gen=(re.sub(re_exp,' ',a.lower()) for a in input_titles)

    return clean_gen

@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(cleaned_titles_gen):
    
    partial_func=partial(lambda x: re.split(' *',x.strip()))
    
    
    keyword_dic={}
    
    def inner(x):
        if x!='title':
            li=partial_func(x)

            for x in li:
                if x not in stop_words:
                    if x not in keyword_dic.keys():
                        keyword_dic[x]=1
                    else:
                        keyword_dic[x]+=1

        return keyword_dic
    
    gen=(inner(title) for title in cleaned_titles_gen)
    
    for dic in gen:
        
        try:
            final_dic=dic
            
        except Exception as e:
            pass
        
    return  final_dic

@pipeline.task(depends_on=build_keyword_dictionary)
def sorting_dic(final_dic):
   
    final_dic1=sorted(final_dic.items(), key=lambda kv:kv[1],reverse=True)  
    
    return final_dic1


In [4]:
pipeline_output=pipeline.run()

In [5]:
print(pipeline_output.keys())
function_li=list(pipeline_output.keys())

dict_keys([<function file_to_json at 0x000001ACC7581F28>, <function filter_stories at 0x000001ACC7581EA0>, <function json_to_csv at 0x000001ACC7596048>, <function extract_titles at 0x000001ACC75960D0>, <function clean_titles at 0x000001ACC7596158>, <function build_keyword_dictionary at 0x000001ACC75961E0>, <function sorting_dic at 0x000001ACC7596268>])


### Prints out the sorted word count dictionary from the most used words to the least used

In [6]:
pipeline_output[function_li[6]]

[('s', 561),
 ('google', 191),
 ('new', 168),
 ('1', 143),
 ('t', 136),
 ('ask', 129),
 ('open', 116),
 ('bitcoin', 101),
 ('0', 99),
 ('web', 97),
 ('2', 92),
 ('programming', 91),
 ('3', 82),
 ('source', 82),
 ('data', 78),
 ('facebook', 77),
 ('video', 76),
 ('free', 75),
 ('c', 75),
 ('python', 72),
 ('using', 71),
 ('released', 71),
 ('2014', 69),
 ('js', 69),
 ('app', 68),
 ('code', 68),
 ('time', 67),
 ('internet', 64),
 ('world', 63),
 ('2013', 62),
 ('apple', 58),
 ('game', 58),
 ('javascript', 57),
 ('work', 56),
 ('linux', 56),
 ('microsoft', 55),
 ('pdf', 53),
 ('don', 53),
 ('use', 53),
 ('startup', 53),
 ('software', 52),
 ('language', 50),
 ('yc', 49),
 ('make', 48),
 ('like', 47),
 ('security', 47),
 ('4', 45),
 ('github', 45),
 ('year', 44),
 ('way', 42),
 ('nsa', 42),
 ('heartbleed', 41),
 ('project', 40),
 ('windows', 40),
 ('git', 39),
 ('developer', 38),
 ('gox', 38),
 ('ios', 38),
 ('man', 37),
 ('u', 37),
 ('amazon', 37),
 ('design', 37),
 ('mt', 37),
 ('computer