In [109]:
import pandas as pd
import numpy as np
import json

This mini-projects is using Hacker News content. Unlike the previous ones, I use the json format here.

The columns are:
- author: The username of the person who submitted the story.
- createdAt: The date and time at which the story was created.
- createdAtI: An integer value representing the date and time at which the story was created.
- numComments: The number of comments that were made on the story.
- objectId: The unique identifier from Hacker News for the story.
- points: The number of points the story acquired, calculated as the total number of upvotes minus the total number of downvotes.
- storyText: The text of the story (if the story contains text).
- tags: A list of tags associated with the story.
- title: The title of the story.
- url: The URL that the story links to (if the story links to a URL).

In [110]:
file = open('hn_2014.json')
hn = json.load(file)

In [111]:
hn[0:2]

[{'author': 'dragongraphics',
  'numComments': 0,
  'points': 2,
  'url': 'http://ashleynolan.co.uk/blog/are-we-getting-too-sassy',
  'storyText': '',
  'createdAt': '2014-05-29T08:07:50Z',
  'tags': ['story', 'author_dragongraphics', 'story_7815238'],
  'createdAtI': 1401350870,
  'title': 'Are we getting too Sassy? Weighing up micro-optimisation vs. maintainability',
  'objectId': '7815238'},
 {'author': 'jcr',
  'numComments': 0,
  'points': 1,
  'url': 'http://spectrum.ieee.org/automaton/robotics/home-robots/telemba-telepresence-robot',
  'storyText': '',
  'createdAt': '2014-05-29T08:05:58Z',
  'tags': ['story', 'author_jcr', 'story_7815234'],
  'createdAtI': 1401350758,
  'title': 'Telemba Turns Your Old Roomba and Tablet Into a Telepresence Robot',
  'objectId': '7815234'}]

In [112]:
type(hn)

list

In [113]:
len(hn)

35806

In [114]:
type(hn[0])

dict

In [115]:
hn[0].keys()

dict_keys(['author', 'numComments', 'points', 'url', 'storyText', 'createdAt', 'tags', 'createdAtI', 'title', 'objectId'])

#### json.dumps()

formatting to make it easier to read

In [116]:
def jprint(obj):
    text = json.dumps(obj, sort_keys=True, indent=4)
    print(text)

In [117]:
jprint(hn[0])

{
    "author": "dragongraphics",
    "createdAt": "2014-05-29T08:07:50Z",
    "createdAtI": 1401350870,
    "numComments": 0,
    "objectId": "7815238",
    "points": 2,
    "storyText": "",
    "tags": [
        "story",
        "author_dragongraphics",
        "story_7815238"
    ],
    "title": "Are we getting too Sassy? Weighing up micro-optimisation vs. maintainability",
    "url": "http://ashleynolan.co.uk/blog/are-we-getting-too-sassy"
}


Two different formats of data:
- "createdAt": "2014-05-29T08:07:50Z",
- "createdAtI": 1401350870,
<br>for the purpose of this excercise I'm rmoving this one

In [118]:
def del_key(dict_, key):
    modify_dict = dict_.copy()
    del modify_dict[key]
    return modify_dict

In [119]:
first_story = hn[0]
first_story = del_key(first_story, 'createdAtI')
first_story

{'author': 'dragongraphics',
 'numComments': 0,
 'points': 2,
 'url': 'http://ashleynolan.co.uk/blog/are-we-getting-too-sassy',
 'storyText': '',
 'createdAt': '2014-05-29T08:07:50Z',
 'tags': ['story', 'author_dragongraphics', 'story_7815238'],
 'title': 'Are we getting too Sassy? Weighing up micro-optimisation vs. maintainability',
 'objectId': '7815238'}

In [120]:
jprint(first_story)

{
    "author": "dragongraphics",
    "createdAt": "2014-05-29T08:07:50Z",
    "numComments": 0,
    "objectId": "7815238",
    "points": 2,
    "storyText": "",
    "tags": [
        "story",
        "author_dragongraphics",
        "story_7815238"
    ],
    "title": "Are we getting too Sassy? Weighing up micro-optimisation vs. maintainability",
    "url": "http://ashleynolan.co.uk/blog/are-we-getting-too-sassy"
}


##### Append the cleaned dictionary

In [121]:
hn_clean = []

for hn_dict in hn:
    hn_dict = del_key(hn_dict, 'createdAtI')
    hn_clean.append(hn_dict)

In [122]:
hn_clean[0:2]

[{'author': 'dragongraphics',
  'numComments': 0,
  'points': 2,
  'url': 'http://ashleynolan.co.uk/blog/are-we-getting-too-sassy',
  'storyText': '',
  'createdAt': '2014-05-29T08:07:50Z',
  'tags': ['story', 'author_dragongraphics', 'story_7815238'],
  'title': 'Are we getting too Sassy? Weighing up micro-optimisation vs. maintainability',
  'objectId': '7815238'},
 {'author': 'jcr',
  'numComments': 0,
  'points': 1,
  'url': 'http://spectrum.ieee.org/automaton/robotics/home-robots/telemba-telepresence-robot',
  'storyText': '',
  'createdAt': '2014-05-29T08:05:58Z',
  'tags': ['story', 'author_jcr', 'story_7815234'],
  'title': 'Telemba Turns Your Old Roomba and Tablet Into a Telepresence Robot',
  'objectId': '7815234'}]

#### List Comprehensions

Practising transformation from loops to comprehension lists

In [123]:
ints = [1,2,3,4]

In [124]:
plus_one = []

for i in ints:
    plus_one.append(i + 1)

plus_one

[2, 3, 4, 5]

In [125]:
times_ten = []

for i in ints:
    times_ten.append(i * 10)

times_ten

[10, 20, 30, 40]

In [126]:
times_ten_cl = [i * 10 for i in ints]

times_ten_cl

[10, 20, 30, 40]

In [127]:
floats = [2.1, 8.7, 4.2, 8.9]

In [128]:
rounded = []

for f in floats:
    rounded.append(round(f))

rounded

[2, 9, 4, 9]

In [129]:
rounded_cl = [round(f) for f in floats]
rounded_cl

[2, 9, 4, 9]

In [130]:
letters = ['a', 'b', 'c', 'd']

In [131]:
caps = []

for l in letters:
    caps.append(l.upper())
    
caps    

['A', 'B', 'C', 'D']

In [132]:
caps_cl = [l.upper() for l in letters]
caps_cl

['A', 'B', 'C', 'D']

##### Comprehension representation of the del loop

In [133]:
hn_clean = [del_key(h, 'createdAtI') for h in hn]
hn_clean[0:2]

[{'author': 'dragongraphics',
  'numComments': 0,
  'points': 2,
  'url': 'http://ashleynolan.co.uk/blog/are-we-getting-too-sassy',
  'storyText': '',
  'createdAt': '2014-05-29T08:07:50Z',
  'tags': ['story', 'author_dragongraphics', 'story_7815238'],
  'title': 'Are we getting too Sassy? Weighing up micro-optimisation vs. maintainability',
  'objectId': '7815238'},
 {'author': 'jcr',
  'numComments': 0,
  'points': 1,
  'url': 'http://spectrum.ieee.org/automaton/robotics/home-robots/telemba-telepresence-robot',
  'storyText': '',
  'createdAt': '2014-05-29T08:05:58Z',
  'tags': ['story', 'author_jcr', 'story_7815234'],
  'title': 'Telemba Turns Your Old Roomba and Tablet Into a Telepresence Robot',
  'objectId': '7815234'}]

#### Transforming a list with list comprehensions

In [134]:
squares = [1,4,9,16,25,36,49]

In [135]:
sqroots = []

for sq in squares:
    sqroots.append(sq ** (1/2))
    
sqroots    

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]

In [136]:
sqroots_cl = [sq ** (1/2) for sq in squares]
sqroots_cl

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]

#### Creating a new list with list comprehensions

In [137]:
cols = []
for i in range(1,5):
    cols.append('col_{}'.format(i))

cols    

['col_1', 'col_2', 'col_3', 'col_4']

In [138]:
cols_cl = ['col_{}'.format(i) for i in range(1,5)]
cols_cl

['col_1', 'col_2', 'col_3', 'col_4']

create an empty dataframe with labels

In [139]:
cols_cl = ['col_{}'.format(i) for i in range(1,5)]
data = np.zeros((4,4))

df = pd.DataFrame(data, cols_cl)
df

Unnamed: 0,0,1,2,3
col_1,0.0,0.0,0.0,0.0
col_2,0.0,0.0,0.0,0.0
col_3,0.0,0.0,0.0,0.0
col_4,0.0,0.0,0.0,0.0


#### Extract the url value from each dictionary

In [140]:
urls_cl = [h['url'] for h in hn_clean]
urls_cl[0:2]

['http://ashleynolan.co.uk/blog/are-we-getting-too-sassy',
 'http://spectrum.ieee.org/automaton/robotics/home-robots/telemba-telepresence-robot']

#### Reducing a list with comprehension list

##### Remove any integers that were smaller than 50

In [141]:
ints = [25, 34, 28, 98, 826, 21, 2]

In [142]:
big_ints = []

for i in ints:
    if i > 50:
        big_ints.append(i)
    
big_ints    

[98, 826]

In [143]:
big_ints_cl = [i for i in ints if i > 50]
big_ints_cl

[98, 826]

##### Count the number of stories that have comments

In [144]:
hn_clean[0]

{'author': 'dragongraphics',
 'numComments': 0,
 'points': 2,
 'url': 'http://ashleynolan.co.uk/blog/are-we-getting-too-sassy',
 'storyText': '',
 'createdAt': '2014-05-29T08:07:50Z',
 'tags': ['story', 'author_dragongraphics', 'story_7815238'],
 'title': 'Are we getting too Sassy? Weighing up micro-optimisation vs. maintainability',
 'objectId': '7815238'}

In [145]:
have_comments = []

for h in hn_clean:
    if h['numComments'] != 0:
        have_comments.append(h)
        
have_comments[2]

{'author': 'mr_tyzic',
 'numComments': 27,
 'points': 161,
 'url': 'http://projects.aljazeera.com/2014/portrait-of-down-syndrome/index.html',
 'storyText': '',
 'createdAt': '2014-05-29T03:51:01Z',
 'tags': ['story', 'author_mr_tyzic', 'story_7814608'],
 'title': 'For Hire: Dedicated Young Man With Down Syndrome',
 'objectId': '7814608'}

In [146]:
len(have_comments)

9279

In [147]:
have_comments_cl = [h for h in hn_clean if h['numComments'] != 0]
have_comments_cl[2]

{'author': 'mr_tyzic',
 'numComments': 27,
 'points': 161,
 'url': 'http://projects.aljazeera.com/2014/portrait-of-down-syndrome/index.html',
 'storyText': '',
 'createdAt': '2014-05-29T03:51:01Z',
 'tags': ['story', 'author_mr_tyzic', 'story_7814608'],
 'title': 'For Hire: Dedicated Young Man With Down Syndrome',
 'objectId': '7814608'}

In [148]:
len(have_comments_cl)

9279

##### List with values from hn_clean where the points key has a value greater than 1000

In [149]:
th_points = [h for h in hn_clean if h['points'] > 1000]
th_points[2]

{'author': 'frederfred',
 'numComments': 398,
 'points': 2732,
 'url': 'http://gabrielecirulli.github.io/2048/',
 'storyText': '',
 'createdAt': '2014-03-10T15:44:42Z',
 'tags': ['story', 'author_frederfred', 'story_7373566'],
 'title': '2048',
 'objectId': '7373566'}

In [150]:
len(th_points)

8

#### Using min(), max(), sorted() with json

In [151]:
jprint(hn[2])

{
    "author": "callum85",
    "createdAt": "2014-05-29T08:05:06Z",
    "createdAtI": 1401350706,
    "numComments": 0,
    "objectId": "7815230",
    "points": 1,
    "storyText": "",
    "tags": [
        "story",
        "author_callum85",
        "story_7815230"
    ],
    "title": "Apple Agrees to Buy Beats for $3 Billion",
    "url": "http://online.wsj.com/articles/apple-to-buy-beats-1401308971"
}


#### Treat a function as variable

In [152]:
def greet():
    return 'hello'
    
greet()    

'hello'

In [153]:
t = type(greet())
print(t)

<class 'str'>


In [154]:
t = type(greet)
print(t)

<class 'function'>


In [155]:
greet2 = greet
greet2

<function __main__.greet()>

In [156]:
greet2()

'hello'

#### Run a function inside another function by passing it as an argument

In [157]:
def run_func(func):
    print('running function: {}'.format(func))
    return func()

In [158]:
run_func(greet)

running function: <function greet at 0x000000000962D730>


'hello'

#### Find the story that has the greatest number of comments

In [159]:
def num_of_comments(sd):
    return sd['numComments']

In [160]:
num_of_comments(hn_clean[11])

1

In [161]:
max_comments = max(hn_clean, key=num_of_comments)
max_comments

{'author': 'platz',
 'numComments': 1208,
 'points': 889,
 'url': 'https://blog.mozilla.org/blog/2014/04/03/brendan-eich-steps-down-as-mozilla-ceo/',
 'storyText': None,
 'createdAt': '2014-04-03T19:02:53Z',
 'tags': ['story', 'author_platz', 'story_7525198'],
 'title': 'Brendan Eich Steps Down as Mozilla CEO',
 'objectId': '7525198'}

#### Lambda Functions

In [162]:
def unchanged(x):
    return x

In [163]:
unchanged_l = lambda x: x

In [164]:
def plus_one(x):
    return x + 1

In [165]:
plus_one_l = lambda x: x + 1

In [166]:
print(plus_one(3))
print(plus_one_l(3))

4
4


In [167]:
def add(x,y):
    return x + y

In [168]:
add_l = lambda x,y: x + y

In [169]:
print(add(3,4))
print(add_l(3,4))

7
7


#### Using a lambda function with sorted() to sort the items in our JSON list alphabetically by name

In [170]:
json_obj = [
    {
        "age": 36,
        "favorite_foods": ["Pumpkin", "Oatmeal"],
        "name": "Sabine"
    },
    {
        "age": 40,
        "favorite_foods": ["Chicken", "Pizza", "Chocolate"],
        "name": "Zoe"
    },
    {
        "age": 40,
        "favorite_foods": ["Caesar Salad"],
        "name": "Heidi"
    }
]

In [171]:
sorted(json_obj, key = lambda obj: obj['name'])

[{'age': 40, 'favorite_foods': ['Caesar Salad'], 'name': 'Heidi'},
 {'age': 36, 'favorite_foods': ['Pumpkin', 'Oatmeal'], 'name': 'Sabine'},
 {'age': 40,
  'favorite_foods': ['Chicken', 'Pizza', 'Chocolate'],
  'name': 'Zoe'}]

In [172]:
min(json_obj, key = lambda obj: obj['age'])

{'age': 36, 'favorite_foods': ['Pumpkin', 'Oatmeal'], 'name': 'Sabine'}

In [173]:
max(json_obj, key = lambda obj: obj['age'])

{'age': 40, 'favorite_foods': ['Chicken', 'Pizza', 'Chocolate'], 'name': 'Zoe'}

In [174]:
hn_sorted_points = sorted(hn_clean, key = lambda h: h['points'], reverse=True)
hn_sorted_points[0:3]

[{'author': 'frederfred',
  'numComments': 398,
  'points': 2732,
  'url': 'http://gabrielecirulli.github.io/2048/',
  'storyText': '',
  'createdAt': '2014-03-10T15:44:42Z',
  'tags': ['story', 'author_frederfred', 'story_7373566'],
  'title': '2048',
  'objectId': '7373566'},
 {'author': 'brokenparser',
  'numComments': 260,
  'points': 1958,
  'url': 'https://thedaywefightback.org/',
  'storyText': '',
  'createdAt': '2014-02-11T08:12:28Z',
  'tags': ['story', 'author_brokenparser', 'story_7216471'],
  'title': 'Today is The Day We Fight Back',
  'objectId': '7216471'},
 {'author': 'jamesbritt',
  'numComments': 308,
  'points': 1522,
  'url': 'https://plus.google.com/+CarmsPerez/posts/GnVTvQNgvpf',
  'storyText': None,
  'createdAt': '2014-01-19T22:46:05Z',
  'tags': ['story', 'author_jamesbritt', 'story_7086411'],
  'title': 'Wozniak: “Actually, the movie was largely a lie about me”',
  'objectId': '7086411'}]

In [175]:
[h['title'] for h in hn_sorted_points][0:5]

['2048',
 'Today is The Day We Fight Back',
 'Wozniak: “Actually, the movie was largely a lie about me”',
 'Microsoft Open Sources C# Compiler',
 'Elon Musk: To the People of New Jersey']

#### Json with pandas

In [176]:
json_df = pd.DataFrame(json_obj)
json_df

Unnamed: 0,age,favorite_foods,name
0,36,"[Pumpkin, Oatmeal]",Sabine
1,40,"[Chicken, Pizza, Chocolate]",Zoe
2,40,[Caesar Salad],Heidi


In [177]:
hn_df = pd.DataFrame(hn_clean)
hn_df.head(5)

Unnamed: 0,author,numComments,points,url,storyText,createdAt,tags,title,objectId
0,dragongraphics,0,2,http://ashleynolan.co.uk/blog/are-we-getting-t...,,2014-05-29T08:07:50Z,"[story, author_dragongraphics, story_7815238]",Are we getting too Sassy? Weighing up micro-op...,7815238
1,jcr,0,1,http://spectrum.ieee.org/automaton/robotics/ho...,,2014-05-29T08:05:58Z,"[story, author_jcr, story_7815234]",Telemba Turns Your Old Roomba and Tablet Into ...,7815234
2,callum85,0,1,http://online.wsj.com/articles/apple-to-buy-be...,,2014-05-29T08:05:06Z,"[story, author_callum85, story_7815230]",Apple Agrees to Buy Beats for $3 Billion,7815230
3,d3v3r0,0,1,http://alexsblog.org/2014/05/29/dont-wait-for-...,,2014-05-29T08:00:08Z,"[story, author_d3v3r0, story_7815222]",Don’t wait for inspiration,7815222
4,timmipetit,0,1,http://techcrunch.com/2014/05/28/hackerone-get...,,2014-05-29T07:46:19Z,"[story, author_timmipetit, story_7815191]",HackerOne Get $9M In Series A Funding To Build...,7815191


In [178]:
tags = hn_df.tags
print(tags.dtype)

object


In [179]:
tags_types = tags.apply(type)
type_counts = tags_types.value_counts(dropna=False)
print(type_counts)

<class 'list'>    35806
Name: tags, dtype: int64


In [180]:
tags_types = tags.apply(len)
type_lengths = tags_types.value_counts(dropna=False)
print(type_lengths)

3    33459
4     2347
Name: tags, dtype: int64


##### Boolean mask to filter rows with 4 tags 

In [181]:
bm = tags.apply(len) == 4

In [182]:
tags[bm]

43       [story, author_alamgir_mand, story_7813869, sh...
86         [story, author_cweagans, story_7812404, ask_hn]
104      [story, author_nightstrike789, story_7812099, ...
107      [story, author_ISeemToBeAVerb, story_7812048, ...
109         [story, author_Swizec, story_7812018, show_hn]
                               ...                        
35747      [story, author_rpm4321, story_6994970, show_hn]
35759            [story, author_ct, story_6994828, ask_hn]
35778    [story, author_ChrisNorstrom, story_6994370, a...
35787    [story, author_benjamincburns, story_6994163, ...
35792      [story, author_randall, story_6993981, show_hn]
Name: tags, Length: 2347, dtype: object

##### Lambda function to extract the fourth value in cases where there is one

In [183]:
def extract_last_tag(l):
    if len(l) == 4:
        return l[-1]
    else:
        return None

In [184]:
extract_last_tag = lambda l: l[-1] if len(l) == 4 else None 

In [185]:
last_tag = tags.apply(extract_last_tag)

In [186]:
last_tag.value_counts()

ask_hn     1348
show_hn     999
Name: tags, dtype: int64

In [187]:
hn_df.tags = last_tag

In [188]:
hn_df.head()

Unnamed: 0,author,numComments,points,url,storyText,createdAt,tags,title,objectId
0,dragongraphics,0,2,http://ashleynolan.co.uk/blog/are-we-getting-t...,,2014-05-29T08:07:50Z,,Are we getting too Sassy? Weighing up micro-op...,7815238
1,jcr,0,1,http://spectrum.ieee.org/automaton/robotics/ho...,,2014-05-29T08:05:58Z,,Telemba Turns Your Old Roomba and Tablet Into ...,7815234
2,callum85,0,1,http://online.wsj.com/articles/apple-to-buy-be...,,2014-05-29T08:05:06Z,,Apple Agrees to Buy Beats for $3 Billion,7815230
3,d3v3r0,0,1,http://alexsblog.org/2014/05/29/dont-wait-for-...,,2014-05-29T08:00:08Z,,Don’t wait for inspiration,7815222
4,timmipetit,0,1,http://techcrunch.com/2014/05/28/hackerone-get...,,2014-05-29T07:46:19Z,,HackerOne Get $9M In Series A Funding To Build...,7815191


In [189]:
hn_df.tags.isnull()

0        True
1        True
2        True
3        True
4        True
         ... 
35801    True
35802    True
35803    True
35804    True
35805    True
Name: tags, Length: 35806, dtype: bool

In [190]:
hn_df[hn_df.tags.isnull()].head(3)

Unnamed: 0,author,numComments,points,url,storyText,createdAt,tags,title,objectId
0,dragongraphics,0,2,http://ashleynolan.co.uk/blog/are-we-getting-t...,,2014-05-29T08:07:50Z,,Are we getting too Sassy? Weighing up micro-op...,7815238
1,jcr,0,1,http://spectrum.ieee.org/automaton/robotics/ho...,,2014-05-29T08:05:58Z,,Telemba Turns Your Old Roomba and Tablet Into ...,7815234
2,callum85,0,1,http://online.wsj.com/articles/apple-to-buy-be...,,2014-05-29T08:05:06Z,,Apple Agrees to Buy Beats for $3 Billion,7815230


In [191]:
hn_df[hn_df.tags.notnull()].head(3)

Unnamed: 0,author,numComments,points,url,storyText,createdAt,tags,title,objectId
43,alamgir_mand,0,3,https://www.tapresearch.com/app-development-re...,,2014-05-29T00:19:19Z,show_hn,Show HN: Test your App Store Assets Before You...,7813869
86,cweagans,4,1,,As a followup to my question from a few days a...,2014-05-28T19:51:02Z,ask_hn,Ask HN: New technical cofounder. How should ow...,7812404
104,nightstrike789,0,1,,I am working on a personal project to help peo...,2014-05-28T18:57:32Z,ask_hn,Ask HN: Categorizing company cultures,7812099
