## Pipeline 1: Download

### Load Packages

In [1]:
import os

# Step up one directory so the relative paths used below ('lib', 'data/...')
# resolve. This is not idempotent: re-running the cell climbs one more level
# each time, so run it exactly once per kernel session.
os.chdir(os.pardir)

In [3]:
import pandas as pd
import lib.database_module as db
import lib.encoding_module as enc
import lib.wiki_module as wiki

### Present: Command is executed via command line script

In [4]:
# Open a database connection through the project's db helper.
# NOTE(review): the two returned values are presumably a connection and a
# cursor — confirm against lib.database_module. No later cell in this part of
# the notebook reads `command` or `cur`.
command, cur = db.connect_to_postgres()

Connected to server joshuacook.me.


### Acquire: Get categories to query from a yaml file

In [5]:
import yaml

In [6]:
# Read the list of Wikipedia categories to download from the project's YAML
# config. yaml.safe_load is used instead of a bare yaml.load(file): calling
# yaml.load without an explicit Loader is deprecated since PyYAML 5.1 and
# rejected by PyYAML 6, and on older versions the default loader can construct
# arbitrary Python objects. The config only holds plain lists and strings,
# which safe_load parses to the same dict.
with open('data/categories.yml') as categories_file:
    categories = yaml.safe_load(categories_file)

In [7]:
# Display the parsed YAML to confirm which categories will be queried.
categories

{'categories': ['Submarine sandwich restaurants', 'Belief']}

### Acquire: Query Wikipedia by category for pages

In [8]:
# Accumulates one query result per category.
responses = []

# NOTE: the [:1] slice restricts the download to the first category in the
# YAML list; drop it to query every configured category.
for cat in categories['categories'][:1]:
    # `response` stays bound after the loop and is read again by later cells.
    response = wiki.query_category(cat)
    responses.append(response)

In [9]:
# Collect the 'categoryid' field of every category response; the bare
# expression on the last line displays the resulting list.
cat_ids = [response['categoryid'] for response in responses]
cat_ids

['24580905']

In [10]:
# Pair each category name with its id, position by position. zip stops at the
# shorter input, so only the categories that were actually queried get a pair.
zipped = list(zip(categories['categories'], cat_ids))
zipped

[('Submarine sandwich restaurants', '24580905')]

In [11]:
# Echo each (name, id) pair, then hand it to the database helper, which takes
# the id first and the name second.
for cat_name, cat_id in zipped:
    print((cat_name, cat_id))
    db.create_or_update_category_in_database(cat_id, cat_name)

('Submarine sandwich restaurants', '24580905')
Connected to server joshuacook.me.


In [12]:
# Tabulate the pages of the first category response and preview the top rows.
first_category_pages = pd.DataFrame(responses[0]['pages'])
first_category_pages.head()

Unnamed: 0,pageid,title
0,43460147,List of submarine sandwich restaurants
1,31940835,Big Bite Submarines
2,32012383,Big John Steak & Onion
3,2371008,Blimpie
4,5778816,Capriotti's


### Acquire: Write category info to database

In [13]:
# Category id written for the "breads" category and for every page row below.
breads_category_id = '839059'

category = wiki.query_category("breads")
db.create_or_update_category_in_database(breads_category_id, 'breads')

# Fetch each page of the category by id and store its summary and text.
pages = category['pages']
for page in pages:
    page_id = page['pageid']
    print(page_id)
    page_res = wiki.query_page(str(page_id))
    db.create_or_update_page_in_database(
        page_res['pageid'], breads_category_id,
        page_res['summary'], page_res['text'])

Connected to server joshuacook.me.
36969
Connected to server joshuacook.me.
22459546
Connected to server joshuacook.me.
14479347
Connected to server joshuacook.me.
1702260
Connected to server joshuacook.me.
18488345
Connected to server joshuacook.me.
856952
Connected to server joshuacook.me.
3025935
Connected to server joshuacook.me.
194685
Connected to server joshuacook.me.
857888
Connected to server joshuacook.me.
1681631
Connected to server joshuacook.me.
38675838
Connected to server joshuacook.me.
2977753
Connected to server joshuacook.me.
49961869
Connected to server joshuacook.me.
5697855
Connected to server joshuacook.me.
42277357
Connected to server joshuacook.me.
6036209
Connected to server joshuacook.me.
17945187
Connected to server joshuacook.me.
41220266
Connected to server joshuacook.me.
621040
Connected to server joshuacook.me.
1277062
Connected to server joshuacook.me.
968862
Connected to server joshuacook.me.
578553
Connected to server joshuacook.me.
866956
Connected to

### Acquire: Query Wikipedia by page id for content

In [23]:
# Fetch a single page by its Wikipedia page id (passed as a string), keep its
# 'text' field in a dedicated variable, and display it.
page_response = wiki.query_page("31940835")
pg_31940835_txt = page_response['text']
pg_31940835_txt

u'Big Bite Submarines is a Norwegian fastfood franchise focusing on fresh subs, wraps and salads. The first restaurant was opened in 1997. The stores are typically located in shopping malls. The chain operates 53 restaurants all over Norway. External links official website'

In [15]:
# List the fields available on a page query result.
page_response.keys()

['pageid', 'summary', 'text', 'html']

In [24]:
# Store the page fetched above (id 31940835) under category 24580905.
# The third argument used to be the literal 'breads', which has nothing to do
# with this page; it now carries the page's own summary, the same field the
# breads bulk-load cell passes in that position (page_res['summary']).
# NOTE(review): assumes the third parameter is the page summary — confirm
# against lib.database_module.
db.create_or_update_page_in_database(31940835,
                                     24580905,
                                     page_response['summary'],
                                     page_response['text'])

Connected to server joshuacook.me.


'OK'

In [25]:
# Display the category query result. `response` is not assigned in this cell;
# it is whatever the category query loop earlier in the notebook left bound.
response

{'categoryid': '24580905',
 'pages': [{'pageid': 43460147,
   'title': 'List of submarine sandwich restaurants'},
  {'pageid': 31940835, 'title': 'Big Bite Submarines'},
  {'pageid': 32012383, 'title': 'Big John Steak & Onion'},
  {'pageid': 2371008, 'title': 'Blimpie'},
  {'pageid': 5778816, 'title': "Capriotti's"},
  {'pageid': 6586487, 'title': "Charley's Grilled Subs"},
  {'pageid': 41988612, 'title': 'Template:Cheesesteak'},
  {'pageid': 5990698, 'title': 'Cousins Subs'},
  {'pageid': 2321267, 'title': "D'Angelo Sandwich Shops"},
  {'pageid': 49400668, 'title': "Dalessandro's Steaks"},
  {'pageid': 11588636, 'title': "DiBella's"},
  {'pageid': 2637586, 'title': 'Earl of Sandwich (restaurant)'},
  {'pageid': 6184077, 'title': "Erbert & Gerbert's"},
  {'pageid': 1419741, 'title': 'Firehouse Subs'},
  {'pageid': 1079651, 'title': "Geno's Steaks"},
  {'pageid': 8947332, 'title': "Jerry's Subs & Pizza"},
  {'pageid': 5080995, 'title': "Jersey Mike's Subs"},
  {'pageid': 41981819, 'titl

### Acquire: Write page content to database

In [26]:
# List the top-level fields of the category query result.
response.keys()

['categoryid', 'pages']

In [27]:
# Inspect the first page entry to see the per-page fields.
response['pages'][0]

{'pageid': 43460147, 'title': 'List of submarine sandwich restaurants'}

In [29]:
# Split the category response into two parallel lists: page ids and titles.
# Each page dict is read directly from the iteration instead of re-indexing
# response['pages'] with a manually incremented counter.
pageid_list = [page['pageid'] for page in response['pages']]
title_list = [page['title'] for page in response['pages']]

print(pageid_list)

[43460147, 31940835, 32012383, 2371008, 5778816, 6586487, 41988612, 5990698, 2321267, 49400668, 11588636, 2637586, 6184077, 1419741, 1079651, 8947332, 5080995, 41981819, 1703557, 50192420, 45068874, 10405436, 11027136, 4828182, 3494709, 447854, 828487, 5052187, 3552347, 3326158, 1617094, 243649, 2959119, 13705413, 6793467, 49401105, 6834956, 54581, 1352724, 41991980, 18986355, 3175570, 34221056]


In [30]:
# Echo the page titles collected from the category response.
print(title_list)

['List of submarine sandwich restaurants', 'Big Bite Submarines', 'Big John Steak & Onion', 'Blimpie', "Capriotti's", "Charley's Grilled Subs", 'Template:Cheesesteak', 'Cousins Subs', "D'Angelo Sandwich Shops", "Dalessandro's Steaks", "DiBella's", 'Earl of Sandwich (restaurant)', "Erbert & Gerbert's", 'Firehouse Subs', "Geno's Steaks", "Jerry's Subs & Pizza", "Jersey Mike's Subs", "Jim's Steaks", "Jimmy John's", "Joe's Steaks + Soda Shop", "John's Roast Pork", "Larry's Giant Subs", "Lenny's Sub Shop", "Milio's Sandwiches", "Moe's Italian Sandwiches", 'Mr. Sub', "Pat's King of Steaks", 'Penn Station (restaurant)', 'Planet Sub', 'Port of Subs', 'Potbelly Sandwich Works', 'Quiznos', "Schlotzsky's", 'Spicy Pickle', 'Steak Escape', "Steve's Prince of Steaks", 'Submarina', 'Subway (restaurant)', "Togo's", "Tony Luke's", "Tubby's", 'Upper Crust (restaurant chain)', "Zero's Subs"]


In [32]:
# Pair every page id with its title, position by position, and echo the pairs.
zipped_title_pageid = list(zip(pageid_list, title_list))
print(zipped_title_pageid)

[(43460147, 'List of submarine sandwich restaurants'), (31940835, 'Big Bite Submarines'), (32012383, 'Big John Steak & Onion'), (2371008, 'Blimpie'), (5778816, "Capriotti's"), (6586487, "Charley's Grilled Subs"), (41988612, 'Template:Cheesesteak'), (5990698, 'Cousins Subs'), (2321267, "D'Angelo Sandwich Shops"), (49400668, "Dalessandro's Steaks"), (11588636, "DiBella's"), (2637586, 'Earl of Sandwich (restaurant)'), (6184077, "Erbert & Gerbert's"), (1419741, 'Firehouse Subs'), (1079651, "Geno's Steaks"), (8947332, "Jerry's Subs & Pizza"), (5080995, "Jersey Mike's Subs"), (41981819, "Jim's Steaks"), (1703557, "Jimmy John's"), (50192420, "Joe's Steaks + Soda Shop"), (45068874, "John's Roast Pork"), (10405436, "Larry's Giant Subs"), (11027136, "Lenny's Sub Shop"), (4828182, "Milio's Sandwiches"), (3494709, "Moe's Italian Sandwiches"), (447854, 'Mr. Sub'), (828487, "Pat's King of Steaks"), (5052187, 'Penn Station (restaurant)'), (3552347, 'Planet Sub'), (3326158, 'Port of Subs'), (1617094, 

In [33]:
# Fetch the content of every page in the category and store it under the
# first queried category id (converted to int for the database call).
# NOTE(review): the page title is passed as the third argument here, whereas
# the breads cell passes the page summary in that position — confirm which
# one the database helper expects.
for page_id, page_title in zipped_title_pageid:
    page_response = wiki.query_page(str(page_id))
    db.create_or_update_page_in_database(
        page_id,
        int(cat_ids[0]),
        page_title,
        page_response['text'])

Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server joshuacook.me.
Connected to server 