In [1]:
import os
import time
import requests
import base64
import pandas as pd
import json
import config as cfg

In [2]:
header_string = cfg.ravkey['username'] + ':' + cfg.ravkey['password']
auth_header_bytes = base64.urlsafe_b64encode(header_string.encode('utf-8'))
auth_header = str(auth_header_bytes, 'utf-8')

headers = { "Authorization": "Basic " + auth_header }
pattern_attr = requests.get('https://api.ravelry.com/pattern_attributes/groups.json', headers = headers)

In [None]:

pattern_attributes = json.dumps(pattern_attr.json(), sort_keys = True, indent = 2)
print(pattern_attributes)

In [None]:

pattern_cat = requests.get('https://api.ravelry.com/pattern_categories/list.json', headers = headers)
pattern_categories = json.dumps(pattern_cat.json(), sort_keys = True, indent = 2)
print(pattern_categories)

{
            "children": [
              {
                "children": [],
                "id": 304,
                "long_name": "Cardigan",
                "name": "Cardigan",
                "permalink": "cardigan"
              },
              {
                "children": [],
                "id": 306,
                "long_name": "Pullover",
                "name": "Pullover",
                "permalink": "pullover"
              },
              {
                "children": [],
                "id": 897,
                "long_name": "Sweater - Other",
                "name": "Other",
                "permalink": "other-sweater"
              }
            ],
            "id": 319,
            "long_name": "Sweater",
            "name": "Sweater",
            "permalink": "sweater"
          },
          
Chunk containing sweater category and subcategories; 'id': 319, 'name': 'Sweater', 'permalink': 'sweater' are probably the parts I need.

```GET /patterns/search.json?id=319&page_size=200```

Params include query=string (fulltext search), default of page=1, default of page_size=100, personal_attributes=1 to return personal_attributes hash in the result object (i.e. what I have to say about it on Ravelry - do not need this). Returns a pattern (list) which includes the first photo and the permalink.

```GET /patterns/{id}/projects.json```

Retrieve projects associated with a particular pattern id. Default sort order is 'completed', with completion date in descending order. ```photoless=1``` gets projects with photos (I believe). Defaults to page=1, page_size=100

```GET /photos/{id}/sizes.json```

Takes an integer photo id and returns all available sizes? Seems like I can get links from an API request and then I should just scrape pages as per scraping_ravelry.ipynb

In [None]:
pattern_list = requests.get('https://api.ravelry.com/patterns/search.json?id=319&page_size=300', headers = headers)
pattern_list = json.dumps(pattern_list.json(), sort_keys = True, indent = 2)

In [None]:
with open('pattern_list.json', 'w') as fp:
    json.dump(pattern_list, fp)

In [None]:
pat_list = requests.get('https://api.ravelry.com/patterns/search.json?id=319&page_size=300', headers = headers)

In [None]:
# integer pattern id
pat_list['patterns'][1]['id']
# pattern permalink
pat_list['patterns'][1]['permalink']
# designer photo; small2 is 320 on longest side while small is 240 on longest side
pat_list['patterns'][1]['first_photo']['small2_url']
# pattern url
pat_list['patterns'][1]['pattern_sources'][0]['url']

In [None]:
for i in range(300):
    count = 0
    if not pat_list['patterns'][i]['first_photo']['small2_url']:
        count += 1
count

In [None]:
pattern_dict = {}
for i in range(300):
    key = pat_list['patterns'][i]['permalink']
    temp = {}
    temp['id'] = pat_list['patterns'][i]['id']
    temp['url'] = pat_list['patterns'][i]['pattern_sources'][0]['url']
    temp['photo urls'] = [pat_list['patterns'][1]['first_photo']['small2_url']]
    pattern_dict[key] = temp

In [None]:
# /patterns/{id}/projects.json?page_size=20&photoless=1

project_dump = {}

for key in pattern_dict.keys():
    id_num = pattern_dict[key]['id']
    response = requests.get(f'https://api.ravelry.com/patterns/{id_num}/projects.json?page_size=20&photoless=1', headers = headers)
    project_dump[key] = response

In [None]:
pattern_dict['stripes-32']['id']

In [None]:
requests.get('https://api.ravelry.com/patterns/1087512/projects.json', headers = headers)

Poking around on the Ravelry API boards, it looks like this is an issue that other people have also encountered, meaning that I need to go from basic to OAuth authentication.

Additionally, on discussion with Kelsey, it appears that I really need to have a LOT more photos and far fewer categories. (Ideally several times more photos for each category than total number of categories.)
1. Manually chunk/bin based on appearance (very needed as it pulled down patterns of all categories; harumph)
2. Get pictures from pattern page
3. Get more pictures of projects by others

In [None]:
for key in ['stripes-32']:
    file_path = 'img/' + key + '.jpg'
    picture = requests.get(pattern_dict[key]['photo urls'][0], headers = headers)
    with open(file_path, 'wb') as fp:
        fp.write(picture.content)

In [None]:
pattern_dict['stripes-32']['photo urls'][0]

Attempting to get more photos in this fashion resulted in a bunch of duplicates of the same photo over and over and over again (anti-bot measure?). This is maddening.

1. Going to a search page to figure out what the syntax is for searching for sweaters and getting my 300 patterns that way. (it's obviously not ID?)
2. Figuring out what the syntax is for searching for projects by pattern id in case that's a quick way to get the links to user project pages I'll need to scrape.

In [26]:
# per Cassidy (gal behind Ravelry's API):
url = 'https://api.ravelry.com/patterns/search.json?pc=sweater&page_size=250'
patterns = requests.get(url, headers=headers)

# Convert to more readable format and save
pat_list = json.dumps(patterns.json(), sort_keys = True, indent = 2)
with open('pattern_list.json', 'w+') as fp:
    json.dump(pat_list, fp)

# Parsing out the most useful information

pat_list = patterns.json()
pattern_dict = {}
for i in range(250):
    key = pat_list['patterns'][i]['permalink']
    temp = {}
    temp['id'] = pat_list['patterns'][i]['id']
    temp['url'] = pat_list['patterns'][i]['pattern_sources'][0]['url']
    temp['photo urls'] = [pat_list['patterns'][i]['first_photo']['small2_url']]
    pattern_dict[key] = temp

In [15]:
pattern_dict['stripes-32']

{'id': 1087512,
 'url': 'https://www.ravelry.com/stores/dreareneeknits',
 'photo urls': ['https://images4-g.ravelrycache.com/uploads/knitabitknitwit/744588460/upload_small2']}

In [30]:
from fake_useragent import UserAgent
import urllib.request

ua = UserAgent()
user_agent = {'User-agent': ua.random}
for key in pattern_dict.keys():
    train_count = 1
    link = pattern_dict[key]['photo urls'][0]
    file_path = 'img/' + key + '_' + str(train_count).zfill(3) + '.jpg'
    urllib.request.urlretrieve(link, file_path)
    time.sleep(15)

In [31]:
pattern_dict['stripes-32']

{'id': 1087512,
 'url': 'https://www.ravelry.com/stores/dreareneeknits',
 'photo urls': ['https://images4-g.ravelrycache.com/uploads/dreareneeknits/743362891/Custom-8_small2.jpg']}

In [32]:
url = 'https://api.ravelry.com//patterns/1087512/projects.json?page_size=25'
project = requests.get(url, headers=headers)

# Convert to more readable format and save
test_project = json.dumps(project.json(), sort_keys = True, indent = 2)
with open('pattern_list.json', 'w+') as fp:
    json.dump(test_project, fp)


JSONDecodeError: Extra data: line 1 column 5 (char 4)

## What's the problem:
OAuth is fundamentally about a multistep dance: my app has specific permissions from Ravelry, including permissions to muck about with a certain subset of data from a logged-in user. Therefore, I need to be able to actually log in a user before the app has access to data. I'm starting off by using a library (ipyauth) that will allow me to do the whole rigamarole from within Jupyter. If that fails, I'll start scraping procedures and then work on setting up an auth page either locally or on my website.

In [None]:
from ipyauth import Auth
host_url = 'https://www.ravelry.com'
request_url = 'https://www.ravelry.com/oauth/request_token'
access_url = 'https://www.ravelry.com/oauth/access_token'
auth_url = 'https://www.ravelry.com/oauth/authorize'
redirect_uri = 'https://localhost:8888/callback'
a = Auth()

Ooooor that's an almost undocumented library and I don't know enough about OAuth protocols to be able to figure it out from the code. New plan: Postman app for retrieving information via API. Can come back to this later and sort things out if I feel like it's important/necessary.

In [38]:
header_string = cfg.ravkey['username'] + ':' + cfg.ravkey['password']
auth_header_bytes = base64.urlsafe_b64encode(header_string.encode('utf-8'))
auth_header = str(auth_header_bytes, 'utf-8')

headers = { "Authorization": "Basic " + auth_header }

In [40]:
url = 'https://api.ravelry.com/projects/search.json?query=1087512&sort=completed'
project = requests.get(url, headers=headers)
project.content

b'{"projects": [], "paginator": {"page_count":1,"page":1,"page_size":0,"results":0,"last_page":1}}'

In [42]:
pattern_dict

{'womens-knitted-sweater-2': {'id': 1088978,
  'url': 'https://www.novitaknits.com/fi',
  'photo urls': ['https://images4-g.ravelrycache.com/uploads/knitabitknitwit/744566300/upload_small2']},
 'hooded-baby-cardigan-5': {'id': 1089008,
  'url': 'https://www.novitaknits.com/fi',
  'photo urls': ['https://images4-g.ravelrycache.com/uploads/knitabitknitwit/744588460/upload_small2']},
 'love-note': {'id': 927223,
  'url': 'http://www.ravelry.com/stores/tin-can-knits-designs',
  'photo urls': ['https://images4-g.ravelrycache.com/uploads/tincanknits/624440924/TCK-lovenote-01a_small2.jpg']},
 'womens-cabled-cardigan': {'id': 1088911,
  'url': 'https://www.novitaknits.com/fi',
  'photo urls': ['https://images4-g.ravelrycache.com/uploads/knitabitknitwit/744521115/upload_small2']},
 'soldotna-crop': {'id': 910492,
  'url': 'https://www.ravelry.com/stores/boyland-knitworks',
  'photo urls': ['https://images4-f.ravelrycache.com/uploads/boylandknitworks/612664255/Attachment-1_2_small2.jpeg']},
 'th