In [1]:
import requests
from pprint import pprint
from prettytable import PrettyTable

In [2]:
# Base URLs. `ka_api` is not referenced by any cell shown in this notebook;
# `ka` is the host the request below (and `structure`) builds URLs from.
ka_api = 'http://api-explorer.khanacademy.org'
ka = 'http://khanacademy.org'

# API endpoint paths, appended to `ka`.
exercises = '/api/v1/exercises'
topictree = '/api/v1/topictree'

req = '{}{}'.format(ka, topictree)
print('Requesting', req)

# requests applies no timeout by default, so a stalled connection would hang
# the kernel forever. The tuple is (connect timeout, read timeout) in seconds;
# the read timeout bounds the gap between received bytes, not the whole
# download, so the very large topic-tree payload still has time to arrive.
r = requests.get(req, timeout=(10, 300))
# Fail here, with the HTTP status, rather than later when parsing the body.
r.raise_for_status()

Requesting http://khanacademy.org/api/v1/topictree


In [3]:
len(r.text)

109203939

In [4]:
resp = r.json()

In [5]:
len(resp)

47

In [6]:
type(resp)

dict

In [7]:
# Tabulate every top-level key of the response next to the type of its value.
table = PrettyTable()
table.field_names = ['Key', 'Type']
for field, value in resp.items():
    table.add_row([field, type(value)])

print(table)

+----------------------------------+--------------------+
|               Key                |        Type        |
+----------------------------------+--------------------+
|            gplus_url             |   <class 'str'>    |
| has_user_authored_content_types  |   <class 'bool'>   |
|         standalone_title         |   <class 'str'>    |
|               hide               |   <class 'bool'>   |
|           content_kind           |   <class 'str'>    |
|           domain_slug            | <class 'NoneType'> |
|              ka_url              |   <class 'str'>    |
|         translated_title         |   <class 'str'>    |
|     background_image_caption     |   <class 'str'>    |
|    has_peer_reviewed_projects    |   <class 'bool'>   |
|       background_image_url       |   <class 'str'>    |
|             icon_src             |   <class 'str'>    |
|           endorsement            | <class 'NoneType'> |
|         alternate_slugs          |   <class 'list'>   |
|          log

In [8]:
len(resp['children'])

14

In [9]:
resp['children'][0].keys()

dict_keys(['gplus_url', 'has_user_authored_content_types', 'standalone_title', 'hide', 'content_kind', 'domain_slug', 'ka_url', 'translated_title', 'background_image_caption', 'has_peer_reviewed_projects', 'background_image_url', 'icon_src', 'endorsement', 'alternate_slugs', 'logo_image_url', 'extended_slug', 'web_url', 'node_slug', 'relative_url', 'deleted_mod_time', 'render_type', 'description', 'listed_locales', 'branding_image_url', 'topic_page_url', 'user_authored_content_types', 'children', 'do_not_publish', 'kind', 'slug', 'id', 'child_data', 'translated_standalone_title', 'twitter_url', 'title', 'facebook_url', 'user_authored_content_types_info', 'tags', 'creation_date', 'curriculum_key', 'deleted', 'translated_description', 'sha', 'in_knowledge_map', 'current_revision_key', 'content_id', 'curation'])

In [10]:
def structure(d, spacing='', level=0):
    """Print an indented outline of a topic-tree node and its descendants.

    For each visited node the slug, the full URL (module-level ``ka`` plus the
    node's ``relative_url``) and the title are printed. Only levels 0, 1 and 2
    are shown; anything deeper is skipped.

    Args:
        d: Node dict with ``node_slug``, ``relative_url`` and ``title`` keys,
            and optionally a ``children`` list of nodes of the same form.
        spacing: Prefix printed before each line of this node; grows by three
            spaces per level.
        level: Depth of ``d`` relative to the node the outline started from.

    Returns:
        None. The outline is written to standard output.
    """
    if level <= 2:
        print(spacing, d['node_slug'])
        print(spacing, '  url:', ka + d['relative_url'])
        print(spacing, '  title:', d['title'])

        if 'children' in d:
            deeper = spacing + '   '
            for child in d['children']:
                structure(child, deeper, level + 1)

In [11]:
# Print the slug of each node directly under the root of the topic tree.
for domain in resp['children']:
    print(domain['node_slug'])

math
science
economics-finance-domain
humanities
computing
test-prep
educator-test
partner-content
talks-and-interviews
college-careers-more
talent-search
resources
mappers
kmap


In [18]:
def get_section_by_slug(d, slug):
    """Find the first node in a topic tree whose ``node_slug`` equals ``slug``.

    The search is depth-first and visits a node before its children, with
    children taken in list order.

    Args:
        d: Node dict with a ``node_slug`` key and optionally a ``children``
            list of nodes of the same form.
        slug: The ``node_slug`` value to look for.

    Returns:
        The matching node dict (the same object that is in the tree), or
        ``None`` when no node under ``d`` has that slug.
    """
    if d['node_slug'] == slug:
        return d

    # A node without a 'children' key simply has nothing further to search.
    for child in d.get('children', ()):
        found = get_section_by_slug(child, slug)
        if found:
            return found

    return None

In [19]:
em = get_section_by_slug(resp, 'early-math')

In [20]:
type(em)

dict

In [22]:
# For the top-level node whose slug is 'math', print its title and slug
# followed by the title and slug of each of its direct children.
for domain in resp['children']:
    if domain['slug'] == 'math':
        print('{} [slug: {}]'.format(domain['title'], domain['slug']))
        print('Sections:')
        for sub in domain['children']:
            print('\t{} [section slug: {}]'.format(sub['title'], sub['slug']))
        print()

Math [slug: math]
Sections:
	K-8th grades [section slug: k-8-grades]
	Eureka Math/EngageNY support [section slug: engageny]
	Remediation support for Eureka Math/EngageNY [section slug: topic-foundations-engageny]
	On-grade support for Eureka Math/EngageNY [section slug: on-grade-engageny]
	High school math [section slug: math-1-2-3]
	Class 5 math (India) [section slug: in-fifth-grade-math]
	Class 6 math (India)  [section slug: in-in-class-6th-math-cbse]
	Class 7 math (India) [section slug: in-in-class-7th-math-cbse]
	Class 8 math (India) [section slug: in-in-class-8th-math-cbse]
	Class 9 math (India) [section slug: in-in-grade-9-ncert]
	Class 10 math (India) [section slug: in-in-grade-10-ncert]
	Class 11 math (India) [section slug: in-in-grade-11-ncert]
	Class 12 math (India) [section slug: in-in-grade-12-ncert]
	Class 6 Math (India) - Hindi  [section slug: in-in-class-6-math-cbse-hindi]
	Class 7 Math (India) - Hindi [section slug: in-in-class-7th-math-cbse-hindi]
	Class 8 Math (India)

In [57]:
math = get_section_by_slug(resp, 'math')

In [58]:
# Same key/type overview as for the root response, this time for the math node.
table = PrettyTable()
table.field_names = ['Key', 'Type']
for field, value in math.items():
    table.add_row([field, type(value)])

print(table)

+----------------------------------+--------------------+
|               Key                |        Type        |
+----------------------------------+--------------------+
|            gplus_url             |   <class 'str'>    |
| has_user_authored_content_types  |   <class 'bool'>   |
|         standalone_title         |   <class 'str'>    |
|               hide               |   <class 'bool'>   |
|           content_kind           |   <class 'str'>    |
|           domain_slug            |   <class 'str'>    |
|              ka_url              |   <class 'str'>    |
|         translated_title         |   <class 'str'>    |
|     background_image_caption     |   <class 'str'>    |
|    has_peer_reviewed_projects    |   <class 'bool'>   |
|       background_image_url       |   <class 'str'>    |
|             icon_src             |   <class 'str'>    |
|           endorsement            | <class 'NoneType'> |
|         alternate_slugs          |   <class 'list'>   |
|          log

In [59]:
early_math = get_section_by_slug(resp, 'early-math')

In [60]:
type(early_math)

dict

In [None]:
structure(early_math)

In [28]:
def extract_video_lengths(d):
    """Collect ``(youtube_id, duration)`` pairs for the videos under a node.

    Args:
        d: Topic-tree node dict. A node without a ``children`` entry is
            treated as a leaf.

    Returns:
        For a leaf whose ``kind`` is ``'Video'``: the tuple
        ``(d['youtube_id'], d['duration'])``.
        For a node with children: a list holding the result for each child in
        order, so the nesting of the lists mirrors the nesting of the tree.
        For any other leaf: an empty list.
    """
    children = d.get('children')
    if children is None:
        if d.get('kind') == 'Video':
            return (d['youtube_id'], d['duration'])
        # Non-video leaf: it contributes no videos. Falling through to
        # d['children'] here would raise KeyError.
        return []

    return [extract_video_lengths(child) for child in children]

In [29]:
vids = extract_video_lengths(early_math)

In [30]:
len(vids)

8

In [31]:
vids

[[[('y2-uaPiyoxc', 56), ('PEeUTQ0Gri8', 118)],
  [('9XZypM2Z3Ro', 298), ('1AqkBdCBm9o', 114)],
  [('leDYnoNSvD4', 73), ('I9S5CvSqb5A', 63), ('EUqhLxFccbM', 104)],
  [('__nkbr6DeTg', 123), ('tJrSILRXOUc', 91), ('UA975j_qsTQ', 44)]],
 [[('fsTD_jqseBA', 191), ('AO9bHbUdg-M', 219)],
  [('A-ykhY_IoaU', 246)],
  [('An46SYAxhtc', 114), ('9FC0WT186aY', 120)],
  [('ie0waMJxnTs', 137), ('kpEJwpemL2Q', 135)],
  [('zVLjWIftX_o', 125)],
  [('-3DFzxbP9Fk', 102), ('qSkpZswoZTc', 63)]],
 [[('ourH3ueWNmA', 180), ('zqwVKhQV_2w', 178)],
  [('wx2gI8iwMCA', 359), ('X_PnRFAKbkg', 175), ('-Zlq5tNl94M', 259)],
  [('nFsQA2Zvy1o', 304)],
  [],
  [('9Jg5S7F2SMQ', 124)]],
 [[('ZgzpTx-s9Zo', 233), ('AK0NZITv5Ns', 199), ('Utm875JRi-o', 158)],
  [('9McJ3GobPaY', 243)],
  [('gEGKvx1wlFg', 296)],
  [],
  [('fGok2nHOjnI', 218), ('SfgD7Sm08ns', 332)],
  [('osIFa9zcI-w', 203), ('3rs7sFPoiZ4', 194)],
  [('j5c6pqAP2IA', 129)]],
 [[('mKsKU0BAiRM', 137), ('Dy5uDkOoMNc', 196), ('7_QPAdHILzw', 156)],
  [('8YR1E7XWPz8', 152), (

In [69]:
from collections import deque

def flatten(nested_list):
    """Flatten arbitrarily nested lists into a single list, preserving order.

    Only ``list`` instances are unpacked; every other value (tuples included)
    is kept as a single item. Items appear in the result in the same
    left-to-right order they have in the nested input, whatever their depth.

    Args:
        nested_list: Iterable whose items are either values or (nested) lists.

    Returns:
        A new flat list of the non-list items.
    """
    flat = []
    pending = deque(nested_list)

    while pending:
        item = pending.popleft()
        if isinstance(item, list):
            # Put the sub-list's items back at the FRONT, in their original
            # order, so they are emitted before anything that followed the
            # sub-list. Appending them at the back would produce level order.
            pending.extendleft(reversed(item))
        else:
            flat.append(item)

    return flat
    

In [70]:
import itertools

flat_vids = flatten(vids)

In [71]:
flat_vids[:10]

[('y2-uaPiyoxc', 56),
 ('PEeUTQ0Gri8', 118),
 ('9XZypM2Z3Ro', 298),
 ('1AqkBdCBm9o', 114),
 ('leDYnoNSvD4', 73),
 ('I9S5CvSqb5A', 63),
 ('EUqhLxFccbM', 104),
 ('__nkbr6DeTg', 123),
 ('tJrSILRXOUc', 91),
 ('UA975j_qsTQ', 44)]

In [74]:
from collections import Counter
# Count how often each youtube_id (first tuple element) occurs in the flat list.
Counter(video[0] for video in flat_vids)

Counter({'-3DFzxbP9Fk': 1,
         '-Zlq5tNl94M': 1,
         '0B91xPrwcPE': 1,
         '0lSTXtwPuOU': 1,
         '10dTx1Zy_4w': 2,
         '1AqkBdCBm9o': 1,
         '1dZsuE0vxEI': 1,
         '3rs7sFPoiZ4': 1,
         '4BNIGTHUTTM': 1,
         '7_QPAdHILzw': 1,
         '8YR1E7XWPz8': 1,
         '8mcTsyV56jI': 1,
         '8xbIS2UkQxI': 2,
         '9FC0WT186aY': 1,
         '9Jg5S7F2SMQ': 1,
         '9McJ3GobPaY': 1,
         '9XZypM2Z3Ro': 1,
         'A-ykhY_IoaU': 1,
         'AK0NZITv5Ns': 1,
         'AO9bHbUdg-M': 1,
         'An46SYAxhtc': 1,
         'AtiOjlyOQf4': 1,
         'DvV0e5F98NQ': 1,
         'Dy5uDkOoMNc': 1,
         'DzJvR56Suss': 2,
         'EQrCdEF3vNE': 1,
         'EUqhLxFccbM': 1,
         'EV38zfiY6Vs': 1,
         'Ei5mgFtUGns': 1,
         'FWEqB0J6mgA': 1,
         'G1cNKc3PD74': 1,
         'HN6FNS7lRhw': 2,
         'Hc9mcx739js': 1,
         'I5xcZgyY4ag': 1,
         'I9S5CvSqb5A': 1,
         'JCdbCdwqXbc': 1,
         'K3GV13uokbk': 1,
 

In [35]:
# Map youtube_id -> duration. Keying by id means a video that occurs more than
# once in flat_vids ends up with a single entry.
vids_dict = dict((video[0], video[1]) for video in flat_vids)

In [77]:
# Total duration over the unique videos (one dict entry per youtube_id).
section_duration = sum(vids_dict.values())
section_duration

19284

In [78]:
import datetime
str(datetime.timedelta(seconds=int(section_duration)))

'5:21:24'

<hr>

In [79]:
math_vids = extract_video_lengths(math)

In [81]:
import itertools

math_vids = flatten(math_vids)

In [82]:
len(math_vids)

16503

In [83]:
# Map youtube_id -> duration for the math node; repeated ids collapse to one entry.
math_vids_dict = dict((video[0], video[1]) for video in math_vids)

In [84]:
# Total duration over the unique videos found under the math node.
math_duration = sum(math_vids_dict.values())
math_duration

1766276

In [85]:
str(datetime.timedelta(seconds=int(math_duration)))

'20 days, 10:37:56'

<hr>

In [61]:
em_vids = extract_video_lengths(em)

In [87]:
em_vids = flatten(em_vids)

In [88]:
len(em_vids)

106

In [89]:
# Map youtube_id -> duration for the early-math node; repeated ids collapse to one entry.
em_vids_dict = dict((video[0], video[1]) for video in em_vids)

In [90]:
# Total duration over the unique early-math videos. The built-in sum is used,
# as in the sibling cells: no cell shown here imports numpy, so np.sum raises
# NameError on a fresh kernel.
em_duration = sum(em_vids_dict.values())
em_duration

19284

In [91]:
str(datetime.timedelta(seconds=int(em_duration)))

'5:21:24'