In [1]:
import requests
import pandas as pd
import json
import csv
import re

In [2]:
def det_type(da):
    #goal: determine the type and create a list of index for the kanji, radicals, and vocab in a lesson
    #input: da - data pulled from wanikani
    #output: k - list of kanji indexes in lesson
    #        r - list of radical indexes in lesson
    #        v - list of vocab indexes in lesson
    
    count = 0       #initialize index reference
    k=[]            #initialize list for each type
    r=[]
    v=[]
    for i in da:            #go through each item in the data; determine the type; add the index to the appropriate list
        data = i['data']
        #print(i['id'])
        if i['object'] == 'kanji':
            k.append(count)
        elif i['object'] == 'vocabulary':
            v.append(count)
        elif i['object'] == 'radical':
            r.append(count)
        else:                               #added as a precaution in case there are any items without a type
            print('extra count' +count)
        count = count+1
    return k, r, v

In [3]:
def create_kanjidf(k, da):
    #goal: create the kanji dataframe with all of the kanji for one lesson and clean it up so it can easily be turned into a deck in anki
    #input: k - list of indexes for the kanji in the lesson
    #       da - the data pulled from wanikani
    #output: kanjid - the cleaned up dataframe with all the necessary information needed to create a useful anki deck
    
    kanjid = pd.DataFrame(columns = ['id','object','updated','url','level','characters','meanings','aux_mean',
                                 'readings','lesson_pos','mean_mne','read_mne','mean_hint','read_hint'])       #create the kanji dataframe with column names
    
    #minimum = 100
    #maximum = -100

    for kanji in k:  #go through each kanji to pull out the important data
        
        #print("kanji meaning length" + str(len(da[kanji]['data']['meanings'])))         #Determine the max and min number of meanings for kanji in current lesson
        #if len(da[kanji]['data']['meanings']) < minimum:
        #    minimum = len(da[kanji]['data']['meanings'])
        #if len(da[kanji]['data']['meanings']) > maximum:
        #    maximum = len(da[kanji]['data']['meanings'])
        
        meaning = []
        readings = []
        aux_mean = []
        for item in da[kanji]['data']['meanings']:
            meaning.append(item['meaning'])
        for itm in da[kanji]['data']['readings']:
            if itm['accepted_answer']:
                readings.append(itm['reading'])
        if da[kanji]['data']['auxiliary_meanings']:
            for i in da[kanji]['data']['auxiliary_meanings']:
                if i['type'] == 'whitelist':
                    aux_mean.append(i['meaning'])

        
        kanjid = kanjid.append({'id': da[kanji]['id'],                          #append the important data to the dataframe created above
                                  'object':da[kanji]['object'],
                                  'updated': da[kanji]['data_updated_at'],
                                  'level': da[kanji]['data']['level'],
                                  'url': da[kanji]['data']['document_url'], 
                                  'characters': da[kanji]['data']['characters'],
                                  'meanings':meaning,
                                  'aux_mean': aux_mean,
                                  'readings': readings,
                                  'lesson_pos': da[kanji]['data']['lesson_position'],
                                  'mean_mne': da[kanji]['data']['meaning_mnemonic'],
                                  'read_mne': da[kanji]['data']['reading_mnemonic'],
                                  'mean_hint': da[kanji]['data']['meaning_hint'],
                                  'read_hint': da[kanji]['data']['reading_hint']
                                 },
                       ignore_index=True)

    #print("Kanji \nmax = " + str(maximum) + " minimum = " + str(minimum)) 
    return kanjid

In [4]:
def create_vocabdf(v, da):
    #goal: create the vocab dataframe with all of the vocab for one lesson and clean it up so it can easily be turned into a deck in anki
    #input: v - list of indexes for the vocab in the lesson
    #       da - the data pulled from wanikani
    #output: vocabd - the cleaned up dataframe with all the necessary information needed to create a useful anki deck
    
    vocabd = pd.DataFrame(columns = ['id','object','updated','url','level','characters','meanings','aux_mean','readings',
                                 'lesson_pos','part_sp','mean_mne','read_mne','context_sent_ja1', 'context_sent1_en1', 
                                     'context_sent1_ja2', 'context_sent1_en2', 'context_sent1_ja3', 'context_sent1_en3','audio'])             #create the vobab dataframe with column names
    maximum = -100
    minimum = 100
    for vocab in v:         #go through each vocab to pull out the important data
        
        if len(da[vocab]['data']['auxiliary_meanings']) < minimum:       #Determine the max and min number of meanings for vocab in current lesson
            minimum = len(da[vocab]['data']['auxiliary_meanings'])
        if len(da[vocab]['data']['auxiliary_meanings']) > maximum:
            maximum = len(da[vocab]['data']['auxiliary_meanings'])
        
        meaning = []
        readings = []
        aux_mean = []
        for item in da[vocab]['data']['meanings']:
            meaning.append(item['meaning'])
        for itm in da[vocab]['data']['readings']:
            if itm['accepted_answer']:
                readings.append(itm['reading'])
        if da[vocab]['data']['auxiliary_meanings']:
            for i in da[vocab]['data']['auxiliary_meanings']:
                if i['type'] == 'whitelist':
                    aux_mean.append(i['meaning'])
                
        if len(da[vocab]['data']['context_sentences']) == 1:
                vocabd = vocabd.append({'id': da[vocab]['id'],              #append the important data to the dataframe created above
                              'object':da[vocab]['object'],
                              'updated': da[vocab]['data_updated_at'],
                              'level': da[vocab]['data']['level'],
                              'url': da[vocab]['data']['document_url'], 
                              'characters': da[vocab]['data']['characters'],
                              'meanings': meaning,
                              'aux_mean': aux_mean,
                              'readings': readings,
                              'lesson_pos': da[vocab]['data']['lesson_position'],
                              'part_sp': da[vocab]['data']['parts_of_speech'],
                              'mean_mne': da[vocab]['data']['meaning_mnemonic'],
                              'read_mne': da[vocab]['data']['reading_mnemonic'],
                              'context_sent_ja1': da[vocab]['data']['context_sentences'][0]['ja'],
                              'context_sent_en1': da[vocab]['data']['context_sentences'][0]['en'],
                              'audio': da[vocab]['data']['pronunciation_audios']
                              },
                   ignore_index=True)
        elif len(da[vocab]['data']['context_sentences']) == 2:
            
            vocabd = vocabd.append({'id': da[vocab]['id'],              #append the important data to the dataframe created above
                                  'object':da[vocab]['object'],
                                  'updated': da[vocab]['data_updated_at'],
                                  'level': da[vocab]['data']['level'],
                                  'url': da[vocab]['data']['document_url'], 
                                  'characters': da[vocab]['data']['characters'],
                                  'meanings': meaning,
                                  'aux_mean': aux_mean,
                                  'readings': readings,
                                  'lesson_pos': da[vocab]['data']['lesson_position'],
                                  'part_sp': da[vocab]['data']['parts_of_speech'],
                                  'mean_mne': da[vocab]['data']['meaning_mnemonic'],
                                  'read_mne': da[vocab]['data']['reading_mnemonic'],
                                  'context_sent_ja1': da[vocab]['data']['context_sentences'][0]['ja'],
                                  'context_sent_en1': da[vocab]['data']['context_sentences'][0]['en'],
                                  'context_sent_ja2': da[vocab]['data']['context_sentences'][1]['ja'],
                                  'context_sent_en2': da[vocab]['data']['context_sentences'][1]['en'],
                                  'audio': da[vocab]['data']['pronunciation_audios']
                                  },
                       ignore_index=True)
        else:
            vocabd = vocabd.append({'id': da[vocab]['id'],              #append the important data to the dataframe created above
                                  'object':da[vocab]['object'],
                                  'updated': da[vocab]['data_updated_at'],
                                  'level': da[vocab]['data']['level'],
                                  'url': da[vocab]['data']['document_url'], 
                                  'characters': da[vocab]['data']['characters'],
                                  'meanings': meaning,
                                  'aux_mean': aux_mean,
                                  'readings': readings,
                                  'lesson_pos': da[vocab]['data']['lesson_position'],
                                  'part_sp': da[vocab]['data']['parts_of_speech'],
                                  'mean_mne': da[vocab]['data']['meaning_mnemonic'],
                                  'read_mne': da[vocab]['data']['reading_mnemonic'],
                                  'context_sent_ja1': da[vocab]['data']['context_sentences'][0]['ja'],
                                  'context_sent_en1': da[vocab]['data']['context_sentences'][0]['en'],
                                  'context_sent_ja2': da[vocab]['data']['context_sentences'][1]['ja'],
                                  'context_sent_en2': da[vocab]['data']['context_sentences'][1]['en'],
                                  'context_sent_ja3': da[vocab]['data']['context_sentences'][2]['ja'],
                                  'context_sent_en3': da[vocab]['data']['context_sentences'][2]['ja'],
                                  'audio': da[vocab]['data']['pronunciation_audios']
                                  },
                       ignore_index=True)

    #print("Vocab \nmax = " + str(maximum) + " minimum = " + str(minimum))
    return vocabd#, maximum, minimum
    #print(vocabdf)

In [5]:
def create_radicaldf(r, da):
    #goal: create the radical dataframe with all of the radicals for one lesson and clean it up so it can easily be turned into a deck in anki
    #input: r - list of indexes for the radicals in the lesson
    #       da - the data pulled from wanikani
    #output: radicald - the cleaned up dataframe with all the necessary information needed to create a useful anki deck
    
    radicald = pd.DataFrame(columns = ['id','object','updated','url','level','characters','meanings','lesson_pos',
                                   'mean_mne'])                                                                #create the radical dataframe with column names
    #maximum = -100
    #minimum = 100
    
    for radical in r:           #go through each radical to pull out the important data
        
        #if len(da[radical]['data']['meanings']) < minimum:             #Determine the max and min number of meanings for radicals in current lesson
        #    minimum = len(da[radical]['data']['meanings'])
        #if len(da[radical]['data']['meanings']) > maximum:
        #    maximum = len(da[radical]['data']['meanings'])
        
        if da[radical]['data']['characters'] != None:               #append the important data to the dataframe created above
            radicald = radicald.append({'id': da[radical]['id'],
                                      'object':da[radical]['object'],
                                      'updated': da[radical]['data_updated_at'],
                                      'level': da[radical]['data']['level'],
                                      'url': da[radical]['data']['document_url'], 
                                      'characters': da[radical]['data']['characters'],
                                      'meanings':da[radical]['data']['meanings'][0]['meaning'],
                                      'lesson_pos': da[radical]['data']['lesson_position'],
                                      #'char_img': da[radical]['data']['character_images'][0]['url'],
                                      'mean_mne': da[radical]['data']['meaning_mnemonic']
                                     },
                       ignore_index=True)
        else:
            radicald = radicald.append({'id': da[radical]['id'],
                                      'object':da[radical]['object'],
                                      'updated': da[radical]['data_updated_at'],
                                      'level': da[radical]['data']['level'],
                                      'url': da[radical]['data']['document_url'][0], 
                                      'characters': da[radical]['data']['characters'],
                                      'meanings':da[radical]['data']['meanings'][0]['meaning'],
                                      'lesson_pos': da[radical]['data']['lesson_position'],
                                      'char_img': da[radical]['data']['character_images'][0]['url'],
                                      'mean_mne': da[radical]['data']['meaning_mnemonic']
                                     },
                       ignore_index=True)
        
    #print("Radical \nmax = " + str(maximum) + " minimum = " + str(minimum))
    #print(radicaldf)
    return radicald

In [6]:
def get_data(level):
    #goal: pull the data for the selected level from the wanikani api
    #input: level - the desired level to pull the data from
    #output: dja - the data from the reuest returned in an easy way to extract the desired information
    
    response = requests.get("https://api.wanikani.com/v2/subjects?levels="+ str(level), headers = {"Authorization": "Bearer c61d70a7-c134-4b4a-bb05-f97a7c59af8b"})
    dj = response.json()    #retrieve the json from the website
    dja = dj['data']     #use only the data returned 
    
    return dja
    

In [7]:
levelst = 1             #level to start pulling data from
levele = 60             #level to stop pulling data from
kanjidf = pd.DataFrame(columns = ['id','object','updated','url','level','characters','meanings','aux_mean',
                                 'readings','lesson_pos','mean_mne','read_mne','mean_hint','read_hint'])       #create the kanji dataframe with column names
vocabdf = pd.DataFrame(columns = ['id','object','updated','url','level','characters','meanings','aux_mean','readings',
                                 'lesson_pos','part_sp','mean_mne','read_mne','context_sent_ja1', 'context_sent_en1', 
                                     'context_sent_ja2', 'context_sent_en2', 'context_sent_ja3', 'context_sent_en3','audio'])             #create the vobab dataframe with column names
radicaldf = pd.DataFrame(columns = ['id','object','updated','url','level','characters','meanings','lesson_pos', 'char_img',
                                   'mean_mne'])                                                                #create the radical dataframe with column names
#kmax = -100
#kmin = 100
#vmax = -100
#vmin = 100
#rmax = -100
#rmin = 100
for lv in range(levelst,levele+1):      #go through the levels
    print(lv)                           #track the progress of the loop
    
    data = get_data(lv)                 #call the function to get the data for the desired level
    kan, rad, voc = det_type(data)      #determine the type of each element(kanji, radical, vocabulary)
    
    #tempkan, kanmaxi, kanmini = create_kanjidf(kan, data)       #temporary dataframe to help in analyzing the data to be used for the anki port
    #if kmax < kanmaxi:
    #    kmax = kanmaxi
    #if kmin > kanmini:
    #    kmin = kanmini
    #tempvoc, vocmaxi, vocmini = create_vocabdf(voc, data)
    #if vmax < vocmaxi:
    #    vmax = vocmaxi
    #if vmin > vocmini:
    #    vmin = vocmini
    #temprad, radmaxi, radmini = create_radicaldf(rad, data)
    #if rmax < radmaxi:
    #    rmax = radmaxi
    #if rmin > radmini:
    #    rmin = radmini
    kanjidf = kanjidf.append(create_kanjidf(kan,data))         #call the function to get the data for the kanji dataframe for the current level and append it to the previous levels
    vocabdf = vocabdf.append(create_vocabdf(voc, data))        #call the function to get the data for the vocab dataframe for the current level and append it to the previous levels
    radicaldf = radicaldf.append(create_radicaldf(rad, data))   #call the function to get the data for the radical dataframe for the current level and append it to the previous levels
    
#print(#"\nkanji\nmax = " + str(kmax) + " min = " + str(kmin) +              max = 5 min = 1
#      "\nvocab\nmax = " + str(vmax) + " min = " + str(vmin))# +              max = 9 min = 1
#      "\nradical\nmax = " + str(rmax) + " min = " + str(rmin))             max = 1 min = 1

1


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60


In [8]:
kanjidf.to_csv('kanjidf.csv')
vocabdf.to_csv('vocabdf.csv')
radicaldf.to_csv('radicaldf.csv')

In [9]:
#print(radicaldf)
#print(vocabdf['aux_mean']) #fix separate urls in audio
#print(kanjidf) #meanings, aux_mean, readings

In [10]:
#code to pull images from the wanikani website
#file = requests.get('https://cdn.wanikani.com/audios/27959-subject-2467.ogg?1553788696', headers = {"Authorization": "Bearer c61d70a7-c134-4b4a-bb05-f97a7c59af8b"})
#with open('/Users/peace/Downloads/audio.mp3', 'wb') as f:
#    f.write(file.content)

In [11]:
#dt = get_data(1)
#k,r,v = det_type(dt)
#raddf = pd.DataFrame(columns = ['id','object','updated','url','level','characters','meanings','lesson_pos', 'char_img',
#                                   'mean_mne'])                                                                #create the radical dataframe with column names
#for item in r:
#    print(1)
#    print(dt[item]['data']['character_images'])
#    print(dt[item]['data']['characters'])
#    if dt[item]['data']['meanings']['primary'] == True:
#    print(dt[item]['data']['meanings'])
#    print(dt[item]['data'])
#raddf = raddf.append(create_radicaldf(r, dt))

In [12]:
#for lv in range(levelst,levele+1):
#    response = requests.get("https://api.wanikani.com/v2/subjects?levels="+ str(lv), headers = {"Authorization": "Bearer c61d70a7-c134-4b4a-bb05-f97a7c59af8b"})
#    dj = response.json()    #retrieve the json from the website
#    dja = dj['data']     #use only the data returned 
#    print(dja[0]['data']['level'])