# Vocab Analysis
## Section 3: Analyze the Data

In [1]:
# Master switch for the plotting cells: the heatmaps and pairplots further down
# are only drawn (and saved to disk) when this is True.
graphs_on = False

### 1. Import necessary libraries

In [2]:
# %load da_vocab_utility.py
# TDD backbone assertion to confirm a function call returns the desired result
def assertEquals(actual, expected, desc):
    """Check that ``actual`` equals ``expected`` and return "OK" on success.

    Args:
        actual: value produced by the code under test.
        expected: value the caller expects.
        desc: text prepended to the failure message (must be a str).

    Returns:
        The string "OK" when the two values compare equal.

    Raises:
        AssertionError: when the values differ. It is raised explicitly rather
            than through an ``assert`` statement so the check still runs when
            Python is started with ``-O`` (which strips ``assert``).
    """
    if not (actual == expected):
        raise AssertionError(
            desc + " result: " + str(actual) + ", expected: " + str(expected)
        )
    return "OK"
	
# check that two lists have the same contents
def lists_equal(a, b):
    """Return whether ``a`` and ``b`` hold the same contents.

    For array-like inputs (numpy arrays, pandas Series) ``a == b`` is an
    element-wise comparison, which is reduced with ``.all()``. For plain Python
    sequences ``a == b`` is already a single bool, which has no ``.all()``
    method, so it is returned directly instead of raising AttributeError.
    """
    comparison = a == b
    if hasattr(comparison, "all"):
        return comparison.all()
    return comparison
	

# shallow check (by row) for duplicates
def has_dupes(df_in):
    """Return True when at least one row of ``df_in`` repeats an earlier row."""
    repeated_rows = df_in[df_in.duplicated()]
    return len(repeated_rows) > 0
	
def print_line_break():
    """Print a horizontal rule made of 75 hyphens."""
    rule_width = 75
    print("-" * rule_width)
	
def print_before_after(b, a, t=""):
    """Print a "Before"/"After" pair framed by line breaks.

    Args:
        b: value shown after the "Before: " label (converted with str()).
        a: value shown after the "After: " label (converted with str()).
        t: optional title; when it is not the empty string it is printed
            first, preceded by its own line break.
    """
    has_title = t != ""
    if has_title:
        print_line_break()
        print(t)
    for label, value in (("Before: ", b), ("After: ", a)):
        print_line_break()
        print(label + str(value))
    print_line_break()
	
def time_it(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and print how long the call took.

    The elapsed time is printed in whole milliseconds. The return value of
    ``func`` is discarded and this function returns None.
    """
    # Imported locally because the visible file never imports `time` at module
    # level, so the call would otherwise fail with NameError.
    import time

    # perf_counter is monotonic, so the result cannot be skewed by
    # system clock adjustments made while func runs.
    start = time.perf_counter()
    func(*args, **kwargs)
    elapsed_ms = (time.perf_counter() - start) * 1000
    # https://stackoverflow.com/questions/8885663/how-to-format-a-floating-number-to-fixed-width-in-python
    print("{:.0f}".format(elapsed_ms) + " miliseconds")
	
def has_dupe_terms(df_in):
    """Return True when any value in the 'Term' column occurs more than once."""
    repeated_terms = df_in[df_in['Term'].duplicated()]
    return len(repeated_terms) > 0
	
def get_rows_by_value_in_col(df_in, value, col):
    """Return the rows of ``df_in`` whose column ``col`` equals ``value``."""
    matches = df_in[col] == value
    return df_in.loc[matches]
	
# Removes duplicate tags from a whitespace-separated tag string
def remove_dupes(t):
    """Return ``t`` lower-cased with repeated tags removed.

    The string is split on whitespace and the surviving tags are re-joined with
    single spaces. Tags keep the order of their first occurrence:
    ``dict.fromkeys`` is used rather than ``set`` because set iteration order
    for strings is arbitrary and can change between interpreter runs, which
    made the output order unpredictable.
    """
    unique_tags = dict.fromkeys(t.lower().split())
    return ' '.join(unique_tags)  # return as string
	
# reports whether a single tag appears as a whole word in a whitespace-separated tags string
def tag_exists(tags, tag):
    """Return 1 if ``tag`` is one of the whitespace-separated items of ``tags``, else 0."""
    return int(tag in tags.split())
	
def is_blank(s):
    """Return True when ``s`` is falsy (e.g. None or "") or contains only whitespace."""
    return not s or not s.strip()
	
def get_frame_of_cards_by_term(df, t):
    """Return the rows of ``df`` whose 'Term' column equals ``t``."""
    term_matches = df['Term'].eq(t)
    return df.loc[term_matches]
	

In [3]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

In [4]:
# Lift the column limit so wide DataFrames are displayed with every column
pd.set_option("display.max_columns", None)

In [5]:
# Running counter used to number saved figures: each plotting cell passes it to
# save_fig_a() and then increments it.
image_cnt = 0

In [6]:
def save_fig_a(x):
    """Save the current matplotlib figure as ``analyze_<x>.png``.

    ``x`` is converted with str() and embedded in the file name; the path is
    relative, so the file is written to the current working directory.
    """
    filename = "".join(["analyze_", str(x), ".png"])
    plt.savefig(filename)

### 2. Import necessary datasets

In [7]:
# Load the datasets prepared earlier from CSV.
# index_col=[0] makes the first CSV column the row index rather than loading it
# as an extra "Unnamed: 0" data column; see
# https://stackoverflow.com/questions/36977223/how-should-i-read-a-csv-file-without-the-unnamed-row-with-pandas?noredirect=1&lq=1
# import notes
notes_location = "datasets/df_notes_020_final_section_2.csv"
df_notes = pd.read_csv(notes_location, index_col=[0])

# import cards
cards_location = "datasets/df_cards_012_mid_section_2.csv"
df_cards = pd.read_csv(cards_location, index_col=[0])

# TODO: import revlog (not loaded yet)

In [8]:
# Preview the first rows of the notes table
df_notes.head()

Unnamed: 0,nid,tags,Term,Yomi1,NoteCreated,LastModified,commonword,clothing,animal,body,food,place,textbook,college,fromdict,fromexam,n1,n2,n3,n4,n5,katakana,hiragana,kanji,adv,adj,noun,verb,nonconvo,convo,metalite,hasSimilarSound,hasSameSound,hasVisual,hasAudio,hasMultiMeaning,hasMultiReading,hasSimilarMeaning,hasAltForm,hasRichExamples,TermLen,Syllables,TermLenGroup,SyllablesGroup,jlpt_lvl_d,script,c_suff_reviewed,mean_ivl,mean_factor,mean_reps,mean_lapses,c_suff_reviewed_x,total_reps,total_lapses,hasListenCard,hasPictureCard,hasReadCard,hasTranslateCard,c_suff_reviewed_y,mean_note_waste,mean_note_roi,n_ivl_q,n_factor_q,n_waste_q,n_roi_q,no_waste,analysis_cat
0,1331799797114,commonword noun kanji suruverb fromdict,移籍,いせき,2012-03-15 08:23:17.114,2019-06-09 23:34:05.000,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,[2],[3:4],,kanji,1.0,99.0,1980.0,7.0,0.0,1.0,7,0,0,0,1,0,1.0,0.0,14.142857,0,2,0,1,1,
4,1331799797126,kanji fromdict,有能,ゆうのう,2012-03-15 08:23:17.126,2019-05-27 20:00:11.000,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,4,[2],[3:4],,kanji,1.0,248.0,2130.0,9.0,0.0,1.0,9,0,0,0,1,0,1.0,0.0,27.555556,2,2,0,3,1,
5,1331799797127,transportation noun travel mixedscript haskanj...,公衆トイレ,こうしゅうトイレ,2012-03-15 08:23:17.127,2019-05-27 20:59:39.000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,5,8,[5:8],[5:8],,,1.0,229.0,2270.0,9.0,0.0,1.0,9,0,0,0,1,0,1.0,0.0,25.444444,2,2,0,2,1,
6,1331799797128,kanji fromdict,送り賃,おくりちん,2012-03-15 08:23:17.128,2019-05-18 12:54:16.000,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,5,[3:4],[5:8],,kanji,1.0,178.0,2120.0,8.0,0.0,1.0,8,0,0,0,1,0,1.0,0.0,22.25,1,2,0,2,1,
7,1331799797130,technical kanji fromdict noun,量子物理学,りょうしぶつりがく,2012-03-15 08:23:17.130,2019-05-28 00:40:16.000,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,5,9,[5:8],[9: ],,kanji,1.0,204.0,2270.0,7.0,0.0,1.0,7,0,0,0,1,0,1.0,0.0,29.142857,1,2,0,3,1,


In [9]:
# List the distinct values found in each categorical column of df_notes
# (NaN appears in the output where a note has no value for that column)
print(df_notes.TermLenGroup.unique())
print(df_notes.SyllablesGroup.unique())
print(df_notes.script.unique())
print(df_notes.analysis_cat.unique())

['[2]' '[5:8]' '[3:4]' '[1]' '[9: ]']
['[3:4]' '[5:8]' '[9: ]' '[2]' '[1]']
['kanji' nan 'katakana' 'hiragana']
[nan 'sticky' 'slippery']


In [10]:
# Preview the first rows of the cards table
df_cards.head()

Unnamed: 0,cid,nid,ivl,factor,reps,lapses,CardCreated,DueDate,c_ivl_q,c_factor_q,CardType_listen,CardType_look,CardType_read,CardType_recall,cardtype,waste,roi,c_suff_reviewed
3,1331799797114,1331799797114,99,1980,7,0,2012-03-15 08:23:17.114,2015-02-04 09:00:00.000,0,1,0,0,1,0,read,0.142857,14.142857,1.0
8,1331799797122,1331799797122,224,2130,5,0,2012-03-15 08:23:17.122,2015-07-04 09:00:00.000,2,2,0,0,1,0,read,0.2,44.8,1.0
9,1331799797125,1331799797125,291,1930,18,1,2012-03-15 08:23:17.125,2016-01-30 09:00:00.000,2,1,0,0,1,0,read,0.111111,16.166667,1.0
10,1331799797126,1331799797126,248,2130,9,0,2012-03-15 08:23:17.126,2015-09-04 09:00:00.000,2,2,0,0,1,0,read,0.111111,27.555556,1.0
11,1331799797127,1331799797127,229,2270,9,0,2012-03-15 08:23:17.127,2015-06-11 09:00:00.000,2,2,0,0,1,0,read,0.111111,25.444444,1.0


### 3. Observe Metadata (tag) Frequency:

In [11]:
# Count how often each tag occurs: join every note's space-separated tag string
# into one long string, split it back into individual tags, and tally them.
# value_counts() returns the tallies sorted from most to least frequent.
# NOTE(review): ' '.join() requires every entry of df_notes.tags to be a str;
# a missing (NaN) tags value would raise TypeError - confirm none exist.
tag_freq = pd.Series(' '.join(df_notes.tags).split()).value_counts()

In [12]:
# The 20 most frequently used tags
tag_freq.head(20)

kanji                   3932
textbook                1662
fromdict                1228
metalite                1042
verb                     837
fromtest                 836
commonword               488
noun                     402
hasrobo                  313
fromexam                 305
media                    297
hiragana                 260
checked                  220
n3                       207
addsimilar               196
numeric                  193
usuallywritteninkana     188
katakana                 187
convo                    153
transitive               147
dtype: int64

In [13]:
# Tags ranked 21st through 60th by frequency
tag_freq.iloc[20:60]

college         146
music           133
lyrics          122
intransitive    107
iadj            106
adj             106
place           106
n2              106
counter         102
n4               99
technical        85
multimeaning     85
n1               81
hassame          79
n5               78
semester1        75
noadjective      66
multireading     61
suruverb         56
hassimilar       53
body             52
animal           52
name             49
geography        49
culture          49
type5r           48
food             47
haskanji         46
adv              44
japan1st         37
people           36
gairaigo         35
multiwriting     33
seenbynative     32
onomatopoeic     31
ghibli           30
multiterm        29
magazine         29
suffix           27
abbr             27
dtype: int64

In [14]:
# Inspect the 30 least frequent tags, i.e. those that were used only sparingly
tag_freq.tail(30)

cooking              1
hazard               1
nojishoentry         1
frommagazine         1
multiform            1
dialect              1
checksimilar         1
bathroom             1
verbscompoundpast    1
toy                  1
nounsuffix           1
tradition            1
checkhint            1
hospital             1
category             1
familiar             1
challenging          1
seafood              1
statistics           1
grammarcontext       1
dailylife            1
casual               1
fish                 1
common               1
emergency            1
notinjapan           1
space                1
vivid                1
hassamemeaning       1
position             1
dtype: int64

# Initial Observations

Looks like our data is ready for some proper inspection! What are some questions that we might ask of this dataset? We could start with some simple/basic broad/overview observations about the (condensed) dataset such as:
- How many terms (unique notes) exist?
- How many study vectors (unique card types) exist (were utilized by student A)?
- When did student A first start studying?
- What is the data distribution for reps count? For lapses count?
- Of the terms that exist, how many had audio data?
- Of the terms that exist, how many had image data?

In [15]:
# Number of distinct Term values in the condensed notes dataset
len(df_notes['Term'].unique())

5471

In [16]:
# Confirm which card types exist and how many cards there are of each
df_cards['cardtype'].value_counts()

read      6304
look       674
listen      39
recall       7
Name: cardtype, dtype: int64

In [17]:
#pd_crt # datetime of collection creation (studying commenced from this date)

In [18]:
# (row count, column count) of the cards table
print(df_cards.shape)

(7024, 18)


In [19]:
# (row count, column count) of the notes table
print(df_notes.shape)

(5471, 67)


# Define features

In [20]:
# Components of the binary list (cards): the card-type indicator columns of
# df_cards, one per card type
type_list = ['CardType_listen','CardType_look','CardType_read','CardType_recall']

In [21]:
# Components of the binary list (notes, combo): df_notes column names grouped
# by theme. The groups are combined into binary_list and reused below to drop
# whole groups of columns before each correlation heatmap.

# subject-matter tags
genre_list = ['clothing','animal','body','food','place']

# where the term came from
source_list = ['fromdict','fromexam','textbook','college']

# conversational vs. non-conversational
convo_list = ['convo','nonconvo']

# JLPT level tags
jlpt_list = ['n1','n2','n3','n4','n5']

# part of speech
pos_list = ['noun','verb','adj','adv']

# script the term is written in
char_list = ['katakana','hiragana','kanji']

# "has..." indicator fields of the note
has_list = ['hasVisual','hasAudio','hasMultiMeaning','hasMultiReading','hasSimilarSound','hasSameSound',
    'hasSimilarMeaning','hasAltForm','hasRichExamples']

# which card types exist for the note
card_list = ['hasListenCard','hasPictureCard','hasReadCard','hasTranslateCard']

# remaining indicator columns
other_list = ['commonword','metalite']

In [22]:
# Components of the continuous list: non-binary numeric df_notes columns

# length measures of the term
len_list = ['TermLen','Syllables']

# per-note study statistics (means and totals)
study_data_list = ['mean_ivl','mean_factor','mean_reps','mean_lapses',
                   'total_reps','total_lapses']

In [23]:
# All binary note columns in one flat list (card_list is not included)
binary_list = (
    genre_list + source_list + jlpt_list + pos_list
    + char_list + has_list + other_list + convo_list
)

# All continuous note columns in one flat list
continuous_list = len_list + study_data_list

# Columns that are discrete but not binary
discrete_non_binary_list = ['NoteCreated', 'LastModified', 'TermLenGroup', 'SyllablesGroup', 'jlpt_lvl_d']

In [24]:
# Per-note study statistics that are dropped before the tag-correlation
# heatmaps. Holds the same names as study_data_list.
numeric_note_fields = ['mean_ivl','mean_factor','mean_reps','mean_lapses',
                       'total_reps','total_lapses']

# NOTE(review): not referenced anywhere else in the visible notebook, and the
# df_notes columns shown above are named mean_note_waste / mean_note_roi rather
# than mean_card_* - confirm these names before using this list.
mean_card_fields = ['mean_card_waste','mean_card_roi']

In [25]:
# Bucket ("_q") columns of df_notes; dropped before the tag-correlation heatmaps
note_quintiles = ['n_ivl_q','n_factor_q','n_waste_q','n_roi_q']

In [26]:
# Card frame for the correlation heatmap: a copy of df_cards without the
# identifier, quantile and raw study-stat columns
df_cards_001_corr = df_cards.copy().drop(
    columns=["cid", "nid", "c_ivl_q", "c_factor_q", "ivl", "factor", "reps", "lapses"]
)

# Inspect card correlations visually

In [27]:
if graphs_on:
    # Correlate numeric columns only. The frame still carries string columns
    # (CardCreated, DueDate, cardtype); DataFrame.corr() silently skipped them
    # on older pandas but raises on pandas >= 2.0, where numeric_only defaults
    # to False. select_dtypes works on both old and new versions.
    corr_card = df_cards_001_corr.select_dtypes(include=["number", "bool"]).corr()
    fig, ax = plt.subplots(figsize=(8,8))
    # annotated heatmap with the colour scale anchored at -1
    ax_card = sns.heatmap(corr_card, vmin=-1, cmap="YlGnBu", annot=True)
    # save the image, then advance the counter so the next figure gets a new file name
    save_fig_a(image_cnt)
    image_cnt = image_cnt + 1

# Inspect note correlations visually

In [28]:
# Note frame for the part-of-speech heatmap: a copy of df_notes with every
# other feature group removed (pos_list and the ROI / waste means stay in)
df_notes_001_corr = df_notes.copy().drop(
    columns=(
        ["nid", "jlpt_lvl_d", "no_waste"]
        + card_list + convo_list + numeric_note_fields + genre_list
        + source_list + jlpt_list + char_list + has_list
        + other_list + len_list + note_quintiles
    )
)

In [29]:
# Inspect the correlation between word type & mean note ROI & mean note waste
if graphs_on:
    # Correlate numeric columns only. The frame still carries text columns
    # (e.g. tags, Term, Yomi1); DataFrame.corr() silently skipped them on older
    # pandas but raises on pandas >= 2.0, where numeric_only defaults to False.
    corr = df_notes_001_corr.select_dtypes(include=["number", "bool"]).corr()
    fig, ax = plt.subplots(figsize=(8,8))
    ax = sns.heatmap(corr, vmin=-1, cmap="YlGnBu", annot=True)
    # save image out, then advance the counter for the next figure
    save_fig_a(image_cnt)
    image_cnt = image_cnt + 1

In [30]:
# Note frame for the script heatmap: a copy of df_notes with every feature
# group except char_list removed. total_reps, total_lapses and mean_factor are
# named explicitly and are also part of numeric_note_fields.
df_notes_002_corr = df_notes.copy().drop(
    columns=(
        ["nid", "total_reps", "total_lapses", "no_waste"]
        + numeric_note_fields + source_list + convo_list + pos_list
        + len_list + other_list + has_list + genre_list + jlpt_list
        + card_list + note_quintiles + ["jlpt_lvl_d", "mean_factor"]
    )
)

# Inspect correlations by script

In [31]:
#show correlation of stats via heatmap
if graphs_on:
    # Correlate numeric columns only. The frame still carries text columns
    # (e.g. tags, Term, Yomi1); DataFrame.corr() silently skipped them on older
    # pandas but raises on pandas >= 2.0, where numeric_only defaults to False.
    corr2 = df_notes_002_corr.select_dtypes(include=["number", "bool"]).corr()
    fig, ax = plt.subplots(figsize=(8,8))
    ax2 = sns.heatmap(corr2, vmin=-1, cmap="YlGnBu", annot=True)
    # save image out, then advance the counter for the next figure
    save_fig_a(image_cnt)
    image_cnt = image_cnt + 1

In [32]:
# Note frame for the "first encounter" heatmap: a copy of df_notes with every
# feature group except source_list removed. total_reps, total_lapses and
# mean_factor are named explicitly and are also part of numeric_note_fields.
df_notes_003_corr = df_notes.copy().drop(
    columns=(
        ["nid", "total_reps", "total_lapses", "jlpt_lvl_d", "mean_factor", "no_waste"]
        + numeric_note_fields + convo_list + char_list + pos_list
        + len_list + other_list + has_list + genre_list + jlpt_list
        + card_list + note_quintiles
    )
)

# Inspect correlations by word first encounter

In [33]:
#show correlation of stats via heatmap
if graphs_on:
    # Correlate numeric columns only. The frame still carries text columns
    # (e.g. tags, Term, Yomi1); DataFrame.corr() silently skipped them on older
    # pandas but raises on pandas >= 2.0, where numeric_only defaults to False.
    corr3 = df_notes_003_corr.select_dtypes(include=["number", "bool"]).corr()
    fig, ax = plt.subplots(figsize=(8,8))
    ax3 = sns.heatmap(corr3, vmin=-1, cmap="YlGnBu", annot=True)
    # save image out, then advance the counter for the next figure
    save_fig_a(image_cnt)
    image_cnt = image_cnt + 1

In [34]:
# Note frame for the convo / metadata-poorness heatmap: a copy of df_notes that
# keeps convo_list and 'metalite' and removes the other feature groups.
# total_reps, total_lapses and mean_factor are named explicitly and are also
# part of numeric_note_fields.
df_notes_004_corr = df_notes.copy().drop(
    columns=(
        ["nid", "total_reps", "total_lapses", "jlpt_lvl_d", "mean_factor",
         "commonword", "no_waste"]
        + numeric_note_fields + char_list + source_list + pos_list
        + len_list + has_list + genre_list + jlpt_list + card_list
        + note_quintiles
    )
)

# Inspect correlation of convo ok'd vs not, & metadata poorness

In [35]:
#show correlation of stats via heatmap
if graphs_on:
    # Correlate numeric columns only. The frame still carries text columns
    # (e.g. tags, Term, Yomi1); DataFrame.corr() silently skipped them on older
    # pandas but raises on pandas >= 2.0, where numeric_only defaults to False.
    corr4 = df_notes_004_corr.select_dtypes(include=["number", "bool"]).corr()
    fig, ax = plt.subplots(figsize=(8,8))
    ax4 = sns.heatmap(corr4, vmin=-1, cmap="YlGnBu", annot=True)
    # save image out, then advance the counter for the next figure
    save_fig_a(image_cnt)
    image_cnt = image_cnt + 1

# Inspect field data correlations

In [36]:
# Note frame for the field-data heatmap: a copy of df_notes with every feature
# group except has_list removed. 'metalite' is named explicitly and is also
# part of other_list; total_reps, total_lapses and mean_factor are also part
# of numeric_note_fields.
df_notes_005_corr = df_notes.copy().drop(
    columns=(
        ["nid", "total_reps", "total_lapses", "jlpt_lvl_d", "mean_factor",
         "metalite", "no_waste"]
        + other_list + convo_list + char_list + numeric_note_fields
        + source_list + pos_list + len_list + genre_list + jlpt_list
        + card_list + note_quintiles
    )
)

In [37]:
#show correlation of stats via heatmap
if graphs_on:
    # Correlate numeric columns only. The frame still carries text columns
    # (e.g. tags, Term, Yomi1); DataFrame.corr() silently skipped them on older
    # pandas but raises on pandas >= 2.0, where numeric_only defaults to False.
    corr5 = df_notes_005_corr.select_dtypes(include=["number", "bool"]).corr()
    fig, ax = plt.subplots(figsize=(8,8))
    ax5 = sns.heatmap(corr5, vmin=-1, cmap="YlGnBu", annot=True)
    # save image out, then advance the counter for the next figure
    save_fig_a(image_cnt)
    image_cnt = image_cnt + 1

In [38]:
# Card frame for summary statistics: a copy of df_cards without the
# identifier, quantile and card-type indicator columns
df_cards_002_describe = df_cards.copy().drop(
    columns=["cid", "nid", "c_ivl_q", "c_factor_q"] + type_list
)

# count / mean / std / min / quartiles / max of the remaining numeric columns
df_cards_002_describe.describe()

Unnamed: 0,ivl,factor,reps,lapses,waste,roi,c_suff_reviewed
count,7024.0,7024.0,7024.0,7024.0,7024.0,7024.0,7024.0
mean,328.458144,1712.236902,15.954869,0.653189,0.106348,27.550559,1.0
std,286.170879,386.865972,9.231241,1.21884,0.035868,25.132115,0.0
min,1.0,1300.0,5.0,0.0,0.033333,0.00885,1.0
25%,162.0,1300.0,10.0,0.0,0.076923,9.454545,1.0
50%,235.0,1639.5,14.0,0.0,0.1,19.230769,1.0
75%,402.0,2050.0,19.0,1.0,0.125,38.511364,1.0
max,2148.0,2710.0,113.0,16.0,0.210526,116.090909,1.0


In [39]:
# Pairwise correlation of the numeric card columns. Non-numeric columns
# (CardCreated, DueDate, cardtype) are filtered out explicitly:
# DataFrame.corr() silently skipped them on older pandas but raises on
# pandas >= 2.0, where numeric_only defaults to False.
df_cards_002_describe.select_dtypes(include=["number", "bool"]).corr()

Unnamed: 0,ivl,factor,reps,lapses,waste,roi,c_suff_reviewed
ivl,1.0,0.232198,-0.172093,-0.215334,-0.129463,0.820618,
factor,0.232198,1.0,-0.651075,-0.404378,0.465315,0.587608,
reps,-0.172093,-0.651075,1.0,0.862316,-0.131623,-0.478978,
lapses,-0.215334,-0.404378,0.862316,1.0,0.325635,-0.38998,
waste,-0.129463,0.465315,-0.131623,0.325635,1.0,0.134688,
roi,0.820618,0.587608,-0.478978,-0.38998,0.134688,1.0,
c_suff_reviewed,,,,,,,


# Plot ROI trends for cards using linear regression

In [40]:
import numpy as np
import statsmodels.formula.api as smf

In [41]:
# OLS is Ordinary Least Squares, the most common type of linear regression.
# fit() estimates the coefficients of the model described by the formula:
# here card ROI is explained by interval (ivl) and ease factor.
# The trailing "- 1" removes the intercept, so the fit is forced through the
# origin; statsmodels then reports an uncentered R-squared, which is not
# comparable to the R-squared of a model that includes an intercept.
result = smf.ols('roi ~ ivl + factor - 1', data=df_cards).fit()

In [42]:
# Show the regression summary tables for the model stored in `result`
result.summary()

0,1,2,3
Dep. Variable:,roi,R-squared:,0.872
Model:,OLS,Adj. R-squared:,0.872
Method:,Least Squares,F-statistic:,23820.0
Date:,"Mon, 10 Jun 2019",Prob (F-statistic):,0.0
Time:,02:07:44,Log-Likelihood:,-28177.0
No. Observations:,7024,AIC:,56360.0
Df Residuals:,7022,BIC:,56370.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
ivl,0.0626,0.001,109.369,0.000,0.062,0.064
factor,0.0052,0.000,36.621,0.000,0.005,0.005

0,1,2,3
Omnibus:,2104.296,Durbin-Watson:,1.688
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8400.759
Skew:,1.437,Prob(JB):,0.0
Kurtosis:,7.521,Cond. No.,6.54


# Plot ROI trends for notes using linear regression (using everything)

In [43]:
# OLS is Ordinary Least Squares, the most common type of linear regression.
# fit() estimates the coefficients of the model described by the formula:
# here mean note ROI is explained by syllable count, term length and the
# mean / total review counts. `result` from the earlier fit is overwritten.
# The trailing "- 1" removes the intercept, so statsmodels reports an
# uncentered R-squared (not comparable to a model with an intercept).
result = smf.ols('mean_note_roi ~ Syllables + TermLen + mean_reps + total_reps - 1', data=df_notes).fit()

In [44]:
# Show the regression summary tables for the model stored in `result`
result.summary()

0,1,2,3
Dep. Variable:,mean_note_roi,R-squared:,0.58
Model:,OLS,Adj. R-squared:,0.58
Method:,Least Squares,F-statistic:,1891.0
Date:,"Mon, 10 Jun 2019",Prob (F-statistic):,0.0
Time:,02:07:44,Log-Likelihood:,-23683.0
No. Observations:,5471,AIC:,47370.0
Df Residuals:,5467,BIC:,47400.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Syllables,2.3382,0.188,12.407,0.000,1.969,2.708
TermLen,5.1002,0.282,18.072,0.000,4.547,5.653
mean_reps,-0.5940,0.050,-11.772,0.000,-0.693,-0.495
total_reps,0.3114,0.039,7.905,0.000,0.234,0.389

0,1,2,3
Omnibus:,364.448,Durbin-Watson:,1.721
Prob(Omnibus):,0.0,Jarque-Bera (JB):,623.368
Skew:,0.509,Prob(JB):,4.34e-136
Kurtosis:,4.303,Cond. No.,37.9


# Plot ROI trends for notes using linear regression

In [45]:
# OLS is Ordinary Least Squares, the most common type of linear regression.
# fit() estimates the coefficients of the model described by the formula:
# here mean note ROI is explained by syllable count and term length only.
# `result` from the earlier fit is overwritten.
# The trailing "- 1" removes the intercept, so statsmodels reports an
# uncentered R-squared (not comparable to a model with an intercept).
result = smf.ols('mean_note_roi ~ Syllables + TermLen - 1', data=df_notes).fit()

In [46]:
# Show the regression summary tables for the model stored in `result`
result.summary()

0,1,2,3
Dep. Variable:,mean_note_roi,R-squared:,0.567
Model:,OLS,Adj. R-squared:,0.567
Method:,Least Squares,F-statistic:,3587.0
Date:,"Mon, 10 Jun 2019",Prob (F-statistic):,0.0
Time:,02:07:44,Log-Likelihood:,-23767.0
No. Observations:,5471,AIC:,47540.0
Df Residuals:,5469,BIC:,47550.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Syllables,1.7058,0.181,9.403,0.000,1.350,2.061
TermLen,4.8114,0.285,16.876,0.000,4.252,5.370

0,1,2,3
Omnibus:,406.708,Durbin-Watson:,1.735
Prob(Omnibus):,0.0,Jarque-Bera (JB):,517.321
Skew:,0.678,Prob(JB):,4.63e-113
Kurtosis:,3.657,Cond. No.,7.05


# Plot waste trends for notes using linear regression

In [47]:
# OLS is Ordinary Least Squares, the most common type of linear regression.
# fit() estimates the coefficients of the model described by the formula:
# here mean note waste is explained by mean interval and mean ease factor.
# `result` from the earlier fit is overwritten.
# The trailing "- 1" removes the intercept, so statsmodels reports an
# uncentered R-squared (not comparable to a model with an intercept).
result = smf.ols('mean_note_waste ~ mean_ivl + mean_factor - 1', data=df_notes).fit()

In [48]:
# Show the regression summary tables for the model stored in `result`
result.summary()

0,1,2,3
Dep. Variable:,mean_note_waste,R-squared:,0.281
Model:,OLS,Adj. R-squared:,0.281
Method:,Least Squares,F-statistic:,1071.0
Date:,"Mon, 10 Jun 2019",Prob (F-statistic):,0.0
Time:,02:07:44,Log-Likelihood:,10083.0
No. Observations:,5471,AIC:,-20160.0
Df Residuals:,5469,BIC:,-20150.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
mean_ivl,-2.492e-05,2.64e-06,-9.429,0.000,-3.01e-05,-1.97e-05
mean_factor,1.809e-05,5.5e-07,32.881,0.000,1.7e-05,1.92e-05

0,1,2,3
Omnibus:,609.617,Durbin-Watson:,1.884
Prob(Omnibus):,0.0,Jarque-Bera (JB):,777.624
Skew:,0.902,Prob(JB):,1.38e-169
Kurtosis:,2.608,Cond. No.,8.96


**reps** = work done to remember a card  
**interval** = memory length as output of memorization work done  
**ease/factor** = indicator of effort to retrieve & store memory  
**lapses** = result of memory deficit, a common side-effect & indicator of inefficiency of memorization efforts  

**lapses/reps ratio** (waste ratio) => the closer to 0, the better ("low waste"). the higher this is, the worse : "high waste"  
**interval/reps ratio** (ROI ratio) = the higher the better ("low effort" / "sticky"). the lower this is, the worse ("high effort", "slippery")  

In [49]:
# Note frame holding only the non-binary columns: a copy of df_notes without
# the binary indicator groups, the card indicators and two identifier fields
df_notes_006_corr = df_notes.copy().drop(
    columns=binary_list + card_list + ["nid", "jlpt_lvl_d"]
)

In [50]:
# count / mean / std / min / quartiles / max of the numeric note columns
df_notes_006_corr.describe()

Unnamed: 0,TermLen,Syllables,c_suff_reviewed,mean_ivl,mean_factor,mean_reps,mean_lapses,c_suff_reviewed_x,total_reps,total_lapses,c_suff_reviewed_y,mean_note_waste,mean_note_roi,n_ivl_q,n_factor_q,n_waste_q,n_roi_q,no_waste
count,5471.0,5471.0,5471.0,5471.0,5471.0,5471.0,5471.0,5471.0,5471.0,5471.0,5471.0,5471.0,5471.0,5471.0,5471.0,5471.0,5471.0,5471.0
mean,2.629318,4.195028,1.0,290.752391,1667.835618,16.42378,0.638884,1.0,18.480351,0.747395,1.1104,0.026551,22.46262,1.9172,0.912265,0.0,1.849936,0.608847
std,1.178436,1.704865,0.0,203.692803,354.165101,8.436636,1.067492,0.0,11.527311,1.257439,0.319196,0.036583,17.278964,1.379498,0.792548,0.0,1.352079,0.488053
min,1.0,1.0,1.0,1.0,1300.0,7.0,0.0,1.0,7.0,0.0,1.0,0.0,0.019608,0.0,0.0,0.0,0.0,0.0
25%,2.0,3.0,1.0,163.0,1300.0,11.0,0.0,1.0,11.0,0.0,1.0,0.0,9.377976,1.0,0.0,0.0,1.0,0.0
50%,2.0,4.0,1.0,230.0,1600.0,14.0,0.0,1.0,15.0,0.0,1.0,0.0,17.4375,2.0,1.0,0.0,2.0,1.0
75%,3.0,5.0,1.0,374.5,1947.0,19.0,1.0,1.0,22.0,1.0,1.0,0.055556,32.121429,3.0,2.0,0.0,3.0,1.0
max,15.0,30.0,1.0,1749.0,2650.0,81.0,10.0,1.0,105.0,13.0,3.0,0.138462,77.583333,4.0,2.0,0.0,4.0,1.0


In [51]:
# Pairwise correlation of the numeric note columns. Text columns (e.g. tags,
# Term, Yomi1) are filtered out explicitly: DataFrame.corr() silently skipped
# them on older pandas but raises on pandas >= 2.0, where numeric_only
# defaults to False.
df_notes_006_corr.select_dtypes(include=["number", "bool"]).corr()

Unnamed: 0,TermLen,Syllables,c_suff_reviewed,mean_ivl,mean_factor,mean_reps,mean_lapses,c_suff_reviewed_x,total_reps,total_lapses,c_suff_reviewed_y,mean_note_waste,mean_note_roi,n_ivl_q,n_factor_q,n_waste_q,n_roi_q,no_waste
TermLen,1.0,0.690789,,0.057139,0.064242,-0.111862,-0.101826,,-0.092871,-0.095772,-0.013661,-0.09154,0.101373,0.079516,0.071068,,0.105291,0.093369
Syllables,0.690789,1.0,,-0.012037,0.028122,-0.068392,-0.064827,,-0.058145,-0.064425,-0.016393,-0.05546,0.016678,-0.000595,0.03269,,0.025309,0.051272
c_suff_reviewed,,,,,,,,,,,,,,,,,,
mean_ivl,0.057139,-0.012037,,1.0,0.272481,-0.205835,-0.236627,,-0.033469,-0.173074,0.249497,-0.232346,0.828531,0.856516,0.286458,,0.770904,0.182931
mean_factor,0.064242,0.028122,,0.272481,1.0,-0.623813,-0.384894,,-0.507718,-0.355993,-0.038571,-0.328943,0.620882,0.391021,0.935443,,0.643384,0.364352
mean_reps,-0.111862,-0.068392,,-0.205835,-0.623813,1.0,0.863008,,0.837337,0.811224,0.090394,0.657237,-0.536322,-0.350183,-0.626221,,-0.615051,-0.637113
mean_lapses,-0.101826,-0.064827,,-0.236627,-0.384894,0.863008,1.0,,0.737426,0.943051,0.11148,0.879332,-0.453439,-0.361457,-0.396082,,-0.533666,-0.746754
c_suff_reviewed_x,,,,,,,,,,,,,,,,,,
total_reps,-0.092871,-0.058145,,-0.033469,-0.507718,0.837337,0.737426,,1.0,0.854085,0.581956,0.608875,-0.381869,-0.169807,-0.492228,,-0.442506,-0.642494
total_lapses,-0.095772,-0.064425,,-0.173074,-0.355993,0.811224,0.943051,,0.854085,1.0,0.344603,0.849807,-0.411947,-0.290925,-0.358493,,-0.480264,-0.741624


In [52]:
# Look up one specific note id in the notes table (the output below has no rows)
get_rows_by_value_in_col(df_notes, 1523892839900, 'nid')

Unnamed: 0,nid,tags,Term,Yomi1,NoteCreated,LastModified,commonword,clothing,animal,body,food,place,textbook,college,fromdict,fromexam,n1,n2,n3,n4,n5,katakana,hiragana,kanji,adv,adj,noun,verb,nonconvo,convo,metalite,hasSimilarSound,hasSameSound,hasVisual,hasAudio,hasMultiMeaning,hasMultiReading,hasSimilarMeaning,hasAltForm,hasRichExamples,TermLen,Syllables,TermLenGroup,SyllablesGroup,jlpt_lvl_d,script,c_suff_reviewed,mean_ivl,mean_factor,mean_reps,mean_lapses,c_suff_reviewed_x,total_reps,total_lapses,hasListenCard,hasPictureCard,hasReadCard,hasTranslateCard,c_suff_reviewed_y,mean_note_waste,mean_note_roi,n_ivl_q,n_factor_q,n_waste_q,n_roi_q,no_waste,analysis_cat


In [53]:
# Look up the cards of the same note id in the cards table (the output below has no rows)
get_rows_by_value_in_col(df_cards, 1523892839900, 'nid')

Unnamed: 0,cid,nid,ivl,factor,reps,lapses,CardCreated,DueDate,c_ivl_q,c_factor_q,CardType_listen,CardType_look,CardType_read,CardType_recall,cardtype,waste,roi,c_suff_reviewed


In [54]:
# Note frame for the correlogram: a copy of df_notes without the binary
# indicator groups, card indicators, text fields, totals and bucket columns
df_notes_n_corr = df_notes.copy().drop(
    columns=(
        binary_list + card_list
        + ["nid", "tags", "Term", "Yomi1", "jlpt_lvl_d", "total_reps", "total_lapses",
           "n_ivl_q", "n_factor_q", "n_waste_q", "n_roi_q", "no_waste"]
    )
)

In [55]:
# Confirm which columns remain after the drop
df_notes_n_corr.columns.values

array(['NoteCreated', 'LastModified', 'TermLen', 'Syllables',
       'TermLenGroup', 'SyllablesGroup', 'script', 'c_suff_reviewed',
       'mean_ivl', 'mean_factor', 'mean_reps', 'mean_lapses',
       'c_suff_reviewed_x', 'c_suff_reviewed_y', 'mean_note_waste',
       'mean_note_roi', 'analysis_cat'], dtype=object)

In [56]:
# Basic correlogram: grid of pairwise plots over the note columns
if graphs_on:
    g = sns.pairplot(df_notes_n_corr)
    # y=1.05 places the title slightly above the top edge of the figure
    g.fig.suptitle("Note Correlogram", y=1.05, fontsize=24)
    # save image out (before plt.show()), then advance the counter for the next figure
    save_fig_a(image_cnt)
    image_cnt = image_cnt + 1
    plt.show()

In [57]:
# Card frame for the correlogram: a copy of df_cards without the identifier,
# due-date, card-type indicator and quantile columns
df_cards_n_corr = df_cards.copy().drop(
    columns=["cid", "nid", "CardType_listen", "DueDate",
             "CardType_look", "CardType_read", "CardType_recall",
             "c_ivl_q", "c_factor_q"]
)

# confirm which columns remain
df_cards_n_corr.columns.values

array(['ivl', 'factor', 'reps', 'lapses', 'CardCreated', 'cardtype',
       'waste', 'roi', 'c_suff_reviewed'], dtype=object)

In [58]:
# Basic correlogram: grid of pairwise plots over the card columns
if graphs_on:
    g = sns.pairplot(df_cards_n_corr)
    # y=1.05 places the title slightly above the top edge of the figure
    g.fig.suptitle("Card Correlogram", y=1.05, fontsize=24)
    # save image out (before plt.show()), then advance the counter for the next figure
    save_fig_a(image_cnt)
    image_cnt = image_cnt + 1
    plt.show()

# Initial Analysis

There appears to be a linear relationship between lapses & reps. It appears that lapses incur a cost of increasing reps. However, this info isn't directly actionable - one cannot simply just 'not forget'. The primary focus is what can be done to minimize lapses while not sacrificing efficiency (long intervals for few reps, and lapses as close as possible to 0).

# Topical Analysis

After doing some basic assessments of the data, we can dig a bit deeper:
- Is there a correlation between words having multiple readings ("yomi") and their forget rate\*?
- Is there a correlation between words having same/similar sounding words and their forget rate\*?
- What might the effect of word length be on memorability? \*\*, \*\*\*

> \* Forget rate can be understood as a multitude of things, such as the ratio between lapses & reps, as well as the raw lapse count, the average interval, and other numbers/ratios to be determined. I will attempt to clarify this in the process.  
\*\* Memorability being loosely correlated with forget rate, where memorability could be understood as a word/term's intrinsic "stickiness" in the brain, as opposed to an individual or collective's capacity to keep words/terms in their head. Sources pending.  
\*\*\* A huge caveat here being that, this dataset has a sample size of 1 (for both student and language), so all observations, interpretations, and understandings must be taken with more than a few grains of salt (and tested further with larger sample sizes, of at least 200 students, and 5 or more languages).

# Further Analysis

For a deeper understanding of what it means to acquire new terminology, the researcher believes it best to conduct analysis on term acquisition by merging multiple vectors (individual cards) of a single term into single entries, where dummy values for each vector (such as review count, lapse count, etc.) are encoded per entry. This would enable inspection and correlation analysis of:
- total reviews per term
- average ratio of reviews per term per vector (look vs hear vs recall vs read)
- where lapses are most likely to occur (per word, per vector, etc.)
- how word length, presence of kanji, katakana, hiragana, or combination thereof, may affect the above counts & ratios

# Further Information

The Spaced Repetition Software ("SRS") used for the study of Japanese by student "A" is an open-source program called Anki. The algorithm it uses to "graduate" (also referred to as "maturing") study items (called cards), so that subsequent reviews/practices are spaced into the future, is referred to as SM-2. [Please click here for more information on the SM-2 algorithm used in Anki.](https://apps.ankiweb.net/docs/manual.html#what-algorithm)