In [1]:
# For downloading audio from tatoeba
# Clean CSV sentences/translations file

import requests
import random
import csv

# We don't want duplicate cards, so removes them with this function
def remove_tags(duplicates):                                     
    clean_tags = list()              # Array that stores tags without duplicates
    for tag in duplicates:           # Cycle through each tag
        if tag not in clean_tags:    # If a duplicate is found, it is not added to new array
            clean_tags.append(tag)   # Add unique tags to array
    return clean_tags

def remove_sentences(lines):
    seen  = set()
    block = [row for row in lines if row[0] not in seen and not seen.add(row[0])]
    return block

with open('Dutch tabbed.csv', 'r', encoding='utf-8-sig') as readFile: # Open the CSV file with translations
    reader = csv.reader(readFile, delimiter='\t')                     # Read the file, set delimiter as 'tab'
    lines  = list(reader)                                               # Save contents as list 'lines'

random.shuffle(lines)

tags = [lines[i][0] for i in range(len(lines))]                         # Get audio tag numbers from lines
tags = remove_tags(tags)                                                # Call remove duplicates function

# Populate a URL array where each URL contains a unique tag from the CSV file
# Each URL links to a different MP3 file that corresponds to the sentences on the list
urls = ['https://audio.tatoeba.org/sentences/nld/' + tags[i] + '.mp3' for i in range(len(tags))]

for i in range(len(tags)):
    r = requests.get(urls[i])
    with open('/Users/Alexander/AppData/Roaming/Anki2/User 1/collection.media/' + tags[i] + '.mp3', 'wb') as f:
        f.write(r.content)
    #print(r.status_code)
    #print(r.headers['content-type'])
    #print(r.encoding)
    
print(' -- Download Done -- ')

lines = remove_sentences(lines)                                 # Check for duplicate tags in lines

for i in range(len(lines)): lines[i].pop(0)                     # Remove tags from first column in the CSV file

for i in range(len(lines)): lines[i].append('[sound:' + tags[i] + '.mp3]')
    
with open('Dutch tabbed.csv', 'w', encoding='utf-8-sig') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(lines)
    
writeFile.close()

print(' -- Finished -- ')

## -- Roadmap --#
#! open csv
#! take first value of each row -> 'tag'
#! check for duplicate tags
#! remove duplicate tags
#! use tag in url
#! download audio to correct directory
#! remove duplicate rows in csv
#! delete tag value in csv
#! add name of sound file using tag to end of each row in csv
#! close readcsv
#! close writecsv
## ------------#

# template:  url = 'https://audio.tatoeba.org/sentences/nld/378252.mp3'
# media loc: C:\Users\Alexander\AppData\Roaming\Anki2\User 1\collection.media


 -- Download Done -- 
 -- Finished -- 


In [1]:
# Count the number of Norwegian words in Anki audio sentences

import csv

with open('Norwegian Sentences with Audio.txt', 'r', encoding='utf-8-sig') as readFile: # Open the CSV file with translations
    reader = csv.reader(readFile, delimiter='\t')                     # Read the file, set delimiter as 'tab'
    lines  = list(reader)                                             # Save contents as list 'lines'

nor_set = [lines[i][0] for i in range(len(lines))]

all_words = {}

#print(nor_set[0])

for i in range(len(nor_set)):
    
    nor_low_spl = nor_set[i].replace('.', '').replace('?', '').replace('!', '').replace('(<--norsk)', '').replace('/', '').replace('(pronoun)', '').replace('(<--english)', '').replace(',', '').replace('maria', '').replace('malin', '').replace('halvor', '').replace('billy', '').replace('hannah', '').replace('bob', '').replace('bjorn', '').replace('pisa', '').replace('jim', '').replace('10', '').replace('15', '').replace('19', '').replace('20', '').replace('50', '').replace('70', '').lower().split()
    
    for j in range(len(nor_low_spl)):
        if nor_low_spl[j] in all_words:
            all_words[nor_low_spl[j]] = all_words[nor_low_spl[j]] + 1
        else:
            all_words[nor_low_spl[j]] = 1

sorted_words = sorted(all_words.items(), key = lambda x: x[1], reverse = True)
print('word count:', len(sorted_words))
print('from', len(nor_set), 'sentences')
print('------------')
#for i in range(len(sorted_words)): print(sorted_words[i][0], sorted_words[i][1])


with open('My Norwegian Phrases.txt', 'r', encoding='utf-8-sig') as readFile: # Open the CSV file with translations
    reader = csv.reader(readFile, delimiter='\t')                     # Read the file, set delimiter as 'tab'
    lines_other  = list(reader)                                               # Save contents as list 'lines'

nor_set_other = [lines_other[i][0] for i in range(len(lines_other))]

other_words = {}

for i in range(len(nor_set_other)):
    
    nor_low_spl_oth = nor_set_other[i].replace('.', '').replace('?', '').replace('!', '').replace('(<--norsk)', '').replace('/', '').replace('(pronoun)', '').replace('(<--english)', '').replace(',', '').replace('maria', '').replace('malin', '').replace('halvor', '').replace('billy', '').replace('hannah', '').replace('bob', '').replace('bjorn', '').replace('pisa', '').replace('jim', '').replace('10', '').replace('15', '').replace('19', '').replace('20', '').replace('50', '').replace('70', '').replace('einstein', '').replace('albert', '').lower().split()
    
    for j in range(len(nor_low_spl_oth)):
        if nor_low_spl_oth[j] in other_words:
            other_words[nor_low_spl_oth[j]] = other_words[nor_low_spl_oth[j]] + 1
        else:
            other_words[nor_low_spl_oth[j]] = 1
            
sorted_other_words = sorted(other_words.items(), key = lambda x: x[1], reverse = True)
print('other word count:', len(sorted_other_words))
print('from', len(nor_set_other), 'other sentences')
print('------------')
#for i in range(len(sorted_other_words)): print(sorted_other_words[i][0], sorted_other_words[i][1])

every_word = {}

for i in range(len(all_words)): every_word[sorted_words[i][0]] = all_words[sorted_words[i][0]]

for i in range(len(other_words)):
    if sorted_other_words[i][0] in every_word:
        every_word[sorted_other_words[i][0]] = every_word[sorted_other_words[i][0]] + sorted_other_words[i][1]
    else:
        every_word[sorted_other_words[i][0]] = sorted_other_words[i][1]
        
sorted_every_word = sorted(every_word.items(), key = lambda x: x[1], reverse = True)
print('total word count:', len(sorted_every_word))
print('from', (len(nor_set) + len(nor_set_other)), 'total sentences')
print('------------')
for i in range(len(sorted_every_word)): print(sorted_every_word[i][0], sorted_every_word[i][1])


word count: 2518
from 1668 sentences
------------
other word count: 1245
from 292 other sentences
------------
total word count: 3295
from 1960 total sentences
------------
er 491
jeg 373
det 326
i 296
å 247
en 218
på 207
han 203
ikke 203
du 197
til 191
har 178
og 145
som 132
et 126
av 121
for 118
med 110
kan 98
de 93
hun 88
at 84
vi 81
var 72
den 69
så 67
meg 65
om 65
vil 63
min 49
deg 48
dette 47
seg 47
denne 46
ut 42
hva 41
hvor 40
går 39
fra 39
men 37
ble 36
enn 36
når 35
noen 34
liker 31
ham 29
to 29
veldig 28
opp 28
mye 27
år 27
alle 27
her 27
gå 26
hans 26
kommer 26
må 26
blir 26
dag 25
ha 25
hvis 25
hadde 25
bli 24
skal 24
bare 24
være 23
over 23
oss 21
tre 21
etter 21
få 21
nå 21
din 20
sin 20
henne 19
noe 19
eller 19
hvordan 18
kom 18
vet 18
nesten 18
god 17
før 17
ser 17
gjøre 17
igjen 17
der 17
engelsk 16
snakker 16
se 16
ingen 16
ved 16
inn 16
tror 15
ta 15
alltid 15
hvem 15
selv 15
da 15
hele 15
morgen 14
huset 14
la 14
gammel 14
litt 14
mange 14
under 14
finnes 14
bor 13

In [None]:
# Count the number of Chinese characters in Chinese learning Anki file

import csv

with open('SpoonFedChinese.txt', 'r', encoding = 'utf-8-sig') as readFile:
    reader = csv.reader(readFile, delimiter = '\t')
    lines = list(reader)
    
readFile.close()

cn_set = [lines[i][2] for i in range(len(lines))]

all_char = {}

for i in range(len(cn_set)):
    
    cn_set_spl = list(cn_set[i].replace(" ", "").replace(".", ""). replace("？", "").replace("！", "").replace("。", "").replace("一", "").replace("，", "").replace("…", ""))
    
    for j in range(len(cn_set_spl)):
        if cn_set_spl[j] in all_char:
            all_char[cn_set_spl[j]] = all_char[cn_set_spl[j]] + 1
        else:
            all_char[cn_set_spl[j]] = 1
            
sorted_char = sorted(all_char.items(), key = lambda x: x[1], reverse = True)
print('Character count:', len(sorted_char))
print('from', len(cn_set), "sentences")
print('-------------------')
for i in range(len(sorted_char)): print(sorted_char[i][0], sorted_char[i][1])

In [15]:
import csv

with open('ep1.txt', 'r', encoding='utf-8-sig') as readFile: # Open the CSV file with translations
    reader = csv.reader(readFile, delimiter='\t')                     # Read the file, set delimiter as 'tab'
    lines  = list(reader)                                             # Save contents as list 'lines'

n_lines = len(lines)

try:
    f = open('new_file.txt')
except FileNotFoundError:
    print('Creating new file')
    with open('new_file.txt', 'a') as g:
        g.close()
finally:
    f.close()

# Add source tag to a new column
with open('new_file.txt', 'w', encoding='utf-8-sig') as writefile:
    for i in range(n_lines):
        writefile.write('\t'.join([lines[i][0], lines[i][1], 'NRK SKEK']) + '\n')

nor_set = [lines[i][0] for i in range(n_lines)]

nrk_words = {}

for i in range(len(nor_set)):
    
    nor_low_spl = nor_set[i].replace('.', '').replace('?', '').replace('!', '').replace('/', '').replace(',', '').replace('-', '').replace('15', '').replace('50', '').replace('60', '').replace('400000', '').lower().split()
    
    for j in range(len(nor_low_spl)):
        if nor_low_spl[j] in nrk_words:
            nrk_words[nor_low_spl[j]] = nrk_words[nor_low_spl[j]] + 1
        else:
            nrk_words[nor_low_spl[j]] = 1

nrk_sorted_words = sorted(nrk_words.items(), 
                      key=lambda x: x[1], 
                      reverse=True)

print('Word Count:', len(nrk_sorted_words))
print('from', len(nor_set), 'sentences')

print('------------')

#for i in range(len(nrk_sorted_words)):
#    print(nrk_sorted_words[i][0], nrk_sorted_words[i][1])
#if len(all_words) > len(nrk_words):
#    for i in range(len(nrk_words)):
#        if nrk_words
    
#else:
if 'er' in nrk_words:
    print(nrk_words.items())   

#new_dict = all_words.union(nrk_words)
#sorted_new = sorted(new_dict.items(),
#                   key=lambda x: x[1],
#                   reverse=True)

#print(sorted_new)

Word Count: 468
from 93 sentences
------------
dict_items([('vi', 7), ('skal', 3), ('til', 12), ('frodige', 1), ('søramerika', 1), ('som', 20), ('er', 40), ('fullt', 1), ('av', 19), ('overraskelser', 1), ('reiser', 1), ('fra', 4), ('de', 19), ('afrikanske', 1), ('slettene', 1), ('den', 7), ('enorme', 2), ('ødemarka', 1), ('i', 29), ('nordamerika', 1), ('utforske', 1), ('avsidesliggende', 1), ('fjell', 1), ('asia', 1), ('det', 37), ('viktigere', 1), ('enn', 6), ('noensinne', 1), ('å', 12), ('ta', 1), ('vare', 1), ('på', 15), ('dyrebare', 1), ('artsmangfoldet', 1), ('våre', 1), ('sju', 1), ('kontinenter', 1), ('dette', 3), ('mest', 3), ('ugjestmilde', 1), ('alle', 2), ('kontinentene', 1), ('fastlandsantarktis', 1), ('ganger', 1), ('så', 9), ('stort', 1), ('norge', 1), ('finnes', 3), ('liv', 3), ('helt', 1), ('avhengig', 1), ('havet', 4), ('omgir', 1), ('antarktis', 5), ('men', 12), ('også', 4), ('fryser', 1), ('et', 7), ('pattedyr', 1), ('lever', 3), ('her', 10), ('weddelselen', 1), ('ti

In [None]:
st = "The quick brown fox jumped over the lazy dog."

all_let = {}
st = list(st.replace(" ", "").lower().replace(".", ""))

for i in range(len(st)):
    if st[i] in all_let:
        all_let[st[i]] = all_let[st[i]] + 1
    else:
        all_let[st[i]] = 1
sort_let = sorted(all_let.items(), key = lambda x: x[1], reverse = True)
print('Letter count: ', len(sort_let))
print('from', len(st), 'letters')
print('-------------')
for i in range(len(sort_let)): print(sort_let[i][0], sort_let[i][1])

In [18]:
import requests
import random
import csv

url = 'https://ugc.futurelearn.com/uploads/assets/c7/ad/c7ad019c-918b-4cdb-be3a-b8be63ca9b32.mp3'

r = requests.get(url)

with open('/Users/Alexander/Documents/College/Audio/Harmen.mp3', 'wb') as f:
    f.write(r.content)

print(' -- Download Complete -- ')

 -- Download Complete -- 
