In [1]:
import os
import unicodedata
from collections import defaultdict

import pandas as pd
from nltk.tokenize import RegexpTokenizer

# change output length in notebook
#pd.set_option('display.max_colwidth', 30)

In [2]:
# Location of the metadata CSV export. Defaults to the original local path;
# set the EUROPEANA_METADATA_FILE environment variable to point the notebook
# at a copy elsewhere without editing this cell.
METADATA_FILE = os.environ.get(
    "EUROPEANA_METADATA_FILE",
    "E:/Features/MIR/EuropeanaSounds/metadata/metadata.csv",
)

## Pre-Processing

### Load Metadata from CSV

In [3]:
# Load the semicolon-separated CSV, replace missing values (NaN) with empty
# strings, and key the rows by their "id" column.
metadata = (
    pd.read_csv(METADATA_FILE, sep=";", encoding="utf-8")
    .fillna("")
    .set_index("id")
)

# preview the first rows
metadata.head()

Unnamed: 0_level_0,contributor,country,created,creator,dataProvider,date,description,format,identifier,language,medium,provider,publisher,relation,spatial,subject,title,type,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
/09301/02DABC7F5850EABCA553BCEEEEAE2B6543A1CA67,"Schorr, A. [lyricist], Burstein, Peisach [sing...",france,,,Bibliothèque Medem - Maison de la Culture Yiddish,,Voice solo,,Publisher number : Columbia 13371/108300,yi,,Judaica Europeana,Columbia,Ai dai dera dai tchastushkes,"United States [publication], United Kingdom [p...",,Vos toig es aich vos darft ihr es,"sound recording, Chanson comique",
/09301/03DE04818BECB920A21EA63EA9D5FD1C49CF74A0,"Burstein, Peisach [singer]",france,,,Bibliothèque Medem - Maison de la Culture Yiddish,,Full orchestra ; Voice solo,,Publisher number : Columbia 13380/105740,yi,,Judaica Europeana,Columbia,Die poilische chassene,"New-York [recording], United States [publicati...",,Pitia Diritia,"sound recording, Chanson comique",
/09301/0701AE796F7B504E67B8CEB031C1061E113BE3F4,"Kremer, Isa [singer]",france,,,Bibliothèque Medem - Maison de la Culture Yiddish,,"Full orchestra ; Voice solo, Identifying marki...",,Publisher number : Polydor 561157/67083 B,yi,,Judaica Europeana,Polydor,Oi Abram,Paris [publication],,Sitz ich mir in schtibele,"Chanson d&apos;amour, sound recording",
/09301/0A3E8D891310E10CEC0B6D7790ADE1470E6541B8,"Potock, Bernard [conductor], Gerro, Henri [sin...",france,,,Bibliothèque Medem - Maison de la Culture Yiddish,,"Full orchestra ; Voice solo, Henri Gerro du ca...",,Publisher number : Elesdisc / Leon Speiser LS-7B,yi,,Judaica Europeana,Elesdisc Leon Speiser,Ich will nicht sein kein rebbe,Paris [publication],,A briew fun a dienst,"Chanson humouristique, sound recording",
/09301/0BF6DFA79787E76872BA33061C84A79588318953,"Ellstein, Abraham [instrumentalist ; piano], F...",france,,,Bibliothèque Medem - Maison de la Culture Yiddish,,Chorus ; Piano ; Voice solo,,Publisher number : Metro 100-1b/CJ-100/M113,yi,,Judaica Europeana,Metro,A nign&apos;dl,,,Taiere malke,"sound recording, Chanson pour enfants",


### Concatenate columns

In [4]:
# Build one free-text "document" per record by joining the descriptive columns
# with single spaces (creator first), then trimming leading/trailing whitespace.
# Empty fields in the middle still contribute their separator, so consecutive
# spaces can appear inside a document.
trailing_columns = [
    metadata.contributor,
    metadata.title,
    metadata.description,
    metadata.subject,
    metadata.country,
]
documents = metadata.creator.str.cat(trailing_columns, sep=" ").str.strip().values

# example
documents[0]

u'Schorr, A. [lyricist], Burstein, Peisach [singer], Secunda [composer] Vos toig es aich vos darft ihr es Voice solo  france'

### Data Preprocessing

In [5]:
# requires package "stop_words"
from stop_words import get_stop_words

# Collect the stop words of every language of interest into one flat list,
# in the order the languages are listed here.
stop_words = []
for language in ('en', 'de', 'fr', 'it', 'pt', 'ro', 'spanish'):
    stop_words += get_stop_words(language)

In [6]:
tokenizer = RegexpTokenizer(r'\w+')


def _to_ascii(text):
    """Return `text` folded to an ASCII-only text string.

    The text is NFKD-decomposed so accented letters split into a base letter
    plus combining marks; encoding to ASCII with 'ignore' then drops every
    non-ASCII code point (the marks). The result is decoded again so that it
    is a text string rather than a byte string.
    """
    return unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')


# Tokens are compared against the stop list *after* being folded to ASCII, so
# the stop list has to be folded the same way; otherwise accented stop words
# (e.g. "été") can never match their folded token ("ete"). Using a set also
# turns every lookup into a constant-time operation instead of a list scan.
normalized_stop_words = set(_to_ascii(stop_word.lower()) for stop_word in stop_words)

# one list of cleaned tokens per document (empty list for empty documents)
texts = []

for document in documents:

    valid_words = []

    # for each lower-case transformed word (an empty document yields no tokens)
    for word in tokenizer.tokenize(document.lower()):

        # normalize, remove accents and umlauts
        word = _to_ascii(word)

        # drop single-character leftovers and stop words
        if len(word) > 1 and word not in normalized_stop_words:
            valid_words.append(word)

    texts.append(valid_words)


In [None]:
# Count how often every token occurs across the whole corpus.
frequency = defaultdict(int)

for doc_tokens in texts:
    for tok in doc_tokens:
        frequency[tok] += 1

# Keep only tokens that occur at least twice in the corpus.
texts = [
    [tok for tok in doc_tokens if frequency[tok] >= 2]
    for doc_tokens in texts
]

## Train word2vec

In [None]:
import gensim, logging
# Configure the root logger to emit records of level INFO and above, each
# formatted as "<timestamp> : <level> : <message>", so that progress messages
# logged during the word2vec training below are shown.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Hyper-parameters for the word2vec model, passed through unchanged.
# NOTE(review): `size` is the keyword used by gensim releases before 4.0;
# gensim >= 4.0 names it `vector_size` -- confirm against the installed version.
word2vec_params = dict(
    size=1000,
    window=10,
    min_count=5,
    workers=4,
)

# Train on the tokenized documents.
model = gensim.models.Word2Vec(texts, **word2vec_params)