In [1]:
import ujson as json
import os
import re
import pandas as pd

from tqdm import tqdm_notebook

%matplotlib inline

import matplotlib.pyplot as plt

from joblib import Parallel,delayed

from collections import Counter
import nltk
from nltk.tokenize import word_tokenize

# Finnish Punkt sentence tokenizer, loaded once at module level; preprocess_line
# calls its .tokenize() on every document.
# NOTE(review): presumably requires the NLTK 'punkt' data to be installed locally — confirm.
sent_tokenizer = nltk.data.load('tokenizers/punkt/finnish.pickle')

In [None]:
def preprocess_line(line, min_sent_len=5): # TODO: Add features
    """Turn one JSON-lines record into a list of cleaned sentence strings.

    The record's 'content' field is iterated as a collection of documents
    (presumably a list of text strings; verify against the feed format).
    Each document is split into sentences with the module-level
    `sent_tokenizer` and each sentence into tokens with `word_tokenize`.

    A token is kept only if it is longer than one character and still has
    at least one character left once everything that is neither a word
    character nor whitespace is removed. Kept tokens have soft hyphens
    ('\\xad') removed and surrounding whitespace stripped; any other
    punctuation inside the token is left as-is.

    Args:
        line: A JSON string holding an object with a 'content' key.
        min_sent_len: Minimum number of kept tokens a sentence needs in
            order to be returned (inclusive).

    Returns:
        A list of sentences, each a single space-joined string of tokens.
    """
    kept_sentences = []
    for document in json.loads(line)['content']:
        for sentence in sent_tokenizer.tokenize(document):
            words = [
                raw_token.replace('\xad','').strip()
                for raw_token in word_tokenize(sentence)
                # Drop one-character tokens and tokens made only of punctuation.
                if len(raw_token) > 1 and len(re.sub(r'[^\w\s]', '', raw_token)) > 0
            ]
            if len(words) >= min_sent_len:
                kept_sentences.append(' '.join(words))
    return kept_sentences
    
def process_file(filename, mode='a', n_jobs=-1, out_filename='./fwe/data/preprocessed/sents.csv'):
    """Preprocess one feed file and write its sentences, one per line.

    Reads './fwe/data/feed/{filename}.jl' line by line, runs
    `preprocess_line` on every line through joblib and writes all resulting
    sentences to `out_filename`.

    Args:
        filename: Feed name without directory or '.jl' extension.
        mode: File mode used to open `out_filename`; the default 'a'
            appends to whatever the file already contains.
        n_jobs: Number of joblib workers handed to `Parallel`. This value
            used to be ignored in favour of a hard-coded 1; it is now
            honoured, so pass n_jobs=1 to force sequential processing.
        out_filename: Path of the output file.

    Returns:
        None.
    """
    print(f'Processing file {filename}')
    with open(f'./fwe/data/feed/{filename}.jl', 'r', encoding='utf8') as f:
        # First pass only counts lines so the progress bar has a total.
        n_lines = sum(1 for _ in f)
        f.seek(0)
        # The file must stay open here: Parallel consumes the generator lazily.
        sent_sets = Parallel(n_jobs=n_jobs)(delayed(preprocess_line)(l)
                                            for l in tqdm_notebook(f, total=n_lines))

    with open(out_filename, mode, encoding='utf8') as fout:
        # Terminate every sentence with a newline. Joining with '\n' left the
        # last sentence unterminated, so the next append glued its first
        # sentence onto the same line.
        fout.writelines(f'{sent}\n' for sent_set in sent_sets for sent in sent_set)
        
# The first feed opens the output with mode='w' so the file is truncated;
# the remaining feeds rely on process_file's default append mode.
process_file('demi', mode='w')
for feed_name in ('iltalehti', 'iltasanomat'):
    process_file(feed_name)

Processing file demi


HBox(children=(IntProgress(value=0, max=223), HTML(value='')))

Processing file iltalehti


HBox(children=(IntProgress(value=0, max=4211), HTML(value='')))

In [None]:
from spacy.lang.fi import Finnish
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc,Span,Token

import re
import string

# Finnish pipeline whose only added component is a 'sentencizer', so that the
# docs it produces expose sentence spans (read via doc.sents in
# SentenceWriter.line_to_sents).
# NOTE(review): create_pipe(...) followed by add_pipe(component) looks like the
# spaCy v2 calling convention — confirm the pinned spaCy version.
nlp = Finnish()
sentencizer = nlp.create_pipe('sentencizer')
nlp.add_pipe(sentencizer)
# Stand-alone tokenizer sharing the pipeline's vocab; no prefix/suffix/infix
# rules are passed — presumably this splits on whitespace only; verify.
tokenizer = Tokenizer(nlp.vocab)

def clean_token(token):
    """Return a cleaned-up version of a token's text, or '' if nothing is left.

    Only `token.text` is read. A token with nothing left after removing every
    character that is neither a word character nor whitespace (and then
    stripping) yields ''. Otherwise a single trailing ASCII punctuation
    character is dropped, quote/backtick/bracket characters are removed
    everywhere, and surrounding whitespace is stripped.

    Args:
        token: Object with a `.text` string attribute (a spaCy Token when
            used as the 'processed' extension getter).

    Returns:
        The cleaned string; it may be empty.
    """
    raw = token.text
    # Guard: pure-punctuation / whitespace-only tokens carry no content.
    if not re.sub(r'[^\w\s]', '', raw).strip():
        return ''
    # Only ONE trailing punctuation character is removed (e.g. 'x...' -> 'x..').
    if raw.endswith(tuple(string.punctuation)):
        raw = raw[:-1]
    return re.sub(r'[\"\”\'\`\(\)\[\]]', '', raw).strip()

# Register clean_token as the getter behind `token._.processed`, which
# SentenceWriter.line_to_sents reads for every token.
# force=True presumably lets this cell be re-run without an
# "already registered" error — confirm against the spaCy docs.
Token.set_extension('processed', getter=clean_token, force=True)

class SentenceWriter(object):
    """Convert a JSON-lines feed file into a one-sentence-per-line text file.

    Sentence splitting and tokenization rely on the module-level `nlp` and
    `tokenizer` objects, and token cleaning on the `processed` Token
    extension; all three must exist before `line_to_sents` is called.

    Attributes are plain copies of the constructor arguments:
        input_filepath: Path of the JSON-lines file to read.
        output_filepath: Path of the text file that sentences are appended to.
        min_sentence_tokens: Minimum number of non-empty cleaned tokens a
            sentence must have to be written (inclusive).
    """

    def __init__(self, input_filepath, output_filepath,
                 min_sentence_tokens=5):
        self.input_filepath = input_filepath
        self.output_filepath = output_filepath
        self.min_sentence_tokens = min_sentence_tokens

    def line_to_sents(self, line):
        """Extract cleaned sentences from one JSON-lines record.

        Args:
            line: JSON string of an object with a 'content' key whose value
                is iterated as documents (presumably a list of text
                strings; verify against the feed format).

        Returns:
            A list of sentences, each a space-joined string of cleaned
            tokens, holding only sentences with at least
            `min_sentence_tokens` non-empty tokens.
        """
        documents = json.loads(line)['content']
        sents = []
        for doc in nlp.pipe(documents):
            # Span.text replaces the deprecated Span.string alias; after
            # strip() both give the same sentence text.
            sentence_texts = (span.text.strip() for span in doc.sents)
            for sent in tokenizer.pipe(sentence_texts):
                words = [token._.processed for token in sent if len(token) > 0]
                # The cleaner returns '' for punctuation-only tokens; those
                # must neither count towards the minimum nor leave double
                # spaces in the joined sentence.
                words = [word for word in words if word]

                # '>=' so that min_sentence_tokens really is a minimum.
                if len(words) >= self.min_sentence_tokens:
                    sents.append(' '.join(words))
        return sents

    def preprocess(self):
        """Process the whole input file and append sentences to the output.

        The output file is opened in append mode, so running this twice
        duplicates the sentences. Progress is printed every 100 input lines.

        Returns:
            None.
        """
        with open(self.input_filepath, 'r', encoding='utf8') as fin, \
                open(self.output_filepath, 'a', encoding='utf8') as fout:
            for i, line in enumerate(fin):
                # Newline-terminate every sentence: joining with '\n' left
                # the last sentence of each record unterminated, fusing it
                # with the first sentence of the next record.
                fout.writelines(sent + '\n' for sent in self.line_to_sents(line))

                if i % 100 == 0:
                    print('Read %s lines' % i)
                        
# Keep the writer instance in `sw`; chaining .preprocess() onto the
# constructor bound `sw` to None, because preprocess() returns nothing.
sw = SentenceWriter('./data/feed/iltalehti.jl',
                    './data/preprocessed/test.csv')
sw.preprocess()

In [None]:
#sents = Parallel(n_jobs=4)(delayed(process_content)(l)
#                           for l in tqdm_notebook(df['content'].values[:1000]))

In [None]:
from gensim.models import Word2Vec

In [None]:
from gensim.models.word2vec import LineSentence

In [None]:
import logging
# Configure root logging at INFO level with timestamp, level and message —
# presumably so gensim's vocabulary/training progress is visible in the
# Word2Vec cells; confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Corpus handle over the preprocessed sentence file, passed to both
# build_vocab and train.
# NOTE(review): this is the path SentenceWriter writes to, not process_file's
# default './fwe/data/preprocessed/sents.csv' — confirm which corpus is intended.
sents = LineSentence('./data/preprocessed/test.csv')

In [None]:
# Create the model without a corpus; the vocabulary is built and training is
# run by separate build_vocab / train calls.
# NOTE(review): the `size` keyword looks like the pre-4.0 gensim spelling —
# confirm the pinned gensim version.
w2v = Word2Vec(
    min_count=10,
    window=4,
    size=100,
    workers=4
)

In [None]:
# Build the model vocabulary from the corpus before training; progress_per is
# passed as the float 1e6 — presumably the logging interval in sentences; verify.
w2v.build_vocab(sents, progress_per=1e6)

In [None]:
# Train on the same corpus, reusing the example count and epoch setting
# already stored on the model object.
w2v.train(
    sents,
    total_examples=w2v.corpus_count,
    epochs=w2v.epochs
)

In [None]:
# Spot-check the trained vectors by listing the nearest neighbours of one
# query word; presumably fails if the word did not reach min_count — verify.
w2v.wv.most_similar('homo')