In [None]:
import os
import gensim
import re

In [None]:
# Directory the word-embedding models are loaded from and saved to.
model_path = (
    '/media/sf_VBox_Shared/Arabic/Analyses/Fiqh_final2/wordembedding/'
)
# Directory of plain-text corpus files read by CorpusArabic.
corpus_path = (
    '/media/sf_VBox_Shared/Arabic/fiqh_corpus/txt/'
)

In [None]:
def normalize_arabic(text):
    """Reduce a string to normalized basic Arabic letters and spaces.

    Steps, in order:
      1. Fold the alef variants (alef with hamza below/above, alef wasla,
         alef with madda) into bare alef.
      2. Delete every character that is neither in U+0621-U+064A nor a space.
      3. Strip leading and trailing whitespace.
      4. Fold alef maqsura into ya and ta marbuta into ha.

    Alef folding has to run before the filter: alef wasla (U+0671) lies
    outside the kept range, so filtering first would delete it instead of
    mapping it to alef.

    Args:
        text (str): Raw text (a word or a whole line).

    Returns:
        str: The normalized text; empty if nothing Arabic remains.
    """
    # U+0625 alef/hamza below, U+0623 alef/hamza above, U+0671 alef wasla,
    # U+0622 alef/madda, U+0627 bare alef -> U+0627 bare alef.
    text = re.sub("[\u0625\u0623\u0671\u0622\u0627]", "\u0627", text)
    # Remove non-arabic characters.
    # NOTE(review): tatweel (U+0640) falls inside the kept range and is
    # therefore preserved - confirm this is intended.
    nonarab_chars = '[^\u0621-\u064A ]'
    text = re.sub(nonarab_chars, '', text)
    text = text.strip()
    # U+0649 alef maqsura -> U+064A ya
    text = re.sub("\u0649", "\u064A", text)
    # U+0629 ta marbuta -> U+0647 ha
    text = re.sub("\u0629", "\u0647", text)
    return text

class CorpusArabic(object):
    """Re-iterable stream of normalized, tokenized lines from a directory.

    Every entry of ``dirname`` is opened as a text file. Each line is split
    on single spaces, every token is passed through ``normalize_arabic``,
    empty tokens are dropped, and each non-empty token list is yielded.
    The object can be iterated repeatedly because the files are re-read on
    every ``__iter__`` call.

    Args:
        dirname (str): Directory containing the corpus text files.

    Yields:
        list[str]: The normalized tokens of one line.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        # sorted(): os.listdir returns entries in arbitrary order; a fixed
        # order makes every pass over the corpus see the same sequence.
        for fname in sorted(os.listdir(self.dirname)):
            fpath = os.path.join(self.dirname, fname)
            # Explicit encoding so decoding does not depend on the locale
            # (assumes the corpus files are UTF-8 - TODO confirm). The
            # `with` block closes each file instead of leaking the handle.
            with open(fpath, encoding='utf-8') as infile:
                for line in infile:
                    words = [normalize_arabic(w) for w in line.split(' ')]
                    words = [w for w in words if w]
                    if words:
                        yield words

## Finetune existing model

In [None]:
# Load the pretrained Word2Vec model stored under model_path.
pretrained_file = os.path.join(model_path, 'wikipedia_cbow_100')
model_original = gensim.models.Word2Vec.load(pretrained_file)

In [None]:
corpus = CorpusArabic(corpus_path)
# total_examples must describe the corpus being trained on now. gensim uses
# it for the learning-rate decay and progress logging, and
# model_original.corpus_count still holds the sentence count of the corpus
# the pretrained model was built from. Count the sentences of this corpus
# instead (costs one extra pass over the files).
n_sentences = sum(1 for _ in corpus)
# NOTE(review): the vocabulary is not extended here, so words absent from
# the pretrained vocabulary are presumably ignored during training; call
# build_vocab(corpus, update=True) first if they should be learned.
model_original.train(corpus, total_examples=n_sentences, epochs=5)

In [None]:
# Persist the complete fine-tuned model next to the pretrained one.
finetuned_file = os.path.join(model_path, 'wikipedia_cbow_100_finetuned')
model_original.save(finetuned_file)

In [None]:
# Persist only the word vectors (model_original.wv) of the fine-tuned model.
finetuned_vectors_file = os.path.join(model_path, 'wikipedia_cbow_100_finetuned_wv')
model_original.wv.save(finetuned_vectors_file)

## Train original model

In [None]:
# Untrained model: 100-dimensional vectors, context window of 5, 10 training
# iterations; sg=0 selects CBOW rather than skip-gram (per the gensim docs).
model_custom = gensim.models.word2vec.Word2Vec(sg=0, window=5, size=100, iter=10)

In [None]:
# Stream of normalized sentences read from the files under corpus_path.
corpus = CorpusArabic(dirname=corpus_path)

In [None]:
# Disabled experiment, kept for reference. NOTE(review): `model_custom_w` is
# not defined in any visible cell of this notebook.
#model_custom_w.reset_from(model_original)
# Scan the corpus once to build the model's vocabulary; the train call
# further down reads model_custom.corpus_count.
model_custom.build_vocab(corpus)

In [None]:
# Train on the same corpus the vocabulary was built from, so the model's own
# corpus_count and epochs settings are passed straight through.
model_custom.train(
    corpus,
    total_examples=model_custom.corpus_count,
    epochs=model_custom.epochs,
)

In [None]:
# Encode the hyperparameters (iterations, vector size, window, sg flag) in
# the file name.
name_fields = (
    model_custom.epochs,
    model_custom.vector_size,
    model_custom.window,
    model_custom.sg,
)
fname = 'fiqh-norm-i{}-s{}-w{}-sg{}'.format(*name_fields)

In [None]:
# Save the full model, then its word vectors under the same name + '_wv'.
model_file = os.path.join(model_path, fname)
model_custom.save(model_file)
model_custom.wv.save(model_file + '_wv')

## Train model on stemmed corpus

Use `adhtools.stemmed-xml-to-txt` to make txt files from the XML files; each line corresponds to one chunk.

In [None]:
# Disabled draft of a reader that pulls the `stem` attribute of every
# `analysis` element straight from the XML files; kept for reference only.
# NOTE(review): as written it would not run - `fn` is never defined (the loop
# variable is `fname`, and it is not joined with self.dirname) - and it
# yields all stems of a file as one single sentence.
# from lxml import etree

# class CorpusStemmedArabic(object):
#     def __init__(self, dirname):
#         self.dirname = dirname
 
#     def __iter__(self):
#         for fname in os.listdir(self.dirname):
#             context = etree.iterparse(fn, events=('end', ),
#                                           tag=('analysis'))
#             yield [elem.attrib['stem'] for event, elem in context]

In [None]:
# Fresh untrained model for the stemmed corpus (rebinds model_custom):
# 100-dimensional vectors, context window of 5, 10 training iterations;
# sg=0 selects CBOW rather than skip-gram (per the gensim docs).
model_custom = gensim.models.word2vec.Word2Vec(sg=0, window=5, size=100, iter=10)

In [None]:
# corpus_path = '/media/sf_VBox_Shared/Arabic/Fiqh/2018-08-14-stemmed-LIGHT10'
# corpus = CorpusArabic(corpus_path)

In [None]:
# Directory of stemmed plain-text files (one chunk per line, as produced by
# adhtools.stemmed-xml-to-txt).
txt_path = '/media/sf_VBox_Shared/Arabic/Fiqh/2019-02-18-Fiqh-LIGHT10-txt/'
# gensim's directory reader; rebinds `corpus`, so the cells below now train
# on the stemmed text. No normalize_arabic step is applied on this path.
corpus = gensim.models.word2vec.PathLineSentences(txt_path)

In [None]:
# Scan the stemmed corpus once to build the vocabulary; the train call
# further down reads model_custom.corpus_count.
model_custom.build_vocab(corpus)

In [None]:
# Train on the same stemmed corpus the vocabulary was built from, using the
# model's own corpus_count and epochs settings.
model_custom.train(
    corpus,
    total_examples=model_custom.corpus_count,
    epochs=model_custom.epochs,
)

In [None]:
# Encode the hyperparameters (iterations, vector size, window, sg flag) in
# the file name of the stemmed-corpus model.
name_fields = (
    model_custom.epochs,
    model_custom.vector_size,
    model_custom.window,
    model_custom.sg,
)
fname = 'stemmed-fiqh-i{}-s{}-w{}-sg{}'.format(*name_fields)

In [None]:
# Save the full model, then its word vectors under the same name + '_wv'.
model_file = os.path.join(model_path, fname)
model_custom.save(model_file)
model_custom.wv.save(model_file + '_wv')

In [None]:
# The original cell held an unfinished attribute access (`model_custom.`),
# which is a SyntaxError and stops a Restart & Run All. Print the model's
# string representation instead as a final sanity check.
print(model_custom)