In [2]:
# -*- coding: utf8 -*-
import gensim
import re
import numpy as np
from nltk import ngrams

# =========================
# ==== Helper Methods =====

# Clean/Normalize Arabic Text
def clean_str(text):
    """Clean/normalize Arabic text.

    The steps run in this order:
      1. remove tashkeel (code points U+0617-U+061A and U+064B-U+0652);
      2. shorten any run of one repeated character to exactly two;
      3. reduce the doubled letters 'وو', 'يي' and 'اا' to a single letter;
      4. apply the literal ``search`` -> ``replace`` table, pair by pair;
      5. strip leading and trailing whitespace.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.
    """
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى","\\",'\n', '\t','&quot;','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ']

    # Remove tashkeel.
    text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', "", text)

    # Remove elongation: a character repeated two or more times is kept twice.
    text = re.sub(r'(.)\1+', r"\1\1", text)

    # Doubled letters that survive the previous step are reduced to one.
    for doubled, single in (('وو', 'و'), ('يي', 'ي'), ('اا', 'ا')):
        text = text.replace(doubled, single)

    # The table is order-sensitive, so walk it front to back.
    for old, new in zip(search, replace):
        text = text.replace(old, new)

    # Trim.
    return text.strip()

def get_vec(n_model, dim, token):
    """Look up a vector for ``token`` in ``n_model.wv``.

    If ``token`` is in ``n_model.wv`` its stored vector is returned as-is.
    Otherwise the token is split on "_" and the vectors of the parts that are
    in ``n_model.wv`` are averaged. If no part is found, a zero vector of
    length ``dim`` is returned.

    Args:
        n_model: object whose ``wv`` attribute supports ``in`` and ``[]``.
        dim: length of the zero vector used for the fallback accumulation.
        token: the (possibly "_"-joined) token to look up.

    Returns:
        The vector for ``token``, the mean of its known parts, or zeros.
    """
    # Direct hit: no accumulator is needed.
    if token in n_model.wv:
        return n_model.wv[token]

    vec = np.zeros(dim)
    known_parts = 0
    for part in token.split("_"):
        if part in n_model.wv:
            known_parts += 1
            vec += n_model.wv[part]
    # Only divide when something was summed; otherwise keep the zero vector.
    if known_parts > 0:
        vec = vec / known_parts
    return vec

def calc_vec(pos_tokens, neg_tokens, n_model, dim):
    """Combine token vectors into a single query vector.

    Starts from a zero vector of length ``dim``, adds ``get_vec`` of every
    token in ``pos_tokens`` and subtracts ``get_vec`` of every token in
    ``neg_tokens``.

    Returns:
        The resulting numpy vector.
    """
    combined = np.zeros(dim)
    for token in pos_tokens:
        combined += get_vec(n_model, dim, token)
    for token in neg_tokens:
        combined -= get_vec(n_model, dim, token)
    return combined

## -- Retrieve all ngrams for a text in between a specific range
def get_all_ngrams(text, nrange=3):
    """Return the n-grams of ``text`` for every n from 2 up to ``nrange``.

    The punctuation listed in the regex is replaced with spaces, the text is
    split on single spaces, and blank pieces are dropped. The words of each
    n-gram are joined with "_". The bigrams come first, then the trigrams,
    and so on.
    """
    spaced = re.sub(r'[\,\.\;\(\)\[\]\_\+\#\@\!\?\؟\^]', ' ', text)
    words = [piece for piece in spaced.split(" ") if piece.strip()]
    grams = []
    for size in range(2, nrange + 1):
        grams.extend(ngrams(words, size))
    return ["_".join(gram) for gram in grams if gram]

## -- Retrieve all ngrams for a text in a specific n
def get_ngrams(text, n=2):
    """Return the ``n``-grams of ``text``, each joined with "_".

    The punctuation listed in the regex is replaced with spaces, the text is
    split on single spaces, and blank pieces are dropped before the n-grams
    are built.
    """
    spaced = re.sub(r'[\,\.\;\(\)\[\]\_\+\#\@\!\?\؟\^]', ' ', text)
    words = [piece for piece in spaced.split(" ") if piece.strip()]
    grams = list(ngrams(words, n))
    return ["_".join(gram) for gram in grams if gram]

## -- filter the existed tokens in a specific model
def get_existed_tokens(tokens, n_model):
    """Return the tokens that are in ``n_model.wv``, keeping order and duplicates."""
    found = []
    for candidate in tokens:
        if candidate in n_model.wv:
            found.append(candidate)
    return found





# ============================   
# ====== N-Grams Models ======

# Load the n-gram Word2Vec model from a path relative to the working directory.
t_model = gensim.models.Word2Vec.load('models/full_grams_cbow_100_twitter.mdl')

# Build the query token: normalize the phrase, then join its words with "_".
# python 3.X
token = clean_str(u'ابو تريكه').replace(" ", "_")
# python 2.7
# token = clean_str(u'ابو تريكه'.decode('utf8', errors='ignore')).replace(" ", "_")

# Only query when the token is in the model; otherwise nothing is printed.
if token in t_model.wv:
    most_similar = t_model.wv.most_similar( token, topn=10 )
    for term, score in most_similar:
        # Normalize each neighbour the same way as the query before comparing,
        # so a neighbour that normalizes to the query itself is skipped.
        term = clean_str(term).replace(" ", "_")
        if term != token:
            print(term, score)

# تريكه 0.752911388874054
# حسام_غالي 0.7516342401504517
# وائل_جمعه 0.7244222164154053
# وليد_سليمان 0.7177559733390808
# ...

# =========================================
# == Get the most similar tokens to a compound query
# most similar to 
# عمرو دياب + الخليج - مصر

# Normalize each query phrase and join its words with "_"; blank phrases are skipped.
pos_tokens=[ clean_str(t.strip()).replace(" ", "_") for t in ['عمرو دياب', 'الخليج'] if t.strip() != ""]
neg_tokens=[ clean_str(t.strip()).replace(" ", "_") for t in ['مصر'] if t.strip() != ""]

# Query vector = sum of the positive token vectors minus the negative ones (see calc_vec).
vec = calc_vec(pos_tokens=pos_tokens, neg_tokens=neg_tokens, n_model=t_model, dim=t_model.vector_size)

most_sims = t_model.wv.similar_by_vector(vec, topn=10)
for term, score in most_sims:
    # Do not print the query tokens themselves.
    if term not in pos_tokens+neg_tokens:
        print(term, score)

# راشد_الماجد 0.7094649076461792
# ماجد_المهندس 0.6979793906211853
# عبدالله_رويشد 0.6942606568336487
# ...

# ====================
# ====================




# ============================== 
# ====== Uni-Grams Models ======

##t_model = gensim.models.Word2Vec.load('models/full_uni_cbow_100_twitter.mdl')

# python 3.X
##token = clean_str(u'تونس')
# python 2.7
# token = clean_str('تونس'.decode('utf8', errors='ignore'))

##most_similar = t_model.wv.most_similar( token, topn=10 )
##for term, score in most_similar:
####    print(term, score)

# ليبيا 0.8864325284957886
# الجزائر 0.8783721327781677
# السودان 0.8573237061500549
# مصر 0.8277812600135803
# ...



# Get the vector stored for `token`. The uni-gram section above is commented
# out, so this reads the n-gram model and the n-gram query token defined earlier.
# NOTE(review): unlike the most_similar lookup, this is not guarded by
# `token in t_model.wv`; presumably it raises for an out-of-vocabulary token -- confirm.
word_vector = t_model.wv[ token ]

ابوتريكه 0.9565805792808533
حازم_امام 0.864891767501831
وائل_جمعه 0.8543370366096497
تريكه 0.8521531820297241
حسام_غالي 0.846001148223877
عماد_متعب 0.8435681462287903
حسن_شحاته 0.8425122499465942
عمرو_زكي 0.8408412337303162
حسام_حسن 0.8271308541297913
رمضان_صبحي 0.8270741701126099
راشد_الماجد 0.7094648480415344
ماجد_المهندس 0.6979794502258301
عبدالله_رويشد 0.6942605376243591
عبدالله_الرويشد 0.6927955746650696
خالد_عبدالرحمن 0.6894348859786987
رابح_صقر 0.684174120426178
عبدالمجيد_عبدالله 0.684122622013092
محمد_عبده 0.6824554204940796
نبيل_شعيل 0.6798837184906006
زايد_الصالح 0.6735830903053284


In [4]:
# -*- coding: utf8 -*-
import gensim
import re
import numpy as np
from nltk import ngrams

# =========================
# ==== Helper Methods =====

# Clean/Normalize Arabic Text
def clean_str(text):
    """Clean/normalize Arabic text.

    The steps run in this order:
      1. remove tashkeel (code points U+0617-U+061A and U+064B-U+0652);
      2. shorten any run of one repeated character to exactly two;
      3. reduce the doubled letters 'وو', 'يي' and 'اا' to a single letter;
      4. apply the literal ``search`` -> ``replace`` table, pair by pair;
      5. strip leading and trailing whitespace.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.
    """
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى","\\",'\n', '\t','&quot;','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ']

    # Remove tashkeel.
    text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', "", text)

    # Remove elongation: a character repeated two or more times is kept twice.
    text = re.sub(r'(.)\1+', r"\1\1", text)

    # Doubled letters that survive the previous step are reduced to one.
    for doubled, single in (('وو', 'و'), ('يي', 'ي'), ('اا', 'ا')):
        text = text.replace(doubled, single)

    # The table is order-sensitive, so walk it front to back.
    for old, new in zip(search, replace):
        text = text.replace(old, new)

    # Trim.
    return text.strip()

def get_vec(n_model, dim, token):
    """Look up a vector for ``token`` in ``n_model.wv``.

    If ``token`` is in ``n_model.wv`` its stored vector is returned as-is.
    Otherwise the token is split on "_" and the vectors of the parts that are
    in ``n_model.wv`` are averaged. If no part is found, a zero vector of
    length ``dim`` is returned.

    Args:
        n_model: object whose ``wv`` attribute supports ``in`` and ``[]``.
        dim: length of the zero vector used for the fallback accumulation.
        token: the (possibly "_"-joined) token to look up.

    Returns:
        The vector for ``token``, the mean of its known parts, or zeros.
    """
    # Direct hit: no accumulator is needed.
    if token in n_model.wv:
        return n_model.wv[token]

    vec = np.zeros(dim)
    known_parts = 0
    for part in token.split("_"):
        if part in n_model.wv:
            known_parts += 1
            vec += n_model.wv[part]
    # Only divide when something was summed; otherwise keep the zero vector.
    if known_parts > 0:
        vec = vec / known_parts
    return vec

def calc_vec(pos_tokens, neg_tokens, n_model, dim):
    """Combine token vectors into a single query vector.

    Starts from a zero vector of length ``dim``, adds ``get_vec`` of every
    token in ``pos_tokens`` and subtracts ``get_vec`` of every token in
    ``neg_tokens``.

    Returns:
        The resulting numpy vector.
    """
    combined = np.zeros(dim)
    for token in pos_tokens:
        combined += get_vec(n_model, dim, token)
    for token in neg_tokens:
        combined -= get_vec(n_model, dim, token)
    return combined

## -- Retrieve all ngrams for a text in between a specific range
def get_all_ngrams(text, nrange=3):
    """Return the n-grams of ``text`` for every n from 2 up to ``nrange``.

    The punctuation listed in the regex is replaced with spaces, the text is
    split on single spaces, and blank pieces are dropped. The words of each
    n-gram are joined with "_". The bigrams come first, then the trigrams,
    and so on.
    """
    spaced = re.sub(r'[\,\.\;\(\)\[\]\_\+\#\@\!\?\؟\^]', ' ', text)
    words = [piece for piece in spaced.split(" ") if piece.strip()]
    grams = []
    for size in range(2, nrange + 1):
        grams.extend(ngrams(words, size))
    return ["_".join(gram) for gram in grams if gram]

## -- Retrieve all ngrams for a text in a specific n
def get_ngrams(text, n=2):
    """Return the ``n``-grams of ``text``, each joined with "_".

    The punctuation listed in the regex is replaced with spaces, the text is
    split on single spaces, and blank pieces are dropped before the n-grams
    are built.
    """
    spaced = re.sub(r'[\,\.\;\(\)\[\]\_\+\#\@\!\?\؟\^]', ' ', text)
    words = [piece for piece in spaced.split(" ") if piece.strip()]
    grams = list(ngrams(words, n))
    return ["_".join(gram) for gram in grams if gram]

## -- filter the existed tokens in a specific model
def get_existed_tokens(tokens, n_model):
    """Return the tokens that are in ``n_model.wv``, keeping order and duplicates."""
    found = []
    for candidate in tokens:
        if candidate in n_model.wv:
            found.append(candidate)
    return found





# ============================   
# ====== N-Grams Models ======

# Load the n-gram Word2Vec model from a path relative to the working directory.
t_model = gensim.models.Word2Vec.load('models/full_grams_cbow_100_twitter.mdl')

# Build the query token: normalize the phrase, then join its words with "_".
# python 3.X
token = clean_str(u'سرقة').replace(" ", "_")
# python 2.7
# token = clean_str(u'سرقة'.decode('utf8', errors='ignore')).replace(" ", "_")

# Only query when the token is in the model; otherwise nothing is printed.
if token in t_model.wv:
    most_similar = t_model.wv.most_similar( token, topn=10 )
    for term, score in most_similar:
        # Normalize each neighbour the same way as the query before comparing,
        # so a neighbour that normalizes to the query itself is skipped.
        term = clean_str(term).replace(" ", "_")
        if term != token:
            print(term, score)

# NOTE: the sample output below is for the query 'ابو تريكه', not for 'سرقة' used in this cell.
# تريكه 0.752911388874054
# حسام_غالي 0.7516342401504517
# وائل_جمعه 0.7244222164154053
# وليد_سليمان 0.7177559733390808
# ...

# =========================================
# == Get the most similar tokens to a compound query
# most similar to 
# انطوان سماحة + سعودية - لبنان

# Normalize each query phrase and join its words with "_"; blank phrases are skipped.
pos_tokens=[ clean_str(t.strip()).replace(" ", "_") for t in ['انطوان سماحة', 'سعودية'] if t.strip() != ""]
neg_tokens=[ clean_str(t.strip()).replace(" ", "_") for t in ['لبنان'] if t.strip() != ""]

# Query vector = sum of the positive token vectors minus the negative ones (see calc_vec).
vec = calc_vec(pos_tokens=pos_tokens, neg_tokens=neg_tokens, n_model=t_model, dim=t_model.vector_size)

most_sims = t_model.wv.similar_by_vector(vec, topn=10)
for term, score in most_sims:
    # Do not print the query tokens themselves.
    if term not in pos_tokens+neg_tokens:
        print(term, score)

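# NOTE: the sample output below is for 'عمرو دياب + الخليج - مصر', not for the tokens queried in this cell.
# راشد_الماجد 0.7094649076461792
# ماجد_المهندس 0.6979793906211853
# عبدالله_رويشد 0.6942606568336487
# ...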

# ====================
# ====================




# ============================== 
# ====== Uni-Grams Models ======

##t_model = gensim.models.Word2Vec.load('models/full_uni_cbow_100_twitter.mdl')

# python 3.X
##token = clean_str(u'تونس')
# python 2.7
# token = clean_str('تونس'.decode('utf8', errors='ignore'))

##most_similar = t_model.wv.most_similar( token, topn=10 )
##for term, score in most_similar:
####    print(term, score)

# ليبيا 0.8864325284957886
# الجزائر 0.8783721327781677
# السودان 0.8573237061500549
# مصر 0.8277812600135803
# ...



# Get the vector stored for `token`. The uni-gram section above is commented
# out, so this reads the n-gram model and the n-gram query token defined earlier.
# NOTE(review): unlike the most_similar lookup, this is not guarded by
# `token in t_model.wv`; presumably it raises for an out-of-vocabulary token -- confirm.
word_vector = t_model.wv[ token ]

وسرقه 0.8054364919662476
مصادره 0.745844304561615
لسرقه 0.732149600982666
اتلاف 0.7281239032745361
حيازه 0.7137411832809448
تهريب 0.71235191822052
اختلاس 0.7055280208587646
سرقات 0.7028773427009583
بيع 0.6991260051727295
خطف 0.6953885555267334
قصيميه 0.613539457321167
عتيبيه 0.5760815143585205
قرويه 0.5639134049415588
حجازيه 0.549694299697876
طخمه 0.5465852618217468
مصوره 0.5462727546691895
بكستانيه 0.5425755977630615
رسامه 0.5425392389297485
قصمنجيه 0.540238082408905
غامديه 0.5364717245101929


In [5]:
# Show the vector fetched for `token` in the previous cell.
print(word_vector)
# NOTE(review): `word2vec_vectors` is not defined anywhere in the visible cells.
#index2word_set = set(word2vec_vectors.wv.index2word)

[ 0.12898168 -0.12776525 -2.3782592   1.5905364   2.1040397   0.6249796
  1.6603498   0.26098573  0.19134237 -0.38898647  0.06065198 -0.30491403
  3.223809   -0.5292832   0.18292628 -3.4605308  -3.1353152  -0.39001396
  0.7233246   0.93164885 -1.71681     1.3676271  -2.7011962   2.3073795
 -0.19108006  0.575296   -0.71562177 -2.1097858   0.9799234   0.296826
  0.6394561  -0.4804854   0.41199136 -0.73732334 -3.0052764   3.4832702
  1.2604014   1.2374989  -0.9858779   0.912821   -0.9510982   1.1159695
  1.2028342  -2.1836925  -0.5642931   1.4381927   0.9122004  -1.4751711
  2.0118773  -2.2427366  -1.0178751   3.3120286  -3.4177668   1.0405232
 -2.3204424  -2.8205185   0.8214644  -3.101375   -0.8910767  -0.835392
  1.2876183   0.38810432  1.2272987  -1.6125118   0.22605917 -1.6748503
 -1.4342185   0.519484   -0.03773731 -0.12334368 -0.9083439   1.500213
 -1.8346211  -1.3816308  -1.3337007   0.64141244 -0.7631837  -0.5235907
 -0.9442183  -1.9284645  -0.7089352   1.2635968   1.4791634   1.9

In [28]:
#collection1=collection[2:9]
#for i in range(len(collection1)):
 #   word_vector=t_model.wv(collection1[i].split())
# Sample sentence; it is not used by the statements below.
collection1='محضر تحقيق بالشكوى المقدمة من المدعوة رانيا المصري صاحبة مؤسسة بوكالة المحامية نسرين حرب ضد المدعوة هدى فرحات بجرم قدح وذم ومخالفة قرار إداري '
#for i in range(len(collection1)):

# Vocabulary as a list (despite the `_set` suffix in the name).
# gensim 4.0 renamed `index2word` to `index_to_key` and the old attribute now
# raises, so try the new name first and fall back to the gensim 3.x name.
index2word_set = getattr(t_model.wv, "index_to_key", None)
if index2word_set is None:
    index2word_set = t_model.wv.index2word

# Peek at the first 100 vocabulary entries and at one stored vector.
print(index2word_set[0:100])
print(t_model.wv['من'])



['.', '%', ':', 'من', '#', 'رابطويب', 'في', 'الله', ')', '(', 'و', 'علي', '!', 'لا', 'ان', 'ما', ']', '؟', 'اللهم', 'الا', 'كل', 'انا', 'يا', ',', 'اللي', 'ولا', 'الي', 'لا_اله', 'ي', 'عن', '…', 'هو', 'بس', 'انت', 'ب', 'لي', 'مع', 'م', 'ل', 'هذا', 'اللهم_اني', 'لو', 'ف', 'هه', 'كان', 'لك', 'اذا', 'الله_اكبر', 'او', '”', 'بعد', 'والله', 'ه', 'ع', 'الا_الله', 'الناس', 'ا', 'سبحان', 'حتي', 'فيه', 'له', 'ال', 'يوم', 'اي', '*', 'قلبي', 'الذي', 'ك', 'اني', 'شي', 'انك', '?', 'قال', 'احد', 'وانا', 'انه', 'هي', 'قبل', 'يارب', 'استغفر', 'اليوم', 'الحمدلله', 'مش', 'ت', ';', '“', 'ربي', 'يعني', 'غير', 'لما', 'شاهد', 'وكل', 'خير', 'عليه', 'الدنيا', 'ن', 'رب', '♡', '=', 'غرد_بذكر']
[-2.03215146e+00  2.36172247e+00 -1.07531315e-02 -1.33089781e+00
  4.09337610e-01  1.67818296e+00  4.49576092e+00  1.24167097e+00
  2.07176134e-01 -5.34855890e+00 -1.27874267e+00  2.15164528e-05
  2.37422538e+00 -3.59334707e-01 -8.83399129e-01 -2.79245257e+00
 -2.01776457e+00 -1.24322498e+00  1.48167551e+00 -2.01816034e+0