In [1]:
import bs4 
import requests as req
import re
import pandas as pd
import numpy as np
import datetime

In [2]:
#generate_arabic_sentences(sentence_length=4,max_sents=100000,file_name_to_save="arabic_sentences_4_dense3",min_diacritics=3)
#generate_arabic_sentences(sentence_length=3,file_name_to_save="arabic_sentences_3_huge")
#generate_arabic_sentences(sentence_length=4,file_name_to_save="arabic_sentences_4_huge")

In [51]:
# Step 1: load the sentence pairs from a previously generated CSV
# (generate_arabic_sentences() can be used to produce a new file).
# `.values` converts the DataFrame into a NumPy array of rows.
all_data = pd.read_csv("./arabic_sentences_4_dense3.csv").values

# Step 2: the training-set sizes to produce, and the validation size
# expressed as a fraction of each training size.
trainig_lengths = [1000, 3000, 5000, 10000, 20000, 50000]
test_size = 0.5  # fraction of the training size

for length in trainig_lengths:
    # Step 3: take just enough rows for training plus validation, then split.
    training_data, validation_data = split_train_test(
        all_data[: int(length * (1 + test_size))],
        length,
    )

    # Step 4: write both sets to CSV; the validation file name also records
    # how many rows validation_data ended up with.
    save_list(training_data, f"trainig_data_{length}")
    save_list(validation_data, f"test_data_{length}_{len(validation_data)}")

In [40]:
def generate_arabic_sentences(sentence_length,
                              base_link="http://www.baheth.info/all.jsp?term=",
                              word_list=None,
                              max_sents=1000000,
                              file_name_to_save=None,
                              include_index=False,
                              min_diacritics=1):
    """Scrape sentence pairs (without / with diacritics) and save them as CSV.

    One link is built per entry of ``word_list`` by appending it to
    ``base_link``; each page is turned into fixed-length sentences and the
    results are accumulated and written with ``save_list``.

    Parameters
    ----------
    sentence_length : number of words per sentence.
    base_link : URL prefix to which each search term is appended.
    word_list : iterable of search terms. When None (the default) the terms
        are fetched with ``get_words`` from the seed page at call time.
    max_sents : scraping stops after the first page that pushes the total
        number of sentences above this value (the result is not truncated).
    file_name_to_save : CSV base name (no extension). When None (the default)
        ``"sentences" + str(datetime.datetime.now())`` is computed at call time.
    include_index : forwarded to ``save_list``.
    min_diacritics : forwarded to ``get_sents_concatenated``.
    """
    # Defaults are resolved here, not in the signature: a default expression
    # in the signature runs once when the function is defined, which would
    # trigger a network request on definition and freeze the timestamp.
    if word_list is None:
        word_list = get_words("http://www.baheth.info/all.jsp?term=عمر")
    if file_name_to_save is None:
        file_name_to_save = "sentences"+str(datetime.datetime.now())

    # One search URL per term.
    links = generate_links(base_link,word_list)

    # Row 0 accumulates the sentences without diacritics, row 1 the originals.
    all_sentences = np.array([[],[]])
    for link in links:
        conc = get_sents_concatenated(link,sentence_length,min_diacritics)
        all_sentences = append(all_sentences,conc)
        print(all_sentences.shape)
        if(all_sentences.shape[1] > max_sents):
            break

    # Transpose to one (no_diacritics, with_diacritics) pair per row and save.
    all_sents = np.transpose(all_sentences)
    save_list(all_sents,file_name_to_save,include_index=include_index)
    

In [43]:
def split_train_test(all_data,size_train,filter_valids_to_training_vocab=True):
    """Split rows into a training part and a validation part.

    Parameters
    ----------
    all_data : sliceable sequence of rows; the first element of each row is
        the sentence text used for vocabulary filtering.
    size_train : number of leading rows that go to the training set.
    filter_valids_to_training_vocab : when True, validation rows containing a
        word that never occurs in the training rows are dropped.

    Returns
    -------
    (training_data, validation_data)
    """
    # Both slices use the same boundary so the training set has exactly
    # size_train rows and no row appears in both sets.
    training_data = all_data[:size_train]
    validation_data = all_data[size_train:]

    if filter_valids_to_training_vocab:
        vocab = get_vocab_from_sents(training_data)
        validation_data = remove_non_vocab_sents(validation_data,vocab)

    return training_data,validation_data

In [45]:
def remove_non_vocab_sents(all_sents,vocab):
    """Keep only the rows whose first element uses words found in ``vocab``.

    Parameters
    ----------
    all_sents : iterable of two-element rows; element 0 is the sentence that
        is checked with ``sent_words_in_sents``.
    vocab : collection of allowed words.

    Returns
    -------
    A 2-D NumPy array with two columns holding the surviving rows; an empty
    ``(0, 2)`` array when nothing survives.
    """
    # Filter first, then build the array in one step: calling np.append per
    # row copies the whole accumulated array every time.
    kept = [sent for sent in all_sents if sent_words_in_sents(sent[0],vocab)]
    if not kept:
        return np.empty((0,2))
    # Appending to the empty (0, 2) array keeps the same dtype promotion as
    # appending the rows one at a time.
    return np.append(np.empty((0,2)),kept,axis=0)

In [20]:
def get_vocab_from_sents(list_of_sents):
    """Return the set of distinct tokens in the first element of every row.

    Each row's first element is split on single spaces; all resulting tokens
    are gathered into one set.
    """
    return {
        token
        for row in list_of_sents
        for token in row[0].split(" ")
    }

In [21]:
def sent_words_in_sents(sent,word_list):
    """Return True when every space-separated token of ``sent`` is in ``word_list``."""
    return all(token in word_list for token in sent.split(" "))

In [22]:
def generate_links(base,terms):
    """Build one link per term by appending the term to ``base``.

    Returns a list in the same order as ``terms``.
    """
    return [base+term for term in terms]

In [23]:
def get_words(link):
    """Fetch the page at ``link`` and return its words without diacritics.

    The page text comes from ``get_arabic_text``; it is split on spaces,
    filtered by ``clean_words`` and then passed through
    ``remove_diacritics_from_list``.
    """
    raw_tokens = get_arabic_text(link).split(" ")
    return remove_diacritics_from_list(clean_words(raw_tokens))

In [24]:
def get_sents_concatenated(link,length=4,min_diacritics=1):
    """Return the sentences of a page paired with their diacritic-free form.

    The result is what ``concat`` builds from two lists: first the sentences
    with diacritics removed, second the sentences as returned by ``get_sents``.
    """
    with_marks = get_sents(link,length,min_diacritics)
    without_marks = remove_diacritics_from_list(with_marks)
    return concat(without_marks,with_marks)

In [25]:
def get_sents(link,length=4,min_diacritics=1):
    """Fetch a page, cut it into sentences and return the ones that pass cleaning.

    Pipeline: ``get_arabic_text`` -> ``split_to_sentences`` -> ``clean_sents``.
    """
    page_text = get_arabic_text(link)
    candidates = split_to_sentences(page_text,length)
    return clean_sents(candidates,length,min_diacritics)

In [26]:
def concat(list_a,list_b):
    """Stack the two sequences as the two rows of a single NumPy array."""
    return np.array([list_a,list_b])

In [27]:
def append(mat_a,mat_b):
    """Extend each of the two rows of ``mat_a`` with the matching row of ``mat_b``.

    Row 0 of the result is row 0 of ``mat_a`` followed by row 0 of ``mat_b``;
    likewise for row 1. The two extended rows are combined with ``concat``.
    """
    merged_rows = [np.append(mat_a[row],mat_b[row]) for row in (0,1)]
    return concat(merged_rows[0],merged_rows[1])

In [28]:
def contains_diacritics(word):
    """Search ``word`` for an Arabic diacritic.

    Returns the ``re.Match`` for the first code point in U+0618-U+061A or
    U+064B-U+0653, or None when there is none.
    """
    # No "|" between the two ranges: inside a character class it is a literal
    # pipe, not alternation, and would make "|" count as a diacritic.
    regex = "[\u0618-\u061A\u064B-\u0653]"
    return re.search(regex,word)

In [29]:
def num_of_diacritics(sentence):
    """Count the words of ``sentence`` that carry at least one diacritic.

    The sentence is split on single spaces. A word is counted once however
    many diacritic marks it contains, so the result is a number of words,
    not a number of marks.
    """
    # No "|" between the two ranges: inside a character class it is a literal
    # pipe, not alternation, and would make words containing "|" count.
    regex = "[\u0618-\u061A\u064B-\u0653]"
    return sum(1 for word in sentence.split(" ") if re.search(regex,word))

In [30]:
def clean_sents(sents,length=4,min_diacritics=1):
    """Filter sentences by word count and by diacritic count.

    A sentence is kept when splitting it on single spaces gives exactly
    ``length`` pieces and ``num_of_diacritics`` reports at least
    ``min_diacritics`` for it. Order is preserved.
    """
    return [
        sent
        for sent in sents
        # the word-count test runs first; diacritics are only counted for
        # sentences of the right length
        if len(sent.split(" ")) == length
        and num_of_diacritics(sent) >= min_diacritics
    ]

In [31]:
def split_to_sentences(text,length=4):
    """Cut ``text`` into consecutive sentences of ``length`` words each.

    The text is split on single spaces and filtered with ``clean_words``;
    the remaining words are grouped in order, ``length`` at a time, and each
    group is joined with single spaces. The last sentence holds the leftover
    words and may therefore be shorter than ``length``. An empty word list
    gives an empty result.
    """
    words = clean_words(text.split(" "))
    # Slicing in steps of `length` keeps every word, including the final one,
    # so a word count that is an exact multiple of `length` yields a full
    # last sentence.
    return [
        " ".join(words[start:start+length]).strip()
        for start in range(0,len(words),length)
    ]

In [32]:
def clean_words(words):
    """Drop words that are too short, too long, or contain a pipe.

    A word is kept when its stripped length is between 2 and 14 characters
    inclusive and it has no "|" in it. Kept words are returned unmodified
    (not stripped), in their original order.
    """
    return [
        word
        for word in words
        if 1 < len(word.strip()) < 15 and "|" not in word
    ]

In [33]:
def get_arabic_text(link):
    """Download ``link`` and return only the Arabic-range characters of the page.

    Every character outside U+0621-U+0655, "|" and space is removed from the
    response text, then newlines are removed.

    Returns an empty string when ``link`` is None or is rejected by
    ``check_url``, so callers can always call ``.split`` on the result.
    """
    # None has to be tested before check_url, which would raise on a non-string.
    if link is None or not check_url(link):
        return ""

    #request
    res = req.get(link)

    # NOTE: the "|" inside the class is a literal pipe, so pipes survive this
    # step; clean_words later discards any token that contains one.
    ar_text = re.sub('((?![\u0621-\u0655| ]).)',"",res.text)
    # "." does not match newlines, so they are still present and are
    # removed here.
    return ar_text.replace("\n","")

In [34]:
def cramp_page(link):
    """Download ``link`` and return the set of distinct Arabic-range tokens on it.

    Every character outside U+0621-U+0655, "|" and space is removed from the
    response text, newlines are removed, and the remainder is split on
    single spaces.

    Returns an empty set when ``link`` is None or is rejected by ``check_url``.
    """
    # None has to be tested before check_url, which would raise on a non-string.
    if link is None or not check_url(link):
        return set()

    #request
    res = req.get(link)

    # The "|" inside the class is a literal pipe, so pipes are kept as well.
    ar_text = re.sub('((?![\u0621-\u0655| ]).)',"",res.text)
    # "." does not match newlines, so they are still present and are
    # removed here.
    ar_text = ar_text.replace("\n","")

    return set(ar_text.split(" "))

In [35]:
def get_nested_links(home_link,link):
    """Collect the links on the page at ``link`` that belong to ``home_link``.

    Parameters
    ----------
    home_link : site prefix. It is prepended to hrefs starting with "/", and
        other hrefs are accepted only when they start with it.
    link : URL of the page to scan.

    Returns
    -------
    A set of URLs that pass ``check_url``. The set is empty when ``link`` is
    None or is itself rejected by ``check_url``, so the return type is always
    a set.
    """
    # None has to be tested before check_url, which would raise on a non-string.
    if link is None or not check_url(link):
        return set()

    #request
    res = req.get(link)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs4.BeautifulSoup(res.text,"html.parser")
    hrefs = {a.get('href') for a in soup.find_all('a')} # unique nested links

    valid_links = set()

    for href in hrefs:
        # skip anchors without a usable href
        if not href:
            continue

        if href.startswith('/'):
            # site-relative link: make it absolute
            candidate = home_link + href
        elif href.startswith(home_link):
            # absolute link that stays on the same site
            candidate = href
        else:
            continue

        if check_url(candidate):
            valid_links.add(candidate)

    return valid_links

In [36]:
def remove_diacritics_from_list(alist):
    """Apply ``remove_diacritis`` to every item and return the results as a new list."""
    return [remove_diacritis(item) for item in alist]

In [37]:
def remove_diacritis(word):
    """Return ``word`` with Arabic diacritics removed.

    Code points in U+0618-U+061A and U+064B-U+0653 are deleted; every other
    character is left untouched.
    """
    # No "|" between the two ranges: inside a character class it is a literal
    # pipe, not alternation, and would delete "|" characters from the text.
    regex = "[\u0618-\u061A\u064B-\u0653]" #unicode diacritics
    return re.sub(regex,"",word)

In [38]:
def save_list(aList,name,enc="utf-8",include_index=False):
    """Write two-column data to ``./<name>.csv``.

    ``aList`` is wrapped in a DataFrame; columns 0 and 1 are written under
    the headers "no_diacritics" and "with_diacritics".

    Parameters
    ----------
    aList : data accepted by ``pd.DataFrame`` with at least columns 0 and 1.
    name : file name without directory or extension.
    enc : text encoding of the output file.
    include_index : whether the DataFrame index is written as a column.
    """
    target_path = "./"+name+".csv"
    pd.DataFrame(aList).to_csv(
        path_or_buf=target_path,
        encoding=enc,
        columns=[0,1],
        header=["no_diacritics","with_diacritics"],
        index=include_index,
    )

In [39]:
def check_url(url):
    """Match ``url`` against an http/https/ftp/ftps URL pattern.

    Returns the ``re.Match`` object when the whole string looks like a URL
    (domain name, "localhost" or dotted IPv4 host, optional port, optional
    path/query without whitespace), otherwise None. Matching is
    case-insensitive.
    """
    url_pattern = re.compile(
        r'^(?:http|ftp)s?://' # scheme: http, https, ftp or ftps
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain name
        r'localhost|' # or localhost
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # or dotted IPv4 address
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    return url_pattern.match(url)