In [178]:
import bs4 
import requests as req
import re
import pandas as pd
import numpy as np
import datetime

In [None]:
# NOTE: generate_arabic_sentences and its helpers are defined in the cells
# below; those definition cells must have been executed before this one.
# Each call collects sentences of `sentence_length` words and writes them to
# ./<file_name_to_save>.csv.
#generate_arabic_sentences(sentence_length=2,max_sents=10,file_name_to_save="test")
generate_arabic_sentences(sentence_length=3,file_name_to_save="arabic_sentences_3_huge")
generate_arabic_sentences(sentence_length=4,file_name_to_save="arabic_sentences_4_huge")

In [214]:
def generate_arabic_sentences(sentence_length,
                              base_link="http://www.baheth.info/all.jsp?term=",
                              word_list=None,
                              max_sents=1000000,
                              file_name_to_save=None,
                              include_index=False):
    """Scrape sentences for every word in ``word_list`` and save them as CSV.

    For each word a link ``base_link + word`` is built, the page is split into
    sentences of ``sentence_length`` words, and each sentence is stored twice:
    without diacritics and with diacritics.

    Parameters
    ----------
    sentence_length : number of words per sentence.
    base_link : URL prefix to which each word is appended.
    word_list : iterable of search terms. When ``None`` (default) the words are
        fetched from ``http://www.baheth.info/all.jsp?term=عمر`` at call time.
    max_sents : collection stops after the first page that pushes the total
        above this number, so the saved file may contain slightly more.
    file_name_to_save : CSV file name without extension. When ``None``
        (default) ``"sentences" + <current timestamp>`` is used.
    include_index : forwarded to ``save_list``.
    """
    # Defaults are resolved here rather than in the signature: a default
    # expression is evaluated only once, when the ``def`` statement runs.
    # That used to trigger a network request at definition time and froze
    # the timestamp, so repeated calls overwrote the same file.
    if word_list is None:
        word_list = get_words("http://www.baheth.info/all.jsp?term=عمر")
    if file_name_to_save is None:
        file_name_to_save = "sentences"+str(datetime.datetime.now())

    #generate links
    links = generate_links(base_link,word_list)

    #get all sents: row 0 = without diacritics, row 1 = with diacritics
    all_sentences = np.array([[],[]])
    for link in links:
        conc = get_sents_concatenated(link,sentence_length)
        all_sentences = append(all_sentences,conc)
        if(all_sentences.shape[1] > max_sents):
            break

    #save: transpose so each CSV row is one (no_diacritics, with_diacritics) pair
    all_sents = np.transpose(all_sentences)
    save_list(all_sents,file_name_to_save,include_index=include_index)
    

In [110]:
def generate_links(base,terms):
    """Return a list with one link per term, each formed as ``base + term``.

    The order of ``terms`` is preserved.
    """
    return [base + term for term in terms]

In [109]:
def get_words(link):
    """Fetch the Arabic text behind ``link`` and return its cleaned words.

    The text is split on single spaces, filtered through ``clean_words`` and
    finally passed through ``remove_diacritics_from_list``.
    """
    raw_tokens = get_arabic_text(link).split(" ")
    kept_tokens = clean_words(raw_tokens)
    return remove_diacritics_from_list(kept_tokens)

In [183]:
def get_sents_concatenated(link,length=4):
    """Return the sentences of ``link`` paired with their diacritic-free form.

    The result is ``concat(stripped, original)``: the first row holds the
    sentences after ``remove_diacritics_from_list``, the second row holds the
    sentences exactly as returned by ``get_sents``.
    """
    with_diacritics = get_sents(link,length)
    without_diacritics = remove_diacritics_from_list(with_diacritics)
    return concat(without_diacritics,with_diacritics)

In [56]:
def get_sents(link,length=4):
    """Fetch ``link`` and return its sentences of ``length`` words.

    The page text is cut into sentences by ``split_to_sentences`` and then
    filtered by ``clean_sents``.
    """
    page_text = get_arabic_text(link)
    candidates = split_to_sentences(page_text,length)
    return clean_sents(candidates,length)

In [103]:
def concat(list_a,list_b):
    """Build a NumPy array whose first row is ``list_a`` and second is ``list_b``."""
    rows = [list_a,list_b]
    return np.array(rows)

In [155]:
def append(mat_a,mat_b):
    """Extend each of the two rows of ``mat_a`` with the matching row of ``mat_b``.

    Row 0 of both inputs is joined with ``np.append``, likewise row 1, and the
    two resulting rows are combined again through ``concat``.
    """
    first_row, second_row = (np.append(mat_a[row],mat_b[row]) for row in (0, 1))
    return concat(first_row,second_row)

In [31]:
def contains_diacritics(word):
    """Search ``word`` for an Arabic diacritic.

    Returns the ``re.Match`` of the first character in the ranges
    U+0618-U+061A or U+064B-U+0653, or ``None`` when there is none.
    """
    # Inside a character class "|" is a literal pipe, not alternation; the
    # previous pattern therefore treated "|" itself as a diacritic.
    regex = "[\u0618-\u061A\u064B-\u0653]"
    return re.search(regex,word)

In [52]:
def clean_sents(sents,length=4):
    """Keep only sentences with exactly ``length`` words and some diacritic.

    A sentence is kept when splitting it on single spaces yields ``length``
    pieces and ``contains_diacritics`` finds a match in it. Order is preserved.
    """
    return [
        candidate
        for candidate in sents
        # length is checked first; the diacritics test only runs on survivors
        if len(candidate.split(" ")) == length
        and contains_diacritics(candidate) is not None
    ]

In [61]:
def split_to_sentences(text,length=4):
    """Cut ``text`` into consecutive sentences of ``length`` words each.

    The text is split on single spaces and filtered through ``clean_words``;
    the surviving words are then grouped in order, ``length`` at a time, and
    each group is joined with single spaces. The final sentence may be shorter
    when the word count is not a multiple of ``length``. An empty word list
    yields an empty list. ``length`` is expected to be a positive integer.
    """
    words = clean_words(text.split(" "))

    # Slice-based chunking replaces the manual accumulator, which raised
    # IndexError when no words survived cleaning and always lost the last
    # word (the final flush happened before that word was appended).
    return [
        " ".join(words[start:start + length]).strip()
        for start in range(0, len(words), length)
    ]

In [185]:
def clean_words(words):
    """Filter ``words`` down to plausible word tokens.

    A token is kept, unmodified, when its stripped length is between 2 and 14
    characters inclusive and it contains no "|" character.
    """
    def _is_usable(token):
        stripped_len = len(token.strip())
        return 1 < stripped_len < 15 and "|" not in token

    return [token for token in words if _is_usable(token)]

In [2]:
def get_arabic_text(link):
    """Download ``link`` and return only the Arabic text of the response.

    Every character outside the range U+0621-U+0655, other than "|" and the
    space, is removed from the response body; newlines are removed as well.
    Returns an empty string when ``link`` is ``None`` or is rejected by
    ``check_url``.
    """
    # None has to be ruled out first: check_url hands its argument to
    # re.match, which raises TypeError for None.
    if(link is None or not check_url(link)):
        # Callers call .split() on the result, so the "nothing found" value
        # must be a string (it used to be a set, which has no .split).
        return ""

    #request
    res = req.get(link)

    #get arabic words ("." does not match a newline, so those are dropped separately)
    ar_text = re.sub('((?![\u0621-\u0655| ]).)',"",res.text)
    ar_text = ar_text.replace("\n","")
    # The former replace("\s","") was dropped: it looked for a literal
    # backslash followed by "s", which cannot remain after the filter above.

    return ar_text

In [329]:
def cramp_page(link):
    """Download ``link`` and return the set of distinct Arabic tokens on it.

    Every character outside the range U+0621-U+0655, other than "|" and the
    space, is removed from the response body; newlines are removed as well.
    The remaining text is split on single spaces and de-duplicated. Returns an
    empty set when ``link`` is ``None`` or is rejected by ``check_url``.
    """
    # None has to be ruled out first: check_url hands its argument to
    # re.match, which raises TypeError for None.
    if(link is None or not check_url(link)):
        return set()

    #request
    res = req.get(link)

    #get arabic words ("." does not match a newline, so those are dropped separately)
    ar_text = re.sub('((?![\u0621-\u0655| ]).)',"",res.text)
    ar_text = ar_text.replace("\n","")
    # The former replace("\s","") was dropped: it looked for a literal
    # backslash followed by "s", which cannot remain after the filter above.

    return set(ar_text.split(" "))

In [4]:
def get_nested_links(home_link,link):
    """Collect the links on page ``link`` that belong to ``home_link``.

    Parameters
    ----------
    home_link : site prefix. Site-relative hrefs (starting with "/") are
        appended to it; absolute hrefs are kept only if they start with it.
    link : URL of the page to scan.

    Returns
    -------
    A set of links that pass ``check_url``. When ``link`` itself does not pass
    ``check_url`` the function returns ``link`` unchanged (not a set).
    """
    if(not check_url(link)):
        return link

    #request
    res = req.get(link)

    #get page links (set comprehension keeps each href once)
    soup = bs4.BeautifulSoup(res.text)
    hrefs = {a.get('href') for a in soup.find_all('a')}

    valid_links = set()

    for href in hrefs:
        # skip anchors without an href or with an empty one
        if(not href):
            continue

        if(href.startswith('/')):
            # site-relative link: make it absolute
            candidate = home_link + href
        elif(href.startswith(home_link)):
            # absolute link inside the site; this branch used to reference an
            # undefined name ``base_link`` and raised NameError
            candidate = href
        else:
            # external link
            continue

        if(check_url(candidate)):
            valid_links.add(candidate)

    return valid_links

In [69]:
def remove_diacritics_from_list(alist):
    """Return a new list with ``remove_diacritis`` applied to every item of ``alist``."""
    return [remove_diacritis(item) for item in alist]

In [5]:
def remove_diacritis(word):
    """Return ``word`` with Arabic diacritics removed.

    Characters in the ranges U+0618-U+061A and U+064B-U+0653 are deleted;
    everything else is left untouched.
    """
    # Inside a character class "|" is a literal pipe, not alternation; the
    # previous pattern therefore also deleted "|" characters.
    regex = "[\u0618-\u061A\u064B-\u0653]" #unicode diacritics
    return re.sub(regex,"",word)

In [213]:
def save_list(aList,name,enc="utf-8",include_index=False):
    """Write ``aList`` to ``./<name>.csv`` through a pandas DataFrame.

    Columns 0 and 1 of the frame are written under the headers
    ``no_diacritics`` and ``with_diacritics``. ``enc`` is the file encoding and
    ``include_index`` controls whether the row index is written.
    """
    frame = pd.DataFrame(aList)
    target_path = "./"+name+".csv"
    frame.to_csv(
        path_or_buf=target_path,
        encoding=enc,
        columns=[0,1],
        header=["no_diacritics","with_diacritics"],
        index=include_index,
    )

In [7]:
def check_url(url):
    """Match ``url`` against an http(s)/ftp(s) URL pattern.

    Returns the ``re.Match`` object when ``url`` matches, otherwise ``None``.
    Matching is case-insensitive and anchored at both ends of the string.
    """
    url_pattern = re.compile(
        r'^(?:http|ftp)s?://' # scheme: http, https, ftp or ftps
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # dotted domain name...
        r'localhost|' # ...or localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or dotted-quad ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', # optional path / query without whitespace
        re.IGNORECASE,
    )
    return url_pattern.match(url)