In [None]:
import psycopg2
import pandas as pd
from configparser import ConfigParser

def config(filename='prepare_data.ini', section='phonetic'):
    """Load one section of an INI file as a plain ``dict``.

    Args:
        filename: Path of the INI file to parse.
        section: Name of the section whose options are returned
            (defaults to ``'phonetic'``).

    Returns:
        dict mapping every option name in ``section`` to its string value.

    Raises:
        Exception: if ``section`` is not present after parsing. Because
            ``ConfigParser.read`` silently skips files it cannot open, a
            missing file surfaces as this same error.
    """
    ini = ConfigParser()
    ini.read(filename)

    # Guard clause: fail early when the requested section is absent.
    if not ini.has_section(section):
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    return {option: value for option, value in ini.items(section)}

def db_connect():
    """Open a PostgreSQL connection using the parameters from ``config()``.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller
        is responsible for closing it.
    """
    # The config section is passed straight through as keyword arguments.
    connection = psycopg2.connect(**config())
    print('Connected to the PostgreSQL database...')
    return connection

def read_dataframe():
    """Read ``TESTING_DATA.NAME_VECTORS`` into a DataFrame indexed by ``id``.

    Errors raised while querying are printed rather than propagated.

    Returns:
        The loaded DataFrame, or ``None`` when the query failed.
    """
    conn = db_connect()
    result = None
    try:
        query = """
            SELECT * FROM TESTING_DATA.NAME_VECTORS
            """
        result = pd.read_sql(query, con=conn, index_col='id')
    except Exception as error:  # psycopg2.DatabaseError is an Exception subclass
        print(error)
    finally:
        conn.close()
        print('Database connection closed.')

    # Only report a size when something was actually read; on failure
    # ``result`` is None and has no ``shape``.
    if result is not None:
        print("data retrieved from database with size: {size}".format(size= result.shape))
    return result

def store_dataframe(df):
    """Write ``df`` to the database with ``DataFrame.to_sql``.

    Errors raised while writing are printed rather than propagated.

    Args:
        df: DataFrame to store.

    Returns:
        Whatever ``DataFrame.to_sql`` returned, or ``None`` when the write
        failed.
    """
    conn = db_connect()
    result = None   # stays None if to_sql raises, so the return below is always bound
    stored = False
    try:
        # NOTE(review): pandas documents ``con`` as an SQLAlchemy connectable or
        # a sqlite3 connection, and takes the schema through ``schema=`` rather
        # than a dotted table name -- confirm this call targets the same table
        # that read_dataframe() selects from.
        result = df.to_sql("TESTING_DATA.NAME_VECTORS", con=conn, chunksize=20000)
        stored = True
    except Exception as error:  # psycopg2.DatabaseError is an Exception subclass
        print(error)
    finally:
        conn.close()
        print('Database connection closed.')

    # Report the size of what was written; to_sql's return value has no
    # ``shape``, so the frame itself is the only reliable source.
    if stored:
        print("data stored in database with size: {size}".format(size=df.shape))
    return result

In [None]:
import pandas as pd
# Notebook-level caches, created empty here. The helper functions defined in
# the next cell test ``.empty`` on them and load the real data on first use.
df = pd.DataFrame()          # name-variant table, filled by get_*_variants()
eng_gnames = pd.DataFrame()  # filled by rnd_english_given_names()
eng_fnames = pd.DataFrame()  # filled by rnd_english_family_names()
arb_gnames = pd.DataFrame()  # filled by rnd_arabic_given_names()
arb_fnames = pd.DataFrame()  # filled by rnd_arabic_family_names()

In [None]:
import numpy as np

def get_english_variantions(validate = False):
    """Grow the global ``english_total_result`` from ``arabic_total_result``.

    For every Arabic name currently collected, the most frequent English
    variants are appended, then the English list is de-duplicated.

    Args:
        validate: When true, every English name that fails
            ``validate_english_name_by_arabic_variations`` is removed (and
            reported) and the remaining list is sorted in place.
    """
    global arabic_total_result, english_total_result

    # Work on a snapshot so the source list is not iterated while it may change.
    for arabic_name in list(arabic_total_result):
        candidates = get_english_variants(arabic_name)
        english_total_result += get_top_frequency_names(candidates)
    english_total_result = list(set(english_total_result))

    if not validate:
        return

    for english_name in list(english_total_result):
        if validate_english_name_by_arabic_variations(english_name, arabic_total_result):
            continue
        print("removing name: {name}".format(name=english_name))
        english_total_result.remove(english_name)
    english_total_result.sort()
        
                
def get_arabic_variantions(validate = False):
    """Grow the global ``arabic_total_result`` from ``english_total_result``.

    For every English name currently collected, the most frequent Arabic
    variants are appended, then the Arabic list is de-duplicated.

    Args:
        validate: When true, every Arabic name that fails
            ``validate_arabic_name_by_english_variations`` is removed (and
            reported) and the remaining list is sorted in place.
    """
    global arabic_total_result, english_total_result

    # Work on a snapshot so the source list is not iterated while it may change.
    for english_name in list(english_total_result):
        candidates = get_arabic_variants(english_name)
        arabic_total_result += get_top_frequency_names(candidates)
    arabic_total_result = list(set(arabic_total_result))

    if not validate:
        return

    for arabic_name in list(arabic_total_result):
        if validate_arabic_name_by_english_variations(arabic_name, english_total_result):
            continue
        print("removing name: {name}".format(name=arabic_name))
        arabic_total_result.remove(arabic_name)
    arabic_total_result.sort()
             
            
def get_english_variants(arabic_name):
    """Return ``{english spelling: summed count}`` for one Arabic name.

    Rows of the global ``df`` whose ``'arb'`` column equals ``arabic_name``
    are grouped by ``'eng'`` and their ``'count'`` values summed. ``df`` is
    loaded through ``read_dataframe()`` the first time it is found empty.
    """
    global df

    if df.empty:
        df = read_dataframe()

    matching_rows = df.loc[df['arb'] == arabic_name, ['eng', 'count']]
    return matching_rows.groupby('eng').sum().to_dict()['count']

def get_arabic_variants(english_name):
    """Return ``{arabic spelling: summed count}`` for one English name.

    Rows of the global ``df`` whose ``'eng'`` column equals ``english_name``
    are grouped by ``'arb'`` and their ``'count'`` values summed. ``df`` is
    loaded through ``read_dataframe()`` the first time it is found empty.
    """
    global df

    if df.empty:
        df = read_dataframe()

    matching_rows = df.loc[df['eng'] == english_name, ['arb', 'count']]
    return matching_rows.groupby('arb').sum().to_dict()['count']

def validate_arabic_name_by_english_variations(arabic_name, valid_english_variations):
    """Decide whether an Arabic name is backed by accepted English spellings.

    Args:
        arabic_name: Arabic name to check.
        valid_english_variations: Container of English spellings regarded as
            valid.

    Returns:
        ``False`` when the name's English variants total fewer than 3
        occurrences, or fewer than 3 of them belong to valid spellings.
        Otherwise ``True`` when valid occurrences outnumber invalid ones or
        exceed 100, else ``False``.
    """
    english_variations = get_english_variants(arabic_name)

    if sum(english_variations.values()) < 3:
        return False

    valid_count = sum(count for spelling, count in english_variations.items()
                      if spelling in valid_english_variations)
    invalid_count = sum(count for spelling, count in english_variations.items()
                        if spelling not in valid_english_variations)

    if valid_count < 3:
        return False

    return bool(valid_count > invalid_count or valid_count > 100)
    
def validate_english_name_by_arabic_variations(english_name, valid_arabic_variations):
    """Decide whether an English name is backed by accepted Arabic spellings.

    Args:
        english_name: English name to check.
        valid_arabic_variations: Container of Arabic spellings regarded as
            valid.

    Returns:
        ``False`` when the name's Arabic variants total fewer than 3
        occurrences, or fewer than 3 of them belong to valid spellings.
        Otherwise ``True`` when valid occurrences outnumber invalid ones or
        exceed 100, else ``False``.
    """
    arabic_variations = get_arabic_variants(english_name)

    if sum(arabic_variations.values()) < 3:
        return False

    valid_count = sum(count for spelling, count in arabic_variations.items()
                      if spelling in valid_arabic_variations)
    invalid_count = sum(count for spelling, count in arabic_variations.items()
                        if spelling not in valid_arabic_variations)

    if valid_count < 3:
        return False

    return bool(valid_count > invalid_count or valid_count > 100)

def get_top_frequency_names(list):
    """Pick the dominant spellings out of a ``{name: count}`` mapping.

    A name is "matched" when it is longer than 2 characters, is not in the
    global ``top_noise_data``, and either exceeds a percentage-of-total
    threshold or occurs at least 100 times. The threshold is 1% when the
    total exceeds 6561 (= 9**4) and ``10 - total ** 0.25`` otherwise, so it
    falls from about 9% to 1% as the total grows.

    The result is then widened to every key that contains a matched name and
    is shorter than twice that name's length, unless that would add more
    than 3 names, in which case only the matched names are returned.

    Args:
        list: Mapping of name to occurrence count. (The parameter shadows the
            ``list`` builtin; the name is kept so existing callers still work.)

    Returns:
        A Python list of selected names; empty when the mapping is empty or
        its counts do not sum to a positive number.
    """
    # Nothing to rank: an empty mapping used to crash in max(), and a zero
    # total used to divide by zero below.
    if not list:
        return []
    total = sum(list.values())
    if total <= 0:
        return []

    lower_accepted_frequency = 100
    if total > 6561:
        threshold = 1
    else:
        threshold = 10 - total**(1./4.)

    matched_list = [key for key, val in list.items()
                    if len(key) > 2 and
                    key not in top_noise_data and
                    (val / total * 100 > threshold or val >= lower_accepted_frequency)]

    # Also accept keys that embed a matched name without being much longer.
    matched_list_with_composite = [key for key in list
                                   if any(match in key and len(key) < len(match) * 2 for match in matched_list)]

    # Too many composites suggests noise; fall back to the strict matches.
    if len(matched_list_with_composite) - len(matched_list) > 3:
        return matched_list

    return matched_list_with_composite

def get_random_names(names, number=20):
    """Draw ``number`` names from ``names``, weighted toward the first rows.

    Row labels are drawn with ``np.random.randint`` from three bands: half of
    the draws from the first 1% of rows, 30% from the rows up to the halfway
    point, and 20% from the rest. Draws are returned in that band order.

    Args:
        names: DataFrame with a ``'name'`` column; rows are looked up with
            ``.loc`` using the drawn integers as labels.
        number: How many names to return.

    Returns:
        A list of ``number`` values taken from the ``'name'`` column.
    """
    row_count = len(names)
    top_band_end = int(np.floor(row_count * 0.01))
    middle_band_end = int(np.floor(row_count * 0.5))

    drawn = []
    drawn.extend(np.random.randint(low=0, high=top_band_end,
                                   size=int(np.ceil(number * 0.5))))
    drawn.extend(np.random.randint(low=top_band_end + 1, high=middle_band_end,
                                   size=int(np.ceil(number * 0.3))))
    drawn.extend(np.random.randint(low=middle_band_end + 1, high=row_count,
                                   size=int(np.ceil(number * 0.2))))

    # The ceil() calls may over-draw; only the first ``number`` picks are used.
    return [names.loc[drawn[position], 'name'] for position in range(number)]
    
def rnd_english_given_names(number=20):
    """Return ``number`` random English given names.

    The global ``eng_gnames`` frame is loaded with ``read_eng_given_names()``
    the first time it is found empty.
    """
    global eng_gnames

    eng_gnames = read_eng_given_names() if eng_gnames.empty else eng_gnames
    return get_random_names(eng_gnames, number)

def rnd_arabic_given_names(number=20):
    """Return ``number`` random Arabic given names.

    The global ``arb_gnames`` frame is loaded with ``read_arb_given_names()``
    the first time it is found empty.
    """
    global arb_gnames

    arb_gnames = read_arb_given_names() if arb_gnames.empty else arb_gnames
    return get_random_names(arb_gnames, number)

def rnd_english_family_names(number=20):
    """Return ``number`` random English family names.

    The global ``eng_fnames`` frame is loaded with ``read_eng_family_names()``
    the first time it is found empty.
    """
    global eng_fnames

    eng_fnames = read_eng_family_names() if eng_fnames.empty else eng_fnames
    return get_random_names(eng_fnames, number)

def rnd_arabic_family_names(number=20):
    """Return ``number`` random Arabic family names.

    The global ``arb_fnames`` frame is loaded with ``read_arb_family_names()``
    the first time it is found empty.
    """
    global arb_fnames

    arb_fnames = read_arb_family_names() if arb_fnames.empty else arb_fnames
    return get_random_names(arb_fnames, number)

In [None]:
# Smoke-test the database settings, then release the connection:
# read_dataframe() opens and closes its own, so leaving this one open
# would leak it for the lifetime of the kernel.
con = db_connect()
try:
    print(con)
finally:
    con.close()
df = read_dataframe()
print(df)

In [None]:
%inline matplotlib

import networkx as nx
import matplotlib.pyplot as plt


def draw_graph(graph):
    """Draw an undirected graph described by a sequence of node pairs.

    Args:
        graph: Sequence of 2-item edges ``(node_a, node_b)``.

    The graph is laid out with ``nx.shell_layout`` and shown with
    ``plt.show()``; nothing is returned.
    """
    # Collect every endpoint that appears on either side of an edge.
    sources = [n1 for n1, n2 in graph]
    targets = [n2 for n1, n2 in graph]

    G = nx.Graph()
    G.add_nodes_from(set(sources + targets))
    G.add_edges_from((edge[0], edge[1]) for edge in graph)

    nx.draw(G, nx.shell_layout(G))
    plt.show()

# Example: draw a six-node cycle 20 - 21 - 22 - 23 - 24 - 25 - 20.
graph = [(20, 21),(21, 22),(22, 23), (23, 24),(24, 25), (25, 20)]
draw_graph(graph)

In [None]:
%inline matplotlib
import matplotlib.pyplot as plt

# Colour value for each node type stored in the node attribute 't'.
val_map = {'e': 1.0,
           'a': 0.1}


# NOTE(review): this cell uses ``nx`` and ``G`` without defining them. In the
# visible cells ``G`` only exists as a local variable inside functions, so the
# cell depends on kernel state from elsewhere -- confirm where ``G`` comes from.
attr_dict = nx.get_node_attributes(G,'t')
# One colour per node that carries a 't' attribute, in attribute-dict order.
values = [val_map[attr_dict[node]] for node in nx.get_node_attributes(G,'t')]

pos=nx.spring_layout(G) # positions for all nodes
nx.draw(G,pos=pos, node_color=values)
nx.draw_networkx_labels(G,pos=pos)
nx.draw_networkx_edge_labels(G,pos=pos)
plt.show()

In [None]:
import psycopg2
from configparser import ConfigParser
import random
import networkx as nx
import pandas as pd

def config(filename='prepare_data.ini', section='phonetic'):
    """Load one section of an INI file as a plain ``dict``.

    Args:
        filename: Path of the INI file to parse.
        section: Name of the section whose options are returned
            (defaults to ``'phonetic'``).

    Returns:
        dict mapping every option name in ``section`` to its string value.

    Raises:
        Exception: if ``section`` is not present after parsing. Because
            ``ConfigParser.read`` silently skips files it cannot open, a
            missing file surfaces as this same error.
    """
    ini = ConfigParser()
    ini.read(filename)

    # Guard clause: fail early when the requested section is absent.
    if not ini.has_section(section):
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    return {option: value for option, value in ini.items(section)}

def db_connect():
    """Open a PostgreSQL connection using the parameters from ``config()``.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller
        is responsible for closing it.
    """
    # The config section is passed straight through as keyword arguments.
    connection = psycopg2.connect(**config())
    print('Connected to the PostgreSQL database...')
    return connection

def read_dataframe(query):
    """Run ``query`` on a fresh connection and return the rows as a DataFrame.

    Args:
        query: SQL text handed to ``pd.read_sql``.

    Returns:
        The resulting DataFrame, or ``None`` when reading failed (the error
        is printed, not raised). The connection is closed either way.
    """
    connection = db_connect()
    try:
        return pd.read_sql(query, con=connection)
    except Exception as problem:  # psycopg2.DatabaseError is an Exception subclass
        print(problem)
        return None
    finally:
        if connection is not None:
            connection.close()
            print('Database connection closed.')


def generate_graph(query, pickle_file):
    """Build a graph over the names returned by ``query`` and pickle it.

    Every ordered pair of rows (a row paired with itself included) is
    visited; when the two ``'gn'`` values are not yet connected, an edge
    with a ``random.random()`` weight is added. The graph is then written to
    ``pickle_file`` with ``nx.write_gpickle``.

    Args:
        query: SQL text; the result must expose a ``'gn'`` column.
        pickle_file: Destination path for the pickled graph.
    """
    frame = read_dataframe(query)
    name_graph = nx.Graph()

    print(frame)

    for _, left_row in frame.iterrows():
        left_name = left_row['gn']
        for _, right_row in frame.iterrows():
            right_name = right_row['gn']
            if name_graph.has_edge(left_name, right_name):
                continue
            name_graph.add_edge(left_name, right_name, weight=random.random())

    nx.write_gpickle(name_graph, pickle_file)

def generate_given_names_graph():
    """Build and pickle the graph for up to 10000 distinct English given names.

    The names come from ``testing_data.GIVEN_NAMES_MASTER`` (non-null,
    non-empty ``ENG`` values), and the graph is written to a fixed absolute
    path by ``generate_graph``.
    """
    query = """
            SELECT DISTINCT(ENG) AS GN FROM testing_data.GIVEN_NAMES_MASTER
            WHERE ENG IS NOT NULL AND ENG != ''
            LIMIT 10000
            """
            
    # NOTE(review): hard-coded absolute path; read_given_names_graph() must
    # use the same one.
    generate_graph(query, "/home/jupyter/notebooks/PoC/data-preparation/pickle/test_gn_similarity_graph.gpickle")

def read_given_names_graph():
    """Load the graph pickled by ``generate_given_names_graph`` from its fixed path."""
    # NOTE(review): presumably pickle-based, so only load files this notebook wrote.
    return nx.read_gpickle("/home/jupyter/notebooks/PoC/data-preparation/pickle/test_gn_similarity_graph.gpickle")    

In [None]:
# Build and pickle the given-names graph (nested loop over every pair of rows).
generate_given_names_graph()

In [None]:
from scipy.sparse import lil_matrix
from scipy.sparse.linalg import spsolve
from numpy.linalg import solve, norm
from numpy.random import rand
import random
import psycopg2
from configparser import ConfigParser
import pandas as pd


def config(filename='prepare_data.ini', section='phonetic'):
    """Load one section of an INI file as a plain ``dict``.

    Args:
        filename: Path of the INI file to parse.
        section: Name of the section whose options are returned
            (defaults to ``'phonetic'``).

    Returns:
        dict mapping every option name in ``section`` to its string value.

    Raises:
        Exception: if ``section`` is not present after parsing. Because
            ``ConfigParser.read`` silently skips files it cannot open, a
            missing file surfaces as this same error.
    """
    ini = ConfigParser()
    ini.read(filename)

    # Guard clause: fail early when the requested section is absent.
    if not ini.has_section(section):
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    return {option: value for option, value in ini.items(section)}

def db_connect():
    """Open a PostgreSQL connection using the parameters from ``config()``.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller
        is responsible for closing it.
    """
    # The config section is passed straight through as keyword arguments.
    connection = psycopg2.connect(**config())
    print('Connected to the PostgreSQL database...')
    return connection

def read_dataframe(query):
    """Run ``query`` on a fresh connection and return the rows as a DataFrame.

    Args:
        query: SQL text handed to ``pd.read_sql``.

    Returns:
        The resulting DataFrame, or ``None`` when reading failed (the error
        is printed, not raised). The connection is closed either way.
    """
    connection = db_connect()
    try:
        return pd.read_sql(query, con=connection)
    except Exception as problem:  # psycopg2.DatabaseError is an Exception subclass
        print(problem)
        return None
    finally:
        if connection is not None:
            connection.close()
            print('Database connection closed.')


def generate_mx(query, pickle_file):
    """Build a sparse matrix of random placeholder similarities.

    One ``random.random()`` value is drawn for every ordered pair of rows
    returned by ``query``; values of at least 0.6 are stored at
    ``[row, column]`` and everything else stays zero.

    Args:
        query: SQL text selecting the names to compare.
        pickle_file: Accepted for signature compatibility; nothing is
            written to it.

    Returns:
        An ``n x n`` ``scipy.sparse.lil_matrix`` where ``n`` is the number
        of rows read, or ``None`` when the query failed. (Previously the
        matrix was built and then discarded.)
    """
    df = read_dataframe(query)
    if df is None:
        return None

    # Size the matrix from the data instead of assuming exactly 10000 rows.
    size = len(df)
    A = lil_matrix((size, size))

    for row in range(size):
        for col in range(size):
            similarity = random.random()

            if similarity >= 0.6:
                A[row, col] = similarity

    return A

def generate_given_names_mx():
    """Run ``generate_mx`` over up to 10000 distinct English given names.

    The names come from ``testing_data.GIVEN_NAMES_MASTER`` (non-null,
    non-empty ``ENG`` values).
    """
    query = """
            SELECT DISTINCT(ENG) AS GN FROM testing_data.GIVEN_NAMES_MASTER
            WHERE ENG IS NOT NULL AND ENG != ''
            LIMIT 10000
            """
            
    # NOTE(review): the path passed here is the same .gpickle file that the
    # graph cell writes -- confirm that is intended.
    generate_mx(query, "/home/jupyter/notebooks/PoC/data-preparation/pickle/test_gn_similarity_graph.gpickle")

In [None]:
# Build the placeholder similarity matrix (nested loop over every pair of rows).
generate_given_names_mx()

In [None]:
import numpy as np

def cos_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as ``dot(a, b)`` divided by the product of the two Euclidean
    norms.
    """
    magnitude_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / magnitude_product

# Three sample count vectors: sentence_h is exactly 2 * sentence_m, while
# sentence_w overlaps sentence_m in a single position.
sentence_m = np.array([1, 1, 1, 1, 0, 2, 0, 0, 0]) 
sentence_h = np.array([2, 2, 2, 2, 0, 4, 0, 0, 0])
sentence_w = np.array([0, 0, 0, 1, 3, 0, 1, 1, 4])

print(cos_sim(sentence_m, sentence_h))  # parallel vectors
print(cos_sim(sentence_m, sentence_w)) 

In [None]:
# Show the first sample vector defined in the previous cell.
print(sentence_m)

In [None]:
# Random 8-element query vector with integer values 0..9, stored as float32.
# No seed is set here, so the values change on every run.
v = np.random.randint(10, size=8).astype(dtype=np.float32)
print(np.array2string(v, precision=0, separator=','))

In [None]:
# read_dataframe() was redefined in the cells above to require the SQL text,
# so calling it without arguments raises TypeError on a top-to-bottom run.
# The query matches the one the first definition had built in.
name_vector = read_dataframe("SELECT * FROM TESTING_DATA.NAME_VECTORS")

In [None]:
# Dump the loaded table.
print(name_vector)

In [None]:
# Parse the textual '[1,2,3]' vectors into integer arrays. Assigning to the
# row yielded by iterrows() is not guaranteed to write back to the frame, so
# the column is replaced as a whole. Values that are no longer strings are
# left untouched, which keeps the cell safe to re-run.
name_vector['vector'] = name_vector['vector'].apply(
    lambda raw: np.fromstring(raw.strip('[]'), dtype=int, sep=',')
    if isinstance(raw, str) else raw
)

In [None]:
%%time

# Linear scan: print every stored vector whose cosine similarity to the query
# vector ``v`` exceeds 0.99.
# NOTE(review): assumes each row['vector'] has the same length as ``v`` -- confirm.
for index, row in name_vector.iterrows():
    cos = cos_sim(v, row['vector'])
    if cos > 0.99:
        print("{a} : {b} -> {c}".format(a=v, b=row['vector'], c=cos))

In [None]:
# Stack all vectors into one 2-D array in a single call; calling np.vstack
# inside the loop re-copied the whole array for every row.
i = 0
vector_rows = name_vector['vector'].tolist()
dataset = np.vstack(vector_rows) if vector_rows else np.array([[]])

In [None]:
# Convert to float32; the LSH cell below asserts this dtype.
dataset = dataset.astype(dtype=np.float32)
print(dataset[0])

In [None]:
import copy
# Keep an untouched copy: the LSH cell shuffles and centres ``dataset`` in
# place and restores it from ``bck`` at its start.
bck = copy.deepcopy(dataset)
print(bck[0])

In [None]:
from __future__ import print_function
import numpy as np
import falconn
import timeit
import math

# FALCONN benchmark, part 1: hold out queries, compute reference answers by
# linear scan, centre the data and build the LSH index.
# Relies on ``copy`` and ``bck`` from the previous cell.
dataset = copy.deepcopy(bck)

number_of_queries = 1000
# we build only 50 tables, increasing this quantity will improve the query time
# at a cost of slower preprocessing and larger memory footprint, feel free to
# play with this number
number_of_tables = 50



# It's important not to use doubles, unless they are strictly necessary.
# If your dataset consists of doubles, convert it to floats using `astype`.
assert dataset.dtype == np.float32

# Choose random data points to be queries: shuffle in place with a fixed seed,
# then split off the last `number_of_queries` rows.
print('Generating queries')
np.random.seed(4057218)
np.random.shuffle(dataset)
queries = dataset[len(dataset) - number_of_queries:]
dataset = dataset[:len(dataset) - number_of_queries]
print('Done')

# Perform linear scan using NumPy to get answers to the queries.
# NOTE(review): the reference answer is the row with the largest dot product,
# while the index below uses squared Euclidean distance. The two only agree
# for unit-length rows, and no normalisation is visible here -- confirm.
print('Solving queries using linear scan')
t1 = timeit.default_timer()
answers = []
for query in queries:
    answers.append(np.dot(dataset, query).argmax())
t2 = timeit.default_timer()
print('Done')
print('Linear scan time: {} per query'.format((t2 - t1) / float(
    len(queries))))

# Center the dataset and the queries: this improves the performance of LSH quite a bit.
print('Centering the dataset and queries')
center = np.mean(dataset, axis=0)
dataset -= center
queries -= center
print('Done')

params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = len(dataset[0])
params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
params_cp.l = number_of_tables
# we set one rotation, since the data is dense enough,
# for sparse data set it to 2
params_cp.num_rotations = 1
params_cp.seed = 5721840
# we want to use all the available threads to set up
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# we build 18-bit hashes so that each table has
# 2^18 bins; this is a good choice when 2^18 is of the same
# order of magnitude as the number of data points
# (NOTE(review): confirm against the size of this dataset)
falconn.compute_number_of_hash_functions(18, params_cp)

print('Constructing the LSH table')
t1 = timeit.default_timer()
table = falconn.LSHIndex(params_cp)
table.setup(dataset)
t2 = timeit.default_timer()
print('Done')
print('Construction time: {}'.format(t2 - t1))

query_object = table.construct_query_object()

# find the smallest number of probes to achieve accuracy 0.9
# using the binary search
print('Choosing number of probes')
number_of_probes = number_of_tables

def evaluate_number_of_probes(number_of_probes):
    """Return the fraction of queries whose reference answer is an LSH candidate.

    Sets the probe count on the global ``query_object``, then checks for each
    entry of the global ``queries`` whether the matching entry of ``answers``
    appears in ``get_candidates_with_duplicates``.

    Args:
        number_of_probes: Probe count to evaluate.

    Returns:
        Accuracy as a float between 0 and 1.
    """
    query_object.set_num_probes(number_of_probes)
    hits = sum(
        1
        for position, query in enumerate(queries)
        if answers[position] in query_object.get_candidates_with_duplicates(query)
    )
    return float(hits) / len(queries)

# Phase 1: double the probe count until accuracy reaches 0.9.
# NOTE(review): this loop has no upper bound; it never ends if 0.9 is unreachable.
while True:
    accuracy = evaluate_number_of_probes(number_of_probes)
    print('{} -> {}'.format(number_of_probes, accuracy))
    if accuracy >= 0.9:
        break
    number_of_probes = number_of_probes * 2
# Phase 2: binary search between the last failing and the first passing count.
if number_of_probes > number_of_tables:
    left = number_of_probes // 2
    right = number_of_probes
    while right - left > 1:
        number_of_probes = (left + right) // 2
        accuracy = evaluate_number_of_probes(number_of_probes)
        print('{} -> {}'.format(number_of_probes, accuracy))
        if accuracy >= 0.9:
            right = number_of_probes
        else:
            left = number_of_probes
    number_of_probes = right
print('Done')
print('{} probes'.format(number_of_probes))

# final evaluation: time find_nearest_neighbor and count exact matches with
# the linear-scan answers.
# NOTE(review): the probe count on query_object is whatever the last
# evaluate_number_of_probes() call set, which may differ from the
# `number_of_probes` printed above -- confirm.
t1 = timeit.default_timer()
score = 0
for (i, query) in enumerate(queries):
    if query_object.find_nearest_neighbor(query) == answers[i]:
        score += 1
t2 = timeit.default_timer()

print('Query time: {}'.format((t2 - t1) / len(queries)))
print('Precision: {}'.format(float(score) / len(queries)))

In [None]:
%%time
from scipy.spatial import distance

# Linear scan for the largest cosine distance between ``v`` and any row of
# ``dataset``.
# NOTE(review): assumes every dataset row has the same length as ``v`` -- confirm.
print(v)
print(dataset[1])

dist = 0  # largest distance seen so far
for u in dataset:
    d = distance.cosine(v, u)
    #d = distance.euclidean(v, u)
    if (d > dist):
        dist = d
    
print(dist)

In [None]:
# Dot product of every dataset row with ``v`` in a single call.
np.dot(dataset, v)

In [5]:
from __future__ import print_function
import numpy as np
import falconn
import timeit
import math
import copy

# Build a FALCONN index over ``random_vectors``.
# NOTE(review): ``random_vectors`` and the ``query`` used after this block are
# only defined in cells further down the notebook, so this cell fails on a
# fresh top-to-bottom run.
#dataset = copy.deepcopy(bck)
dataset = copy.deepcopy(random_vectors)
dataset = dataset.astype(dtype=np.float32)

# we build only 50 tables, increasing this quantity will improve the query time
# at a cost of slower preprocessing and larger memory footprint, feel free to
# play with this number
number_of_tables = 50

# It's important not to use doubles, unless they are strictly necessary.
# If your dataset consists of doubles, convert it to floats using `astype`.
assert dataset.dtype == np.float32


params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = len(dataset[0])
params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
params_cp.l = number_of_tables
# two rotations are used here (the note this was copied from suggests one
# rotation for dense data and 2 for sparse data)
params_cp.num_rotations = 2
params_cp.seed = 5721840
# we want to use all the available threads to set up
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# we build 24-bit hashes so that each table has
# 2^24 bins; this is a good choice when 2^24 is of the same
# order of magnitude as the number of data points
falconn.compute_number_of_hash_functions(24, params_cp)

print('Constructing the LSH table')
t1 = timeit.default_timer()
table = falconn.LSHIndex(params_cp)
table.setup(dataset)
t2 = timeit.default_timer()
print('Done')
print('Construction time: {}'.format(t2 - t1))

query_object = table.construct_query_object()

# NOTE(review): this value is only printed; set_num_probes() is never called
# on query_object in this cell.
number_of_probes = number_of_tables
#number_of_probes = 10

print('Done')
print('{} probes'.format(number_of_probes))

# final evaluation: time only the lookup itself. The clock is stopped before
# printing so that formatting the 60-value vectors is not counted as query time.
t1 = timeit.default_timer()
response = query_object.find_nearest_neighbor(query)
t2 = timeit.default_timer()
print("query:            {q}".format(q=query))
print("nearest_neighbor: {nn}".format(nn=dataset[response]))

print('Query time: {}'.format((t2 - t1)))

Constructing the LSH table
Done
Construction time: 388.4927228866145
Done
50 probes
query:            [ 0.63783312  0.00409418  0.50211561  0.30761099  0.22072244 -0.37277198
 -0.11898448  0.75035644  0.78763163 -0.82036281 -0.36739284 -0.6848284
  0.91254425 -0.79613221  0.96386838 -0.78057832  0.30549878 -0.69954884
  0.18261124  0.2515724  -0.95707363  0.73961365 -0.91854197 -0.80992085
  0.32259238 -0.91326624 -0.67281902  0.14103541  0.91712958 -0.36514041
  0.27035442 -0.94641179 -0.75013936  0.84753209 -0.49573171  0.54618055
  0.97861308 -0.81299049  0.89324182  0.60030675 -0.36409688 -0.48318368
 -0.44730324 -0.3910405   0.09788305 -0.59802473 -0.17750613  0.19272159
 -0.63840258  0.19285232 -0.2698209  -0.47305384  0.66874039  0.14557534
 -0.85832685 -0.92510468  0.44801518  0.145027   -0.3924512   0.16517544]
nearest_neighbor: [ 0.28347668 -0.34237945  0.24419194 -0.22370352  0.22561137  0.73121858
 -0.15567179 -0.10226461  0.36341268 -0.90843171 -0.78412026  0.66123503
  0.

In [33]:
# Random 60-element float32 query with components drawn uniformly from [-1, 1).
# No seed is set, so the query differs on every run.
query = 2* np.random.random_sample(60) -1
query = query.astype(dtype=np.float32)
#dataset -= np.mean(dataset, axis=0)

In [35]:
# final evaluation: time only the lookup itself; the clock is stopped before
# printing so that output formatting is not counted as query time.
t1 = timeit.default_timer()
response = query_object.find_nearest_neighbor(query)
t2 = timeit.default_timer()
print("query:            {q}".format(q=query))
#print("nearest_neighbor: {nn}".format(nn=dataset[response]))
print(response)

print('Query time: {}'.format((t2 - t1)))

query:            [-0.90300858 -0.5539453  -0.47259071 -0.51987958  0.72921056  0.03061111
  0.899382    0.02109051 -0.48245111  0.80966711  0.2980493  -0.92306358
 -0.31926629  0.35113055 -0.18029171  0.21239281  0.38495556 -0.93474603
 -0.56777173  0.72636497  0.94998837 -0.24052756  0.11417535 -0.97955108
  0.26304597 -0.85263938 -0.74149936  0.49073747 -0.44872081  0.58801275
  0.79450661 -0.42468438  0.04616422 -0.69894981 -0.06934123 -0.20130967
 -0.38877609 -0.51590067  0.60183126 -0.57862329 -0.54407114  0.13731219
  0.54318547  0.65324581  0.95106143  0.55723077  0.69782883  0.92653692
 -0.47858879 -0.10922093  0.28426006 -0.17109036  0.78654563  0.13084193
  0.54457498  0.63398337 -0.36922461 -0.25434035 -0.43515331 -0.88885134]
11411745
Query time: 0.004988847300410271


In [37]:
# final evaluation: fetch near neighbours of ``query`` with threshold 20 and
# time only the lookup; the clock is stopped before printing.
t1 = timeit.default_timer()
response = query_object.find_near_neighbors(query, 20)
t2 = timeit.default_timer()
print("query:          {q}".format(q=query))
#print("near neighbors: {nn}".format(nn=dataset[response]))
print(response)

print('Query time: {}'.format((t2 - t1)))
print(len(dataset))

query:          [-0.90300858 -0.5539453  -0.47259071 -0.51987958  0.72921056  0.03061111
  0.899382    0.02109051 -0.48245111  0.80966711  0.2980493  -0.92306358
 -0.31926629  0.35113055 -0.18029171  0.21239281  0.38495556 -0.93474603
 -0.56777173  0.72636497  0.94998837 -0.24052756  0.11417535 -0.97955108
  0.26304597 -0.85263938 -0.74149936  0.49073747 -0.44872081  0.58801275
  0.79450661 -0.42468438  0.04616422 -0.69894981 -0.06934123 -0.20130967
 -0.38877609 -0.51590067  0.60183126 -0.57862329 -0.54407114  0.13731219
  0.54318547  0.65324581  0.95106143  0.55723077  0.69782883  0.92653692
 -0.47858879 -0.10922093  0.28426006 -0.17109036  0.78654563  0.13084193
  0.54457498  0.63398337 -0.36922461 -0.25434035 -0.43515331 -0.88885134]
[15962089, 18412310, 20850940, 11451692, 17743573, 19443808, 11411745, 22993825, 2340536, 21709933, 21169797, 23372709, 23842807, 16176509, 21263416, 12875144, 21722886, 1087897]
Query time: 0.008274488151073456
25000000


In [28]:
# Number of rows currently in ``dataset``.
print(len(dataset))

25000000


In [29]:
import numpy as np
import copy


# 25 random 60-element float32 vectors with components in [-1, 1).
# NOTE(review): the recorded outputs report 25000000 rows and the pickle file
# is named "25M", so the row count here looks reduced -- confirm. No seed is set.
random_vectors = 2 * np.random.random_sample((25, 60)) - 1 
random_vectors = random_vectors.astype(dtype=np.float32)

In [None]:
# Number of generated vectors.
print(len(random_vectors))

In [None]:
# Number of generated vectors (duplicate of the previous cell).
print(len(random_vectors))

In [None]:
# Work on a copy so ``random_vectors`` itself is left unchanged.
dataset = copy.deepcopy(random_vectors)
print(len(dataset))

In [2]:
import pickle
# Persist the vectors. The ``with`` block guarantees the file is flushed and
# closed; the bare open() used before left the handle open.
with open("pickle/random_vectors_25M.pickle", "wb") as pickle_out:
    pickle.dump(random_vectors, pickle_out, protocol=4)
#random_vectors = pickle.load( open("pickle/random_vectors.pickle", "rb" ) )

In [49]:
import json
import requests

# GET /queryscotches
# Prints the JSON encoding of a plain string (the text wrapped in double quotes).
print(json.dumps("aaa"))

"aaa"


In [15]:
import pandas as pd

# Load the two transliteration tables from tab-separated files. Only the first
# three columns are read, and the first of them becomes the index.
# NOTE(review): hard-coded absolute paths.
adf = pd.read_csv(
    '/home/jupyter/notebooks/PoC/data-preparation/output/transliteration_datasets/arb_positive_trans.tsv', 
    delimiter='\t', 
    index_col=0,
    usecols=[0,1,2])
edf = pd.read_csv(
    '/home/jupyter/notebooks/PoC/data-preparation/output/transliteration_datasets/eng_positive_trans.tsv',
    delimiter='\t', 
    index_col=0,
    usecols=[0,1,2])

In [50]:
# Write a dummy row under the label len(edf).
# NOTE(review): this mutates ``edf`` in place, so with a 0..n-1 index every
# re-run of the cell adds another 'test' row.
edf.loc[len(edf)] = ['test', 'test']
edf

Unnamed: 0,eng,arb
0,MOHAMMED,محمد
1,OMAR,عمر
2,HASSAN,حسن
3,MARIAM,مريم
4,IBRAHIM,ابرهيم
5,MARIA,مارية
6,SALEH,صالح
7,ALI,علي
8,KHALED,خالد
9,KAMAL,كمال


In [46]:
# NOTE(review): the recorded output of this cell is a TypeError about the
# ``protocol`` keyword. The sibling cell for ``edf`` passes the path
# positionally -- confirm which form the installed pandas accepts.
adf.to_pickle(protocol=2, path='ae.pkl')

TypeError: to_pickle() got an unexpected keyword argument 'protocol'

In [28]:
# Save the English-keyed table to 'ea.pkl' using pickle protocol 2.
edf.to_pickle('ea.pkl', protocol=2)

In [44]:
# Reload both tables from the pickles written above.
adf = pd.read_pickle('ae.pkl')
edf = pd.read_pickle('ea.pkl')

In [39]:
# Look up an English spelling; the recorded output has no rows for 'OMASR'.
edf.loc[edf['eng'] == 'OMASR']

Unnamed: 0,eng,arb


In [38]:
# Look up an Arabic spelling; the recorded output shows one row mapping it to SALEH.
adf.loc[adf['arb'] == 'صالح']

Unnamed: 0,arb,eng
6,صالح,SALEH


In [40]:
# UTF-8 byte values of the name, as a list of ints.
list("HAMZA".encode('utf-8'))

[72, 65, 77, 90, 65]

In [43]:
# Build a one-item batch of byte-level tokens and print each token list.
batch_tokens = []
batch_tokens.append(list("HAMZA".encode('utf-8')))

for tokens in batch_tokens:
    print(tokens)


[72, 65, 77, 90, 65]


In [57]:
import io
# Default buffer size, in bytes, used by the io module's buffered streams.
print (io.DEFAULT_BUFFER_SIZE)


8192
