In [1]:
import pandas as pd
import numpy as np
from dask import dataframe as dd
import string
import random
import names
from barnum import gen_data
import Levenshtein as lev

# Creating data

In [2]:
def create_data(N=10):
    """Generate ``N`` synthetic contact records with ``gen_data``.

    Parameters
    ----------
    N : int, default 10
        Number of records to generate. A value of zero or less yields
        three empty lists.

    Returns
    -------
    tuple of (list, list, list)
        ``(name, company, email)``: three parallel lists of length ``N``.
        Each email comes from ``gen_data.create_email()`` called without
        arguments, i.e. it is not derived from the name at the same position.
    """
    # Build one (name, company, email) tuple per record so the three
    # generator calls happen in that order for every record.
    records = [
        (
            gen_data.create_name(full_name=False),
            gen_data.create_company_name(),
            gen_data.create_email(),
        )
        for _ in range(N)
    ]
    person_names = [record[0] for record in records]
    company_names = [record[1] for record in records]
    email_addresses = [record[2] for record in records]
    return person_names, company_names, email_addresses

In [3]:
# Generate 1,000 synthetic records as three parallel lists (names, companies, emails).
name, company, email = create_data(N=1000)

In [5]:
# Assemble the generated lists into a pandas DataFrame with a 1-based ID column.
# The ID range is derived from the data itself so the frame stays consistent
# if the sample size passed to create_data changes.
df = pd.DataFrame({'ID': np.arange(1, len(name) + 1),
                   'Name': name,
                   'Company': company,
                   'Email': email})

In [6]:
# Wrap the pandas frame in a Dask DataFrame (a single partition here).
data = dd.from_pandas(df, npartitions=1)

In [7]:
# Preview the first three rows to sanity-check the generated columns.
data.head(3)

Unnamed: 0,ID,Name,Company,Email
0,1,Joanne,Virtual Max Analysis,T.Clements@veroUtwisi.tv
1,2,Stacey,Galaxy Architecture Contract,Zack.Newberry@facilisipraesent.gov
2,3,Ophelia,Data Star Max,Bethany@eratfeugait.info


# Search function 

In [None]:
# Dask DataFrames are used because the task requires handling large amounts of data.
# Dask enables efficient parallel computation on a single machine by leveraging its multi-core CPU.
# Dask scales pandas, scikit-learn, and NumPy workflows natively, with minimal rewriting.
# It integrates well with these tools: it mirrors most of their APIs and uses their data structures internally.

In [8]:
def search_record(name, cat='name', max_distance=3, limit=10):
    '''
    Fuzzy-search the module-level Dask DataFrame ``data`` by Levenshtein distance.

    Parameters
    ----------
    name : str
        The search string. It is compared case-insensitively.
    cat : str, default 'name'
        Which column to search: 'name' searches the ``Name`` column; any
        other value (e.g. 'company') searches the ``Company`` column.
    max_distance : int, default 3
        Largest edit distance that still counts as a match.
    limit : int, default 10
        Maximum number of records returned. The closest matches win; among
        equally close matches the earlier rows of ``data`` win.

    Returns
    -------
    pandas.DataFrame
        The selected records with the same columns as ``data``, sorted by
        ``Name`` and with a fresh 0-based index. Empty if nothing matches.
    '''
    column = 'Name' if cat == 'name' else 'Company'
    query = name.lower()

    def _score_partition(partition):
        # Keep only the rows of one partition within max_distance of the query.
        # Rows with a missing value in the searched column cannot match.
        present = partition[partition[column].notna()]
        scored = present.assign(
            _distance=[lev.distance(value.lower(), query) for value in present[column]]
        )
        return scored[scored['_distance'] <= max_distance]

    # Score every partition exactly once. This avoids one Dask computation per
    # row and does not rely on head(), which by default only looks at the first
    # partition and therefore breaks for data with more than one partition.
    candidates = data.map_partitions(_score_partition).compute()

    # Stable sort: closest matches first, original row order within a distance.
    best = candidates.sort_values('_distance', kind='mergesort').head(limit)

    # Present the result alphabetically by Name; the stable sort keeps ties
    # on Name in a deterministic order.
    return (best.drop(columns='_distance')
                .sort_values(by='Name', kind='mergesort')
                .reset_index(drop=True))

In [9]:
# Example: fuzzy lookup of 'Kareem' in the Name column (the default category).
search_record('Kareem')

[677, 831, 66, 121, 191, 259, 262, 469, 514, 554]


Unnamed: 0,ID,Name,Company,Email
0,259,Arleen,West Provider,Theresa.Ivey@facilisisveniamquis.com
1,191,Jarred,Venture Design Power,Tim@nullatationullamcorper.tv
2,514,Kara,West East Source,Retha@volutpatdoloremagna.us
3,677,Kareem,Network Provider Bell,Mellissa@duisiusto.info
4,831,Karen,Max East,Jacob.Byrne@iustoodiolorem.tv
5,66,Karin,Federated Internet Star,R.Dowling@consequatfacilisi.info
6,121,Karol,Vision Vision,S.Hamm@velaliquam.org
7,554,Karon,Omega Future Star,N.Lynch@dolorautem.info
8,262,Marie,Studio Design,T.Stansberry@luptatumet.org
9,469,Tyree,Atlantic Bell,K.Doolittle@nibhexerci.edu
