In [1]:
import pandas as pd
import numpy as np
from dask import dataframe as dd
import string
import random
import names
from barnum import gen_data
import Levenshtein as lev

# Creating data

In [2]:
def create_data(N=10):
    """Generate N synthetic records with barnum's ``gen_data``.

    Parameters
    ----------
    N : int, default 10
        Number of records to generate. Zero or a negative value yields
        three empty lists.

    Returns
    -------
    tuple of (list, list, list)
        Parallel lists ``(name, company, email)``, each of length N. The
        three fields of a record are generated independently of each other.
    """
    # Generate the three fields of each record together, in the order
    # name -> company -> email, so the sequence of generator calls stays
    # the same as building the three lists side by side in one loop.
    records = [
        (
            gen_data.create_name(full_name=False),
            gen_data.create_company_name(),
            gen_data.create_email(),
        )
        for _ in range(N)
    ]
    generated_names = [record[0] for record in records]
    generated_companies = [record[1] for record in records]
    generated_emails = [record[2] for record in records]
    return generated_names, generated_companies, generated_emails

In [3]:
# Generate 1000 synthetic records as three parallel lists.
# NOTE(review): no random seed is set in the visible cells, so the generated
# values (and every output below) presumably differ between runs - confirm.
name, company, email = create_data(N=1000)

In [4]:
# Assemble the generated columns into one table. IDs are 1-based and derived
# from the number of generated records, so this cell stays valid when the N
# passed to create_data changes (a hard-coded 1..1000 range would raise a
# length-mismatch error for any other N).
df = pd.DataFrame({'ID': np.arange(1, len(name) + 1),
                   'Name': name,
                   'Company': company,
                   'Email': email})

In [5]:
# Wrap the pandas frame in a Dask dataframe; `data` is the global table that
# search_record() queries.
# NOTE(review): npartitions=1 keeps everything in a single partition, so there
# is presumably no partition-level parallelism at this size - raise it for
# genuinely large inputs.
data = dd.from_pandas(df, npartitions=1)

In [6]:
# Preview the first three records to sanity-check the generated data.
data.head(3)

Unnamed: 0,ID,Name,Company,Email
0,1,Socorro,Architecture Atlantic General,L.Champion@tincidunthendrerit.eu
1,2,Tammi,General Venture Electronic,Scot.Ortega@quisillum.net
2,3,Theodora,Star Application,Joey@etaccumsanea.info


# Search function 

In [7]:
# Dask DataFrames are used because the task requires handling large amounts of data.
# Dask enables efficient parallel computation on a single machine by leveraging its multi-core CPU.
# Dask scales pandas, scikit-learn, and NumPy workflows natively, with minimal rewriting.
# It integrates well with these tools: it mirrors most of their APIs and uses their data structures internally.

In [8]:
def search_record(name, cat='name', max_distance=3, limit=10):
    '''
    Fuzzy-search the global Dask dataframe ``data`` by Levenshtein distance.

    Parameters
    ----------
    name : str
        The string to search for. The comparison is case-insensitive.
    cat : str, default 'name'
        Which column to search: 'name' compares against the 'Name' column;
        any other value (intended: 'company') compares against 'Company'.
    max_distance : int, default 3
        Largest edit distance that still counts as a match.
    limit : int, default 10
        Maximum number of matching records to return.

    Returns
    -------
    pandas.DataFrame
        At most ``limit`` matching records. Candidates are picked closest
        first (distance 0, then 1, ...; ties keep data order), and the
        selected rows are then returned sorted by 'Name' with a fresh
        0-based index.
    '''
    column = 'Name' if cat == 'name' else 'Company'
    query = name.lower()

    # Materialise only the two needed columns once. Filtering the Dask frame
    # separately for every ID would rescan the whole table once per record.
    # NOTE: this assumes IDs are unique, as they are in this notebook.
    candidates = data[['ID', column]].compute()

    # buckets[d] collects, in data order, the IDs whose value is at edit
    # distance d from the query.
    buckets = [[] for _ in range(max_distance + 1)]
    for record_id, value in zip(candidates['ID'], candidates[column]):
        distance = lev.distance(value.lower(), query)
        if distance <= max_distance:
            buckets[distance].append(record_id)

    # Closest matches first, then cut to the requested number of records.
    ranked_ids = [record_id for bucket in buckets for record_id in bucket]
    selected_ids = ranked_ids[:limit]

    matches = data[data['ID'].isin(selected_ids)].compute()
    return matches.sort_values(by='Name').reset_index(drop=True)

In [9]:
# Example: the (up to) ten records whose Name is closest to 'Kareem',
# counting only names within edit distance 3 (case-insensitive).
search_record('Kareem')

Unnamed: 0,ID,Name,Company,Email
0,183,Arlen,Electronics People,Bethany@blanditlobortis.gov
1,190,Carmen,Adventure People Federated,Bryce.Toler@nonummyeum.net
2,56,Daren,Internet Interactive Vision,Dwayne.Acevedo@tationat.com
3,153,Harley,Provider Virtual Solutions,K.Register@elitseddignissim.info
4,258,Harley,Provider Atlantic Adventure,T.Armstrong@suscipitet.eu
5,44,Kara,North Universal General,Cindi@laoreetdolor.com
6,269,Karina,Application Analysis Technology,D.Pinckney@teaugue.eu
7,107,Karrie,Future Design Alpha,Eileen@laoreetmolestie.eu
8,58,Kaye,Galaxy Hill,Jefferson.Nunez@enimea.com
9,126,Reed,Design Virtual Resource,Mitzi@praesentex.edu
