# A script that uses fuzzy string matching to find records that should merge but differ because of typos or formatting differences.

In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

The function below cleans the three vendor files (JASA, Selfhelp and NYF), which do not meet the conditions we need for merging in the form they were exported.

In [None]:
def excelcleaner(csvfilelocation, vendor):
    """Load one vendor CSV export and normalise it to the shared merge layout.

    Parameters
    ----------
    csvfilelocation : str or path-like
        Location of the vendor CSV file (read with ``latin1`` encoding).
    vendor : str
        Label written to the 'Source' column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'Name' ("Last, First"), 'SSN' (dashes removed),
        'DOB', 'Source' and 'CG Tracker?' (always 'No'). Cells equal to
        'None', 'nan', '', ' ', 'NONE' or 'na' are converted to NaN.

    Notes
    -----
    Assumes that the columns up to and including 'DOB' are exactly
    'Case Manager', 'Supervisor', ' Last Name', ' First  Name', one SSN
    column (any header) and 'DOB', with the SSN column before 'DOB'; the
    positional rename below depends on it -- TODO confirm for each vendor.
    """
    df = pd.read_csv(csvfilelocation, encoding='latin1')  # some files aren't read without encoding

    # Keep only the columns up to and including 'DOB'. Copy the slice so the
    # column assignments below act on an independent frame rather than on a
    # view of `df`.
    df_data = df.loc[:, :'DOB'].copy()

    df_data['Name'] = df_data[' Last Name'] + ', ' + df_data[' First  Name']
    df_data = df_data.drop(
        columns=['Case Manager', 'Supervisor', ' Last Name', ' First  Name']
    )
    # Positional rename: the SSN header differs from the name we want, so the
    # three remaining columns are relabelled by position.
    df_data.columns = ['SSN', 'DOB', 'Name']
    df_data = df_data[['Name', 'SSN', 'DOB']]
    df_data['Source'] = vendor
    df_data['CG Tracker?'] = 'No'

    # Strip dashes so SSNs compare equal regardless of how they were typed.
    # Any missing value that the string cast renders as 'nan' is turned back
    # into NaN by the replace() below.
    df_data['SSN'] = df_data['SSN'].astype(str).str.replace('-', '', regex=False)

    return df_data.replace(['None', 'nan', '', ' ', 'NONE', 'na'], np.nan)

In [None]:
# Clean each vendor export; the second item of every pair is the label that
# excelcleaner writes to the 'Source' column.
Jasadf, Selfhelpdf, Nyfdf = [
    excelcleaner(csv_path, vendor_label)
    for csv_path, vendor_label in [
        ('/Users/Carlos/Desktop/JASA.csv', 'Jasa'),
        ('/Users/Carlos/Desktop/SH.csv', 'Selfhelp'),
        ('/Users/Carlos/Desktop/NYF.csv', 'NYF'),
    ]
]

# The Gili file is loaded as-is, without going through excelcleaner.
Gili = pd.read_csv('/Users/Carlos/Desktop/Gili_File.csv')

Below, I am merging the three cleaned datasets and exporting the new dataframe to a .csv file. 

This new .csv file will be used in other projects (such as Cleanup_Github). For more information, please refer to the README.md file.

In [None]:
# Outer-merge the three cleaned vendor frames (no explicit key is given, so
# pandas joins on the columns the frames have in common), then write the
# combined table to disk without the index column.
AllVendor = pd.merge(
    pd.merge(Jasadf, Selfhelpdf, how='outer'),
    Nyfdf,
    how='outer',
)
AllVendor.to_csv("/Users/Carlos/Desktop/AllVendor.csv", index=False)

In [None]:
def fuzzy_merge(df_1, df_2, key1, key2, threshold = 90, limit = 2):
    """Attach to ``df_1`` the close matches of ``df_1[key1]`` found in ``df_2[key2]``.

    Parameters
    ----------
    df_1 : pandas.DataFrame
        Frame whose ``key1`` values are looked up. It is modified in place:
        a 'matches' column is added (or overwritten).
    df_2 : pandas.DataFrame
        Frame whose ``key2`` values are the candidate matches.
    key1, key2 : str
        Column names to compare in ``df_1`` and ``df_2`` respectively.
    threshold : int, default 90
        Minimum score a candidate needs to be kept.
    limit : int, default 2
        Maximum number of candidates requested from ``process.extract`` per row.

    Returns
    -------
    pandas.DataFrame
        The same ``df_1`` object, with a 'matches' column holding the kept
        candidates joined by ', ' (an empty string when there are none).
    """
    # process.extract expects string inputs, so missing values are kept out
    # of the candidate list instead of being handed to it.
    choices = df_2[key2].dropna().tolist()

    def _matches_for(value):
        # A missing key has nothing to compare; report "no matches" for it.
        if pd.isna(value):
            return ''
        candidates = process.extract(value, choices, limit = limit)
        return ', '.join([hit[0] for hit in candidates if hit[1] >= threshold])

    df_1['matches'] = df_1[key1].apply(_matches_for)

    return df_1

In [None]:
# Look up every Gili name among the merged vendor names, keeping candidates
# that score at least 90.
fmerge = fuzzy_merge(Gili, AllVendor, key1='Name', key2='Name', threshold=90)

# Rows without a match carry an empty 'matches' string; turning placeholder
# values into NaN (in place) lets the filter below discard them.
fmerge.replace(['None', 'nan', '', ' ', 'NONE', 'na'], np.nan, inplace = True)
fmerge = fmerge.loc[fmerge['matches'].notna()]

# Optional export of the candidate matches (left disabled):
#fmerge.to_excel("/Users/Carlos/Desktop/Possible False Negative Matches.xlsx")