In [None]:
import pandas as pd
import numpy as np
from rapidfuzz import fuzz
from ethnicolr import pred_wiki_name
from ethnicolr import census_ln

names = pd.read_csv("americans_by_descent.csv", encoding='latin-1')

first_names = set()
last_names = set()

with open('first.txt', 'r', encoding='utf-8') as textfile:
    for line in textfile:
        line = line.strip()
        first_names.add(line.upper())

with open('last.txt', 'r', encoding='utf-8') as textfile:
    for line in textfile:
        line = line.strip()
        last_names.add(line.upper())

  names = pd.read_csv("americans_by_descent.csv", encoding='latin-1')


In [4]:
def check_name_fuzzy(name, name_set, threshold):
    name = str(name).upper()
    words = name.split()

    for word in words:
        if word in name_set:
            return True

        if len(word) < 4:
            current_threshold = threshold + 10
        else:
            current_threshold = threshold

        for candidate in name_set:
            partial = fuzz.partial_ratio(word, candidate)
            token = fuzz.token_sort_ratio(word, candidate)
            ratio = fuzz.ratio(word, candidate)

            if abs(len(word) - len(candidate)) <= 2:
                weighted_score = (partial * 0.5) + (token * 0.3) + (ratio * 0.2)
                if weighted_score >= current_threshold:
                    return True
    return False

In [5]:
def process_name(name, x, y):
    name_split = name.split()
    if len(name_split) < 2:
        return {'First Name': name, 'Last Name': '', 'Predicted Value': False}
        
    first_name = name_split[0]
    last_name = name_split[-1]

    pattern_score = 0

    sa_prefixes = ['raj', 'dev', 'deep', 'hari', 'sun', 'sur', 'pra', 'aman', 'anu']

    sa_letter_patterns = ['sh', 'th', 'dh', 'bh', 'gh', 'kh', 'ch']

    for prefix in sa_prefixes:
        if first_name.lower().startswith(prefix):
            pattern_score += 0.2
            
    for pattern in sa_letter_patterns:
        if pattern in first_name.lower() or pattern in last_name.lower():
            pattern_score += 0.1

    if not first_name or not last_name or pd.isna(first_name) or pd.isna(last_name):
        return {'First Name': first_name, 'Last Name': last_name, 'Predicted Value': False}

    fuzzy_match_first = check_name_fuzzy(first_name, first_names, x - 5)
    fuzzy_match_last = check_name_fuzzy(last_name, last_names, x)

    fuzzy_match = fuzzy_match_first or fuzzy_match_last

    if fuzzy_match or pattern_score > 0.2:
        census_preds = census_ln(pd.DataFrame({'Last Name': [last_name]}), lname_col="Last Name")
        census_preds['pctapi'] = pd.to_numeric(census_preds['pctapi'], errors='coerce')

        if census_preds['pctapi'].values[0] >= y or fuzzy_match_first or pattern_score > 0.4:
            wiki_preds = pred_wiki_name(pd.DataFrame({'First Name': [first_name], 'Last Name': [last_name]}),
                                        fname_col="First Name", lname_col="Last Name")

            wiki_preds['Asian,IndianSubContinent'] = pd.to_numeric(wiki_preds.get('Asian,IndianSubContinent', pd.Series([0])), errors='coerce')
            asian_indian_value = wiki_preds['Asian,IndianSubContinent'].values[0]

            adjusted_threshold = y * (0.9 if pattern_score > 0 else 1.0)
            is_predicted_sa = asian_indian_value >= adjusted_threshold if pd.notna(asian_indian_value) else False
        else:
            is_predicted_sa = False
    else:
        is_predicted_sa = False

    return {'First Name': first_name, 'Last Name': last_name, 'Predicted Value': is_predicted_sa}

In [None]:
real_vals = []
for i in range(0,499):
  real_vals.append(False)
for i in range(500,1000):
  real_vals.append(True)
  
real_vals_pd = pd.DataFrame(data=real_vals)
real_vals_pd.rename(columns={0:"Real"})

Unnamed: 0,Real
0,False
1,False
2,False
3,False
4,False
...,...
994,True
995,True
996,True
997,True


In [None]:
from re import I
import random

test_names = []

for i in range(0,500):
  val = random.randint(0,25561)
  test_names.append(names.at[val, 'name'])

for i in range(0, 500):
  val = random.randint(27501,len(names)-1)
  test_names.append(names.at[val, 'name'])

results = []
total_rows = len(test_names)

x = 7.5 
y = 0.12 

results = []
for i in range(len(test_names)):
    row = test_names[i]
    row_results = process_name(row, x, y)
    results.append(row_results)
    
results_df = pd.DataFrame(results)
frames = [results_df, real_vals_pd]
final = pd.concat(frames, axis=1)
final = final.rename(columns={0: "Real Value"})

correct = sum(1 for i in range(len(final)) 
             if final.loc[i,"Predicted Value"] == final.loc[i,"Real Value"])
accuracy = correct / len(final)

print(f"New parameters: x={x}, y={y}")
print(f"New accuracy: {accuracy * 100:.2f}%")
# Best Percentage: 91.7%, x = 7.5, y = 0.12


New parameters: x=7.5, y=0.12
New accuracy: 90.00%


In [None]:
df = pd.read_csv("master sheet w_providers - Sheet1.csv", header=None)
df.columns = ['Full Name']
df = df.dropna()
df = df[df['Full Name'].str.strip() != '']

x = 7.5
y = 0.12
results = df['Full Name'].apply(lambda name: process_name(name, x, y))
results_df = pd.DataFrame(results.tolist())
final_df = pd.concat([df, results_df], axis=1)

sa_names_df = final_df[final_df['Predicted Value'] == True]

sa_names_df.to_csv("south_asian_names.csv", index=False)


