In [1]:
import pandas as pd
from nltk.util import ngrams
import csv

In [2]:
# Both raw inputs sit in the same folder, so spell that folder out once.
raw_dir = "/Users/bishmaybarik/Library/CloudStorage/OneDrive-ShivNadarInstitutionofEminence/CPHS data cleaning/01. Raw"

# `loc` is the people_of_india_clean_2014 table; `temp` is the
# minority_conc_census_2011 table whose district names are matched against it.
loc = pd.read_csv(f"{raw_dir}/people_of_india_clean_2014.csv")
temp = pd.read_csv(f"{raw_dir}/minority_conc_census_2011.csv")

In [3]:
# Extract the district columns, dropping missing values and duplicates.
# drop_duplicates() alone keeps one NaN when any row has no district; that NaN
# is a float, and the matching step calls .lower() on every name taken from
# `loc`, so missing values are removed here before they can reach it.
loc_production_district = loc['district'].dropna().drop_duplicates()
temp_district = temp['districtname'].dropna().drop_duplicates()

In [4]:
# Function to generate n-grams for a string
def generate_ngrams(text, n):
    """Return the n-grams of ``text`` as a list.

    Args:
        text: Sequence to split; callers in this notebook pass a string.
        n: Number of consecutive items per n-gram.

    Returns:
        list: Every item produced by ``nltk.util.ngrams(text, n)``, in order.
    """
    grams = ngrams(text, n)
    return [*grams]

In [5]:
# Function to calculate Jaccard similarity between two sets of n-grams
def jaccard_similarity(set1, set2):
    """Return the Jaccard index ``|A ∩ B| / |A ∪ B|`` of two sets.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        The ratio of shared elements to distinct elements overall, or ``0``
        when both sets are empty (the union is empty, so the ratio is undefined).
    """
    shared = len(set1.intersection(set2))
    # Inclusion-exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|
    combined = len(set1) + len(set2) - shared
    if combined == 0:
        return 0
    return shared / combined

In [6]:
# Function to find the most similar matching district using n-grams
def find_matching_district(query_district, district_list, n=2, threshold=0):
    """Find the entry of ``district_list`` most similar to ``query_district``.

    Both names are lower-cased, split into n-grams with ``generate_ngrams``,
    and compared with ``jaccard_similarity``.

    Args:
        query_district: Name to look up. A non-string value (for example the
            float NaN pandas uses for a missing name) yields "no match"
            instead of raising.
        district_list: Iterable of candidate names. Float entries are skipped;
            any other value is converted with ``str()`` before comparison.
        n: Size of the n-grams (default 2).
        threshold: Minimum similarity a candidate needs to be accepted.

    Returns:
        tuple: ``(best_match, best_similarity)``. ``best_match`` is the
        candidate exactly as it appears in ``district_list``; the result is
        ``(None, 0)`` when no candidate has a similarity above zero that also
        reaches ``threshold``.
    """
    # A missing or non-text query has nothing to compare; the original code
    # raised AttributeError on `.lower()` here.
    if not isinstance(query_district, str):
        return None, 0

    query_ngrams = set(generate_ngrams(query_district.lower(), n))
    best_match = None
    best_similarity = 0

    for district in district_list:
        # Skip float entries (pandas stores missing names as float NaN)
        if isinstance(district, float):
            continue

        district_ngrams = set(generate_ngrams(str(district).lower(), n))
        similarity = jaccard_similarity(query_ngrams, district_ngrams)

        # Strict '>' means the first candidate wins ties and a candidate with
        # zero similarity is never returned, even when threshold is 0.
        if similarity > best_similarity and similarity >= threshold:
            best_similarity = similarity
            best_match = district

    return best_match, best_similarity

In [7]:
# Matching process and storing probabilities
# Each entry is (district from `loc`, best match from `temp`, similarity score).
matched_district_with_probabilities = [
    (loc_district_name, *find_matching_district(loc_district_name, temp_district))
    for loc_district_name in loc_production_district
]

In [8]:
# Print matched district names and probabilities
# The loop variable must not be named `loc_production_district`: that name
# holds the Series of districts built earlier, and rebinding it here would
# leave a plain string behind, breaking any re-run of the matching cell.
# The "Probability" shown is the similarity score stored in the third tuple slot.
for source_district, matched_district, probability in matched_district_with_probabilities:
    print(f"District from people_of_india_clean_2014: {source_district} => Matched district from minority_conc_census_2011: {matched_district} (Probability: {probability:.2f})")

District from people_of_india_clean_2014: Bandipore => Matched district from minority_conc_census_2011: BANDIPUR (Probability: 0.50)
District from people_of_india_clean_2014: Ganderbal => Matched district from minority_conc_census_2011: PORBANDAR (Probability: 0.33)
District from people_of_india_clean_2014: Baramula => Matched district from minority_conc_census_2011: BARAMULLA (Probability: 0.88)
District from people_of_india_clean_2014: Srinagar => Matched district from minority_conc_census_2011: SRINAGAR (Probability: 1.00)
District from people_of_india_clean_2014: Anantnag => Matched district from minority_conc_census_2011: ANANTHNAG (Probability: 0.57)
District from people_of_india_clean_2014: Udhampur => Matched district from minority_conc_census_2011: UDHAMPUR (Probability: 1.00)
District from people_of_india_clean_2014: Jammu => Matched district from minority_conc_census_2011: JAMMU (Probability: 1.00)
District from people_of_india_clean_2014: Samba => Matched district from mino

In [9]:
# Save results to a CSV file
output_filename ="/Users/bishmaybarik/Library/CloudStorage/OneDrive-ShivNadarInstitutionofEminence/CPHS data cleaning/01. Raw/district_matching_results.csv"
# newline="" lets the csv module control line endings; the explicit encoding
# keeps the file identical regardless of the machine's locale settings.
with open(output_filename, "w", newline="", encoding="utf-8") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["people_of_india_clean_2014 District", "Matched minority_conc_census_2011 District", "Matching Probability"])
    # Each stored tuple is already one output row. Writing them in bulk also
    # avoids a loop variable named `loc_production_district`, which would
    # overwrite the Series of districts of the same name.
    csvwriter.writerows(matched_district_with_probabilities)

In [10]:
# Confirm where the matching results were written.
print("Results saved to", output_filename)

Results saved to /Users/bishmaybarik/Library/CloudStorage/OneDrive-ShivNadarInstitutionofEminence/CPHS data cleaning/01. Raw/district_matching_results.csv
