In [5]:
# 1. Loading the Dataset
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

file_path = r"C:\Users\sharv\AppData\Local\Programs\Python\B11\B11\Input.csv"
data = pd.read_csv(file_path)
print(f"Original dataset shape: {data.shape}")


# 2. Preprocessing: Combining & Normalizing Text
def preprocess(row):
    combined = f"{row['ln']} {row['dob']} {row['gn']} {row['fn']}"
    return " ".join(combined.lower().strip().split())

data['Processed_Text'] = data.apply(preprocess, axis=1)


# 3. Feature Extraction Using TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['Processed_Text'])


# 4. Computing Cosine Similarity
similarity_matrix = cosine_similarity(tfidf_matrix)


# 5. Identifying Duplicate Records
threshold = 0.9

def find_duplicates(similarity_matrix, threshold):
    duplicates = set()
    for i in range(similarity_matrix.shape[0]):
        for j in range(i + 1, similarity_matrix.shape[1]):
            if similarity_matrix[i, j] > threshold:
                duplicates.add(j)
    return list(duplicates)

duplicate_indices = find_duplicates(similarity_matrix, threshold)


# 6. Removing Duplicates and Saving Cleaned Data
data_deduplicated = data.drop(index=duplicate_indices)
print(f"Deduplicated dataset shape: {data_deduplicated.shape}")

data_deduplicated.to_csv(
    r"C:\Users\sharv\AppData\Local\Programs\Python\B11\B11\deduplicated_dataset.csv",
    index=False
)
print("Deduplicated dataset saved to 'deduplicated_dataset.csv'.")


Original dataset shape: (149, 5)
Deduplicated dataset shape: (91, 6)
Deduplicated dataset saved to 'deduplicated_dataset.csv'.
