<a href="https://colab.research.google.com/github/ahelmasri87/aaa/blob/main/TF_IDF_(for_basic_similarity_matching)_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the three code tables from their Excel workbooks. Each workbook is
# expected to provide "Service Description" and "Code" columns.
sbs_df, cpt_df, achi_df = (
    pd.read_excel(workbook)
    for workbook in (
        "/content/sbs_codes.xlsx",
        "/content/cpt_codes.xlsx",
        "/content/achi_codes.xlsx",
    )
)

# Show the column labels of every table (one per line) so the expected
# headers can be confirmed by eye before any processing happens.
print(*(table.columns for table in (sbs_df, cpt_df, achi_df)), sep="\n")


Index(['Code', 'Service Description'], dtype='object')
Index(['Code', 'Service Description'], dtype='object')
Index(['Code', 'Service Description'], dtype='object')


In [35]:
# TF-IDF (for basic similarity matching).

# Apply the same normalisation to each code table in turn:
#   1. column labels are stripped of surrounding whitespace and lower-cased,
#      so later cells can address them as "code" / "service description";
#   2. the description text itself is lower-cased for uniform matching.
for _table in (sbs_df, cpt_df, achi_df):
    _table.columns = _table.columns.str.strip().str.lower()
    _table["service description"] = _table["service description"].str.lower()


def _find_best_match(df, query):
    """Return the row of ``df`` whose description is most similar to ``query``.

    A TfidfVectorizer is fitted on ``df["service description"]``, ``query`` is
    projected into the same vector space, and rows are ranked by cosine
    similarity.

    Args:
        df: DataFrame with "code" and "service description" columns.
        query: Free-text service description to look up.

    Returns:
        A ``(code, description, score)`` tuple for the highest-scoring row,
        where ``score`` is the cosine similarity as a float, or ``None`` when
        the table has no rows or no row shares a term with ``query`` (which
        includes an empty ``query``).
    """
    # Missing cells would make TfidfVectorizer raise, so treat them as empty
    # text; such rows simply score 0 against every query.
    descriptions = df["service description"].fillna("").astype(str)
    if descriptions.empty:
        return None

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(descriptions)
    query_vec = vectorizer.transform([query])

    # cosine_similarity returns a (1, n_rows) array; keep the single row.
    scores = cosine_similarity(query_vec, tfidf_matrix)[0]
    best_idx = int(scores.argmax())
    best_score = float(scores[best_idx])

    # When every score is 0, argmax() still returns index 0. Reporting that
    # row would present an unrelated code as the "best match".
    if best_score <= 0.0:
        return None

    return df.iloc[best_idx]["code"], descriptions.iloc[best_idx], best_score


def get_code_from_description():
    """
    Interactively look up a medical code from a free-text description.

    Prompts for a service description and then for the code system (CPT, SBS
    or ACHI). The description is matched against the chosen table with TF-IDF
    and cosine similarity, and the best-matching code, its description and
    the similarity percentage are printed.

    A message is printed instead of a match when the code system is not one
    of the three supported values, or when no description in the chosen table
    shares any term with the input.

    Returns:
        None. All results are reported via ``print``.
    """
    # Step 1: Ask for the description to search for
    user_input = input("Enter the medical service description: ").strip().lower()

    # Step 2: Ask which code system to search
    code_type = input("Which code do you need? (CPT, SBS, ACHI): ").strip().upper()

    # Select the dataset that belongs to the requested code system
    datasets = {"SBS": sbs_df, "CPT": cpt_df, "ACHI": achi_df}
    df = datasets.get(code_type)
    if df is None:
        print("Invalid selection! Please choose CPT, SBS, or ACHI.")
        return

    match = _find_best_match(df, user_input)
    if match is None:
        print("\nNo matching description found. Try different or more specific wording.")
        return

    best_match_code, best_match_description, best_match_score = match

    # Convert similarity score to percentage
    similarity_percentage = round(best_match_score * 100, 2)

    # Print the outcome
    print("\nBest Match Found:")
    print(f"Code: {best_match_code}")
    print(f"Description: {best_match_description.capitalize()}")
    print(f"Similarity: {similarity_percentage}%")

# Run one interactive lookup: the function prompts for a service description
# and a code type, then prints the best-matching code with its similarity.
get_code_from_description()


Enter the medical service description: Incision and drainage of pilonidal cyst
Which code do you need? (CPT, SBS, ACHI): cpt

Best Match Found:
Code: 10080
Description: Incision & drainage pilonidal cyst simple
Similarity: 79.86%
