<a href="https://colab.research.google.com/github/akshaya-bharadhwaj/J008-SNLP-Labs/blob/master/J008_Text_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Libraries**

In [2]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [3]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process, fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re



# **Load the Data**

In [4]:
resolved_queries = pd.read_csv('resolved_queries.csv')
new_queries = pd.read_csv('new_queries.csv')

In [5]:
resolved_queries.head()

Unnamed: 0,Query_ID,Pre_Resolved_Query
0,1,Unable to connect to the internet
1,2,Payment failed during checkout
2,3,App crashes when opening settings
3,4,Forgot password and unable to reset
4,5,Unable to upload files to the server


In [6]:
new_queries.head()

Unnamed: 0,Variation_Query,Matches_With_Query_ID
0,Unabel to conect to the internet,1
1,Can’t connect to internet,1
2,Intenet not working,1
3,Payment failed while chekout,2
4,Payment did not go through during chckout,2


# **Preprocess the Text**

In [7]:
# Preprocess text data
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

In [8]:
# Apply preprocessing

resolved_queries['Pre_Resolved_Query'] = resolved_queries['Pre_Resolved_Query'].apply(preprocess)
new_queries['Variation_Query'] = new_queries['Variation_Query'].apply(preprocess)

# **Fuzzy Matching**

In [11]:
def fuzzy_match(query, choices, scorer = fuzz.token_sort_ratio, threshold=80):
  best_match, score, index = process.extractOne(query, choices)
  if score >= threshold:
    return best_match
  else:
    return None

In [12]:
print("Fuzzy Matching Example:")
for i, query in enumerate(new_queries['Variation_Query'].head(5)):
    match = fuzzy_match(query, resolved_queries['Pre_Resolved_Query'])
    print(f"New Query: {query}\nBest Match: {match}\n")

Fuzzy Matching Example:
New Query: unabel to conect to the internet
Best Match: unable to connect to the internet

New Query: cant connect to internet
Best Match: unable to upload files to the server

New Query: intenet not working
Best Match: None

New Query: payment failed while chekout
Best Match: payment failed during checkout

New Query: payment did not go through during chckout
Best Match: None



# **TF-IDF with Cosine Similarity**

In [13]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_resolved = tfidf_vectorizer.fit_transform(resolved_queries['Pre_Resolved_Query'])
tfidf_matrix_new = tfidf_vectorizer.transform(new_queries['Variation_Query'])

In [14]:
cosine_similarities = cosine_similarity(tfidf_matrix_new, tfidf_matrix_resolved)

# **Best Matches**

In [15]:
# Determine the best matches
def find_best_matches(cosine_similarities, resolved_queries, new_queries, threshold=0.5):
    matches = []
    for i, sims in enumerate(cosine_similarities):
        best_idx = np.argmax(sims)
        best_score = sims[best_idx]
        if best_score >= threshold:
            matches.append({
                'Variation_Query': new_queries.iloc[i]['Variation_Query'],
                'Resolved_Query': resolved_queries.iloc[best_idx]['Pre_Resolved_Query'],
                'Similarity_Score': best_score
            })
    return pd.DataFrame(matches)

In [16]:
# Finding the best matches with a threshold of 0.5
matches_df = find_best_matches(cosine_similarities, resolved_queries, new_queries, threshold=0.5)

print("Matches based on TF-IDF Cosine Similarity:")
print(matches_df.head())

Matches based on TF-IDF Cosine Similarity:
                             Variation_Query  \
0           unabel to conect to the internet   
1                   cant connect to internet   
2               payment failed while chekout   
3  payment did not go through during chckout   
4                 payment issue at check out   

                      Resolved_Query  Similarity_Score  
0  unable to connect to the internet          0.839042  
1  unable to connect to the internet          0.836936  
2     payment failed during checkout          0.707107  
3     payment failed during checkout          0.707107  
4     payment failed during checkout          0.500000  


In [17]:
# Save the matches to a CSV file
matches_df.to_csv('matched_queries.csv', index=False)