In [2]:
import pandas as pd
import re
import spacy
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the question/answer table; the first CSV column is used as the DataFrame index
df = pd.read_csv('assets/result.csv', index_col=[0])

In [4]:
# Display the freshly loaded table to check its contents
df

Unnamed: 0,Question,Answer
0,1. What is Data Science?,An interdisciplinary field that constitutes va...
1,2. What is the difference between data analyti...,Data science involves the task of transforming...
2,3. What are some of the techniques used for sa...,Data analysis can not be done on a whole volum...
3,4. List down the conditions for Overfitting an...,Overfitting: The model performs well only for ...
4,5. Differentiate between the long and wide for...,Long format Data\nWide-Format Data\n\n\n\nHere...
...,...,...
151,Situational question based on the resume.,"If you have a gap in your resume, recruiters w..."
152,Is Data Science Hard To Learn?,No. Anyone with the desire and commitment can ...
153,Is Data Science a Good Career?,Yes. There is a huge demand fordata scientists...
154,How Long Does It Take To Transition Into Data ...,If you have a background in math or computer s...


In [5]:
def remove_enumeration(text):
    """Strip a leading list number such as "1. " or "12. " from a question.

    Only a run of digits at the very start of the string, followed by a
    period and at least one whitespace character, is removed. Text without
    such a prefix (including decimals like "3.5 ...") is returned unchanged.

    Parameters
    ----------
    text : str
        The question text. Non-string values are accepted and passed through.

    Returns
    -------
    str
        ``text`` without its leading enumeration, or the original value
        unchanged when ``text`` is not a string.
    """
    # Empty CSV cells are loaded by pandas as NaN (a float); hand those back
    # untouched instead of letting re.sub raise TypeError mid-apply.
    if not isinstance(text, str):
        return text

    # Use a regular expression to remove the enumeration (e.g., "1.", "2.")
    return re.sub(r'^\d+\.\s+', '', text)


In [6]:
# Strip the leading "N. " numbering from every entry of the 'Question' column
df['Question'] = df['Question'].map(remove_enumeration)

In [7]:
# Display the table again to confirm the leading numbers are gone from 'Question'
df

Unnamed: 0,Question,Answer
0,What is Data Science?,An interdisciplinary field that constitutes va...
1,What is the difference between data analytics ...,Data science involves the task of transforming...
2,What are some of the techniques used for sampl...,Data analysis can not be done on a whole volum...
3,List down the conditions for Overfitting and U...,Overfitting: The model performs well only for ...
4,Differentiate between the long and wide format...,Long format Data\nWide-Format Data\n\n\n\nHere...
...,...,...
151,Situational question based on the resume.,"If you have a gap in your resume, recruiters w..."
152,Is Data Science Hard To Learn?,No. Anyone with the desire and commitment can ...
153,Is Data Science a Good Career?,Yes. There is a huge demand fordata scientists...
154,How Long Does It Take To Transition Into Data ...,If you have a background in math or computer s...


In [8]:
# Load spaCy's small English pipeline; it is used below to turn each question into a vector.
# NOTE(review): spaCy's documentation says the "sm" packages ship without static
# word vectors, which makes vector-based similarity less reliable. Consider
# en_core_web_md / en_core_web_lg; confirm against the installed model.
nlp = spacy.load("en_core_web_sm")

In [9]:
# Preprocess text data: convert every question to lowercase before comparing them.
# NOTE(review): this overwrites the 'Question' column, and df_cleaned (later written
# to assets/final_result.csv) is derived from df, so the exported questions are
# lowercased as well. Confirm that is intended.
df['Question'] = df['Question'].str.lower()

In [10]:
def are_questions_duplicates(q1, q2, threshold=0.9):
    """Decide whether two questions are near-duplicates of each other.

    Both texts are run through the module-level spaCy pipeline ``nlp``; the
    ``.vector`` of each resulting document is reshaped to a single row and the
    two rows are compared with cosine similarity.

    Parameters
    ----------
    q1, q2 : str
        The question texts to compare.
    threshold : float, optional
        Minimum cosine similarity at which the pair counts as duplicates.
        Defaults to 0.9; experiment with different values.

    Returns
    -------
    bool-like
        True when the similarity is greater than or equal to ``threshold``.
    """
    # Tokenize and vectorize the questions
    doc1 = nlp(q1)
    doc2 = nlp(q2)

    # cosine_similarity expects 2-D input, hence the (1, -1) reshape; the
    # result is a 1x1 matrix whose only entry is the score.
    similarity_score = cosine_similarity(
        doc1.vector.reshape(1, -1),
        doc2.vector.reshape(1, -1),
    )[0][0]

    # Check if the questions are duplicates
    return similarity_score >= threshold

In [11]:
# Cosine-similarity cut-off at or above which a later question is treated as a
# duplicate of an earlier one (the same 0.9 value are_questions_duplicates uses).
SIMILARITY_THRESHOLD = 0.9

# Parse every question exactly once. Calling nlp() inside the pairwise loop
# re-parsed each question for every pair it took part in, which dominated the
# run time of this cell.
row_labels = df.index.tolist()
question_vectors = [nlp(question).vector for question in df['Question']]

# A single call produces the whole pairwise similarity matrix; the empty case is
# guarded because cosine_similarity rejects an input with no rows.
similarity = cosine_similarity(question_vectors) if question_vectors else []

# Create a list to store the index labels of duplicate questions (in discovery order)
indices_to_drop = []
dropped_positions = set()

# Iterate through the dataset to find duplicates. The scan works on row
# positions rather than index labels, so it does not rely on the index being
# exactly 0..n-1.
for i in range(len(row_labels)):
    # A question already flagged as a duplicate must not eliminate further
    # questions, otherwise a row can be dropped even though no kept row resembles it.
    if i in dropped_positions:
        continue
    for j in range(i + 1, len(row_labels)):
        if j not in dropped_positions and similarity[i][j] >= SIMILARITY_THRESHOLD:
            dropped_positions.add(j)
            indices_to_drop.append(row_labels[j])  # Store the label of the duplicate question

# Keep the surviving rows and reset the index so it is sequential again
kept_positions = [pos for pos in range(len(row_labels)) if pos not in dropped_positions]
df_cleaned = df.iloc[kept_positions].reset_index(drop=True)

In [12]:
# Show which row labels were flagged as duplicates, in the order they were found
print(indices_to_drop)

[112, 140, 134, 129, 153, 127, 72, 130, 149, 141]


In [24]:
# Save the de-duplicated table. to_csv writes the index as the first column by
# default, which matches how the input CSV is read above (index_col=[0]).
df_cleaned.to_csv('assets/final_result.csv')