In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [9]:
import kagglehub
# Fetch the dataset through kagglehub and print the local directory it reports.
# NOTE(review): `path` is only printed here; the load cell further down uses a
# hard-coded location instead — confirm the two always agree.
path = kagglehub.dataset_download("abdullahshf/neet-ug-2024-results-all-india")
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/neet-ug-2024-results-all-india


Importing libraries and dependencies

In [25]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import fasttext
import fasttext.util
from annoy import AnnoyIndex
from sklearn.metrics import precision_score, recall_score, f1_score


In [26]:
# Download the punkt tokenizer models
# (presumably required by word_tokenize, which is used later to split the text
# into tokens; the recorded output shows the package was already up to date).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:
# Location of the results CSV inside the Kaggle input directory
dataset_path = '/kaggle/input/neet-ug-2024-results-all-india/all_data.csv'

# Read the CSV and discard every row that has a missing value in any column.
# Chaining (instead of an in-place drop) keeps this cell idempotent on re-run.
df = pd.read_csv(dataset_path).dropna()

# The center_state column is the only text used downstream
text_data = df.loc[:, 'center_state'].tolist()
print(f"Dataset loaded and preprocessed. Total records: {len(text_data)}")


Dataset loaded and preprocessed. Total records: 2332858


Tokenize Text Data

In [28]:
# Tokenize text data.
# The same strings recur many times in text_data (presumably a small set of
# state names), so each distinct lowercase string is run through the tokenizer
# only once and the result is reused for every later occurrence.
_token_cache = {}
tokenized_text = []
for text in text_data:
    lowered = text.lower()
    tokens = _token_cache.get(lowered)
    if tokens is None:
        tokens = _token_cache[lowered] = word_tokenize(lowered)
    # Append a copy so no two entries of tokenized_text share one list object,
    # matching what a fresh word_tokenize call per row would produce.
    tokenized_text.append(list(tokens))
print("Text tokenized successfully.")

Text tokenized successfully.


Save Tokenized Text for FastText Training

In [29]:
# Save tokenized text: one sentence per line, tokens separated by single spaces.
# The encoding is given explicitly so the file contents do not depend on the
# platform's default locale encoding (fastText reads its training file as UTF-8).
with open("tokenized_text.txt", "w", encoding="utf-8") as f:
    for sentence in tokenized_text:
        f.write(" ".join(sentence) + "\n")


In [30]:
# Train an unsupervised skip-gram FastText model on tokenized_text.txt.
# No vector dimension is passed, so the library default applies; the Annoy
# index built from this model must be created with the same vector length.
fasttext_model = fasttext.train_unsupervised("tokenized_text.txt", model='skipgram')
print("FastText model trained.")

FastText model trained.


In [31]:
# Retrieve the word vectors from the FastText model
def get_word_vector(word: str):
    """Return the embedding of ``word`` from the module-level ``fasttext_model``.

    Thin wrapper around ``fasttext_model.get_word_vector``; whatever that call
    returns is passed back unchanged.
    """
    return fasttext_model.get_word_vector(word)

Create Annoy Index for Fast Search Retrieval

In [32]:
# Step 3: Annoy Index for Fast Search Retrieval
# Read the embedding size from the trained model instead of hard-coding it, so
# the index always matches the model even if the training dimension changes.
vector_size = fasttext_model.get_dimension()
annoy_index = AnnoyIndex(vector_size, 'angular')

# Add FastText vectors to the Annoy index.
# Item id i is the word's position in fasttext_model.get_words(); lookups map
# ids back to words through that same list, so the order must not change.
vocabulary_words = fasttext_model.get_words()
for i, word in enumerate(vocabulary_words):
    annoy_index.add_item(i, get_word_vector(word))

# Build a forest of 10 trees and persist the index to disk.
annoy_index.build(10)
annoy_index.save('annoy_index.ann')
print("Annoy index built and saved.")


Annoy index built and saved.


Define Semantic Search Function

In [35]:
# Step 4: Query Handling
def semantic_search(query, top_n=5):
    """Find the vocabulary words closest to a free-text query.

    The query is lower-cased and tokenized; tokens that are not in the FastText
    vocabulary are ignored. The vectors of the remaining tokens are averaged
    and the Annoy index is searched for the nearest items.

    Args:
        query: Query string.
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of ``(word, distance)`` tuples in the order returned by the
        Annoy index. The list is empty when the query contains no token that
        is present in the vocabulary (including an empty query).
    """
    # Fetch the vocabulary once per call; the set gives O(1) membership tests
    # and the list maps Annoy item ids back to words.
    vocabulary = fasttext_model.get_words()
    known_words = set(vocabulary)

    # Tokenize and vectorize query
    query_tokens = word_tokenize(query.lower())
    token_vectors = [get_word_vector(token) for token in query_tokens if token in known_words]
    if not token_vectors:
        # Nothing to average: avoids a division by zero on an empty query and
        # avoids handing a scalar to the index when no token is known.
        return []

    # Average over the tokens that actually contributed a vector.
    query_vector = sum(token_vectors) / len(token_vectors)

    # Retrieve nearest neighbors
    neighbor_ids, distances = annoy_index.get_nns_by_vector(query_vector, top_n, include_distances=True)

    # Map results back to words
    return [(vocabulary[idx], dist) for idx, dist in zip(neighbor_ids, distances)]


In [36]:
# Example query
query = "delhi state"
results = semantic_search(query)
print(f"Results for query '{query}': {results}")

Results for query 'delhi state': [('delhi', 0.0), ('haryana', 0.03968814015388489), ('assam', 0.04130309820175171), ('telangana', 0.05164416879415512), ('gujarat', 0.051745105534791946)]
