## Intelligent Query Processing and Information Retrieval System for App Reviews Analysis

# Build the inverted index over the reviews. process_query passes it to
# flexible_boolean_search to collect candidate document IDs.
# NOTE(review): presumably maps term -> collection of document IDs; confirm
# against build_inverted_index.
inverted_index = build_inverted_index(reviews_df)

# Build the positional index.
# NOTE(review): not referenced by any code visible in this section.
positional_index = build_positional_index(reviews_df)

# Fit a TF-IDF model on the 'cleaned_content' column. The fitted vectorizer
# and the resulting matrix are both handed to rank_documents, which scores
# candidate documents against the query by cosine similarity.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(reviews_df['cleaned_content'])

def simplified_expand_query(query):
    """Expand *query* with one WordNet synonym per word.

    Every whitespace-separated word of the query is kept, and for each word
    that ``wordnet.synsets`` knows, the first lemma of its first synset is
    added (underscores in multi-word lemmas become spaces). Duplicates are
    removed while preserving first-seen order, so the result is the same on
    every run.

    Args:
        query: Search string; split on whitespace.

    Returns:
        The space-joined expanded terms, or *query* unchanged when it
        contains no words.
    """
    terms = []
    for word in query.split():
        # Keep the original word: dropping it would lose terms that WordNet
        # does not know (app names, slang, typos) from the search entirely.
        terms.append(word)
        synsets = wordnet.synsets(word)
        if synsets:
            # Only the first lemma of the first synset, to keep expansion small.
            terms.append(synsets[0].lemmas()[0].name().replace('_', ' '))

    # dict.fromkeys de-duplicates while keeping insertion order (a set would
    # give a different term order from run to run).
    unique_terms = list(dict.fromkeys(terms))
    return " ".join(unique_terms) if unique_terms else query

def flexible_boolean_search(query, index):
    """Return the IDs of documents containing at least one query term.

    This is an OR search: the posting collections of all whitespace-separated
    terms in *query* are merged. Terms missing from *index* contribute
    nothing.

    Args:
        query: Search string; split on whitespace.
        index: Mapping from term to an iterable of document IDs.

    Returns:
        A new set holding the union of the matching document IDs.
    """
    return {
        doc_id
        for token in query.split()
        for doc_id in index.get(token, set())
    }

def process_query(query):
    """Retrieve the reviews matching *query*, ordered by TF-IDF relevance.

    The query is first expanded with ``simplified_expand_query`` and looked
    up in the module-level ``inverted_index``. If that finds nothing, the raw
    query is looked up instead. Candidates are then ordered by
    ``rank_documents`` using the original (unexpanded) query.

    NOTE: this name is defined again further down in the file, and that later
    definition is the one in effect at call time. This version returns the
    same empty-result shape as the later one.

    Args:
        query: Search string.

    Returns:
        The rows of ``reviews_df`` selected by position in ranked order, or
        an empty DataFrame with ``content`` and ``sentiment`` columns when
        nothing matches.
    """
    expanded_query = simplified_expand_query(query)
    document_ids = flexible_boolean_search(expanded_query, inverted_index)
    if not document_ids:
        print("No documents found with expanded query. Trying with the original query...")
        document_ids = flexible_boolean_search(query, inverted_index)

    ranked_doc_ids = rank_documents(document_ids, query, tfidf_matrix, vectorizer)
    if ranked_doc_ids:
        return reviews_df.iloc[ranked_doc_ids]
    # Give the empty result the columns callers select; a bare pd.DataFrame()
    # would make result[['content', 'sentiment']] raise KeyError.
    return pd.DataFrame(columns=['content', 'sentiment'])

def rank_documents(document_ids, query, tfidf_matrix, vectorizer):
    """Order candidate documents by cosine similarity to *query*.

    Args:
        document_ids: Iterable of integer row positions into *tfidf_matrix*.
        query: Search string, vectorised with *vectorizer*.
        tfidf_matrix: Document-term matrix with one row per document.
        vectorizer: Fitted vectorizer whose ``transform`` produces a query
            vector compatible with *tfidf_matrix*.

    Returns:
        The in-range document IDs, most similar first. Documents with equal
        scores are returned in ascending ID order. An empty list is returned
        when there are no in-range candidates.
    """
    n_docs = tfidf_matrix.shape[0]
    # Reject negative IDs as well as too-large ones: a negative index would
    # silently address a row counted from the end of the matrix.
    # Sorting the candidates makes the tie order below deterministic.
    candidates = sorted(
        doc_id for doc_id in document_ids if 0 <= doc_id < n_docs
    )
    if not candidates:
        # Nothing to score; skip vectorising the query altogether.
        return []

    query_vector = vectorizer.transform([query])
    # Score only the candidate rows instead of the whole corpus.
    scores = cosine_similarity(query_vector, tfidf_matrix[candidates])[0]

    # sorted() is stable, so ties keep the ascending-ID order set up above.
    ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
    return [doc_id for doc_id, _ in ranked]

# Adjust the process_query function to handle empty DataFrame
def process_query(query):
    """Find and rank the reviews relevant to *query*.

    Candidate documents come from an OR search of the module-level
    ``inverted_index`` using the synonym-expanded query, with a second
    attempt on the raw query if the first finds nothing. Ranking is done by
    ``rank_documents`` against the original query text.

    Returns the matching ``reviews_df`` rows (selected by position, best
    first), or an empty DataFrame with ``content`` and ``sentiment`` columns
    when no document matches.
    """
    matches = flexible_boolean_search(simplified_expand_query(query), inverted_index)
    if not matches:
        print("No documents found with expanded query. Trying with the original query...")
        matches = flexible_boolean_search(query, inverted_index)

    ordering = rank_documents(matches, query, tfidf_matrix, vectorizer)
    if not ordering:
        # Empty result still exposes the columns callers select.
        return pd.DataFrame(columns=['content', 'sentiment'])
    return reviews_df.iloc[ordering]

# Example run: retrieve and rank reviews for a sample query.
query = "great camera quality"
relevant_reviews = process_query(query)
# Show only the review text and sentiment columns; this selection raises
# KeyError if either column is absent from the result.
print(relevant_reviews[['content', 'sentiment']])