import pandas as pd

# Load the Excel file
file_path = 'university_faq.xlsx'  # Change this to your file path
df = pd.read_excel(file_path)

# Display the first few rows
df.head()


import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
from fuzzywuzzy import fuzz

# 1st

# Load and clean data
file_path = 'university_faq.xlsx'
df = pd.read_excel(file_path)
df.columns = df.columns.str.strip()

# Initialize TF-IDF for questions and keywords
question_vectorizer = TfidfVectorizer()
keyword_vectorizer = TfidfVectorizer()

question_vectors = question_vectorizer.fit_transform(df['Question'])
keyword_vectors = keyword_vectorizer.fit_transform(df['Keyword'])

def correct_spelling(text):
    return str(TextBlob(text).correct())

def get_response(user_input):
    # Check original input
    user_keyword_vector = keyword_vectorizer.transform([user_input])
    keyword_similarities = cosine_similarity(user_keyword_vector, keyword_vectors)
    keyword_match_idx = keyword_similarities.argmax()
    keyword_score = keyword_similarities[0][keyword_match_idx]

    user_question_vector = question_vectorizer.transform([user_input])
    question_similarities = cosine_similarity(user_question_vector, question_vectors)
    question_match_idx = question_similarities.argmax()
    question_score = question_similarities[0][question_match_idx]

    # Fuzzy matching fallback
    max_fuzzy_score = 0
    best_fuzzy_idx = -1
    for idx, keywords in enumerate(df['Keyword']):
        score = fuzz.partial_ratio(user_input.lower(), keywords.lower())
        if score > max_fuzzy_score:
            max_fuzzy_score = score
            best_fuzzy_idx = idx

    # Decision logic for original input
    if keyword_score > 0.4:
        return df.iloc[keyword_match_idx]['Answer']
    elif question_score > 0.5:
        return df.iloc[question_match_idx]['Answer']
    elif max_fuzzy_score > 75:
        return df.iloc[best_fuzzy_idx]['Answer']

    # If no match, try corrected spelling
    corrected_input = correct_spelling(user_input)
    if corrected_input != user_input:
        print(f"Did you mean: '{corrected_input}'?")
        return get_response(corrected_input)

    return "Sorry, I couldn't find a reliable answer. Please try rephrasing your question."

# 2nd (claud)

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
from fuzzywuzzy import fuzz
import re
import string

# First, download required NLTK resources properly
import nltk
try:
    # The correct resource is 'punkt' (not 'punkt_tab')
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize
    NLTK_AVAILABLE = True
except Exception as e:
    print(f"NLTK download failed: {e}. Falling back to simpler tokenization.")
    NLTK_AVAILABLE = False

# Load and clean data
file_path = 'university_faq.xlsx'
df = pd.read_excel(file_path)
df.columns = df.columns.str.strip()

# Fallback tokenization if NLTK isn't available
def simple_tokenize(text):
    text = text.lower()
    text = re.sub(f'[{string.punctuation}]', ' ', text)
    return text.split()

# Text preprocessing function with fallback options
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
        
    # Convert to lowercase and remove punctuation
    text = text.lower()
    text = re.sub(f'[{string.punctuation}]', ' ', text)
    
    if NLTK_AVAILABLE:
        try:
            # Tokenize and lemmatize
            lemmatizer = WordNetLemmatizer()
            tokens = word_tokenize(text)
            lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
            return ' '.join(lemmatized_tokens)
        except Exception as e:
            print(f"NLTK processing error: {e}. Using simple tokenization.")
    
    # Simple fallback if NLTK fails
    tokens = simple_tokenize(text)
    return ' '.join(tokens)

# Preprocess the dataset
df['Processed_Question'] = df['Question'].apply(preprocess_text)
df['Processed_Keyword'] = df['Keyword'].apply(preprocess_text)

# Create expanded dataset with variations (simplified)
def generate_variations(text):
    if not isinstance(text, str):
        return [text] if text else [""]
        
    variations = [text]
    
    # Simple plural/singular handling
    words = text.split()
    for i, word in enumerate(words):
        if word.endswith('s') and len(word) > 3:
            # Try singular form
            singular = word[:-1]
            new_words = words.copy()
            new_words[i] = singular
            variations.append(' '.join(new_words))
        else:
            # Try plural form
            plural = word + 's'
            new_words = words.copy()
            new_words[i] = plural
            variations.append(' '.join(new_words))
    
    # Add common misspellings for university terms
    common_misspellings = {
        'university': ['univercity', 'univarsity'],
        'admission': ['admision', 'admisssion'],
        'scholarship': ['scolarship', 'scholarshipp'],
        'registration': ['registraton', 'registeration'],
        'course': ['cours', 'coarse'],
        'professor': ['professer', 'proffesor'],
        'semester': ['semister', 'semestre'],
        # Add more domain-specific misspellings
    }
    
    for word, misspellings in common_misspellings.items():
        if word in text:
            for misspelling in misspellings:
                variations.append(text.replace(word, misspelling))
                
    return variations

# Initialize TF-IDF vectorizers
question_vectorizer = TfidfVectorizer(
    min_df=1, max_df=0.9,
    ngram_range=(1, 2),  # Include bigrams
    stop_words='english'
)

keyword_vectorizer = TfidfVectorizer(
    min_df=1, max_df=0.9,
    ngram_range=(1, 2),
    stop_words='english'
)

# Fit vectorizers
question_vectors = question_vectorizer.fit_transform(df['Processed_Question'])
keyword_vectors = keyword_vectorizer.fit_transform(df['Processed_Keyword'])

# Build domain dictionary for spelling correction
domain_dictionary = set()
for text in df['Question'].tolist() + df['Keyword'].tolist():
    if isinstance(text, str):
        tokens = simple_tokenize(text)
        domain_dictionary.update(tokens)

def correct_spelling_advanced(text):
    if not isinstance(text, str):
        return str(text) if text else ""
        
    # Simple spell correction with TextBlob
    try:
        corrected = str(TextBlob(text).correct())
        
        # Custom domain-specific corrections
        words = text.lower().split()
        corrected_words = corrected.lower().split()
        
        # Don't "correct" domain-specific terms
        for i, (orig_word, corr_word) in enumerate(zip(words, corrected_words)):
            if orig_word in domain_dictionary and orig_word != corr_word:
                corrected_words[i] = orig_word
        
        return ' '.join(corrected_words)
    except Exception as e:
        print(f"Spelling correction error: {e}")
        return text

# Enhanced response function
def get_response(user_input, confidence_threshold=0.6):
    if not isinstance(user_input, str) or not user_input.strip():
        return "I need a question to help you.", 0.0
    
    # Step 1: Preprocess input
    processed_input = preprocess_text(user_input)
    
    # Step 2: Try with original preprocessed input
    result = match_input(processed_input)
    if result['confidence'] >= confidence_threshold:
        return result['answer'], result['confidence']
    
    # Step 3: Try spelling correction if confidence is low
    corrected_input = correct_spelling_advanced(user_input)
    if corrected_input.lower() != user_input.lower():
        processed_corrected = preprocess_text(corrected_input)
        corrected_result = match_input(processed_corrected)
        if corrected_result['confidence'] > result['confidence']:
            return corrected_result['answer'], corrected_result['confidence'], corrected_input
    
    # Step 4: Try fuzzy matching as another fallback
    fuzzy_result = fuzzy_match(user_input)
    if fuzzy_result['confidence'] > result['confidence']:
        return fuzzy_result['answer'], fuzzy_result['confidence']
    
    # Return best result found, even if confidence is low
    if result['confidence'] > 0.15:  # Minimum threshold
        return result['answer'], result['confidence']
    
    return "I'm not sure I understand. Could you rephrase your question?", 0.0

# Matching function
def match_input(processed_input):
    try:
        # Vector matching
        user_question_vector = question_vectorizer.transform([processed_input])
        question_similarities = cosine_similarity(user_question_vector, question_vectors)
        question_match_idx = question_similarities.argmax()
        question_score = question_similarities[0][question_match_idx]
        
        user_keyword_vector = keyword_vectorizer.transform([processed_input])
        keyword_similarities = cosine_similarity(user_keyword_vector, keyword_vectors)
        keyword_match_idx = keyword_similarities.argmax()
        keyword_score = keyword_similarities[0][keyword_match_idx]
        
        # Determine best match
        if keyword_score > question_score:
            return {
                'answer': df.iloc[keyword_match_idx]['Answer'],
                'confidence': keyword_score,
                'match_type': 'keyword'
            }
        else:
            return {
                'answer': df.iloc[question_match_idx]['Answer'],
                'confidence': question_score,
                'match_type': 'question'
            }
    except Exception as e:
        print(f"Vector matching error: {e}")
        return {'answer': "", 'confidence': 0.0, 'match_type': 'error'}

# Fuzzy matching as fallback
def fuzzy_match(user_input):
    max_score = 0
    best_idx = -1
    
    # Try both question and keyword fuzzy matching
    for idx, row in df.iterrows():
        try:
            question = row.get('Question', '')
            keyword = row.get('Keyword', '')
            
            if isinstance(question, str) and isinstance(keyword, str):
                q_score = fuzz.token_set_ratio(user_input.lower(), question.lower())
                k_score = fuzz.token_set_ratio(user_input.lower(), keyword.lower())
                max_row_score = max(q_score, k_score) / 100  # Normalize to 0-1 scale
                
                if max_row_score > max_score:
                    max_score = max_row_score
                    best_idx = idx
        except Exception as e:
            print(f"Fuzzy matching error on row {idx}: {e}")
    
    if best_idx >= 0:
        return {
            'answer': df.iloc[best_idx]['Answer'],
            'confidence': max_score,
            'match_type': 'fuzzy'
        }
    return {'answer': "", 'confidence': 0.0, 'match_type': 'fuzzy_failed'}

# Example usage
def chatbot_response(user_input):
    try:
        result = get_response(user_input)
        
        # Unpack response based on length
        if len(result) == 3:
            answer, confidence, corrected = result
            if confidence > 0.6:
                return f"Did you mean: '{corrected}'?\n\n{answer}"
            elif confidence > 0.3:
                return f"Did you mean: '{corrected}'?\n\nI think you're asking about: {answer}"
            else:
                return f"Did you mean: '{corrected}'?\n\nI'm not entirely sure, but this might help: {answer}"
        else:
            answer, confidence = result
            if confidence > 0.6:
                return answer
            elif confidence > 0.3:
                return f"I think you're asking about: {answer}"
            else:
                return f"I'm not entirely sure, but this might help: {answer}"
    except Exception as e:
        return f"Sorry, I encountered an error processing your question. Please try again with different wording."

# Test the chatbot with error handling
def run_test():
    print("University FAQ Chatbot (type 'exit' to quit)")
    while True:
        try:
            user_input = input("\nYour question: ")
            if user_input.lower() == 'exit':
                print("Goodbye!")
                break
            response = chatbot_response(user_input)
            print("\nChatbot:", response)
        except Exception as e:
            print(f"Error: {e}")
            print("Let's try again.")

# Uncomment to run interactive test
# run_test()

NLTK processing error: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\arukh/nltk_data'
    - 'C:\\Python312\\nltk_data'
    - 'C:\\Python312\\share\\nltk_data'
    - 'C:\\Python312\\lib\\nltk_data'
    - 'C:\\Users\\arukh\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
. Using simple tokenization.
NLTK processing error: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punk

In [5]:
# Chat loop
while True:
    user_input = input("Ask me something (type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        break
    response = get_response(user_input)
    print(response)

Ask me something (type 'exit' to quit):  cse department pograms


The answer to your question 'What are the opportunities for CSE students to collaborate with the electronics department at MUJ?' is currently unavailable. Please refer to official sources for accurate information.


Ask me something (type 'exit' to quit):  cse department programs


xxxyyy


KeyboardInterrupt: Interrupted by user

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
from symspellpy import SymSpell, Verbosity
import re
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load data
file_path = 'university_faq.xlsx'
df = pd.read_excel(file_path)
df.columns = df.columns.str.strip()

# Initialize SymSpell with a pre-built dictionary for better performance
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Load the English frequency dictionary
dictionary_file = "./en-80k.txt"
if not sym_spell.load_dictionary(dictionary_file, 0, 1):
    raise FileNotFoundError(f"Dictionary file not found at {dictionary_file}")

# # Load your custom dictionary
# if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
#     raise FileNotFoundError(f"Custom dictionary file not found at {dictionary_path}")

# Add keywords dynamically (case-insensitive)
keywords = set()
for kw in df['Keyword']:
    if pd.isna(kw):
        logging.warning("Encountered empty keyword entry, skipping.")
        continue
    words = re.findall(r'\b\w+\b', str(kw).lower())
    for word in words:
        keywords.add(word)

# Add extracted keywords to SymSpell dictionary
for word in keywords:
    sym_spell.create_dictionary_entry(word, 1)

# Initialize TF-IDF for questions and keywords
question_vectorizer = TfidfVectorizer()
keyword_vectorizer = TfidfVectorizer()

question_vectors = question_vectorizer.fit_transform(df['Question'])
keyword_vectors = keyword_vectorizer.fit_transform(df['Keyword'])

def correct_spelling(text):
    suggestions = sym_spell.lookup(text, Verbosity.CLOSEST, max_edit_distance=2)
    if suggestions:
        corrected = suggestions[0].term
        if corrected != text:
            logging.info(f"Corrected spelling: '{text}' -> '{corrected}'")
        return corrected
    return text

def get_response(user_input):
    user_input = correct_spelling(user_input)

    # Calculate keyword similarity
    user_keyword_vector = keyword_vectorizer.transform([user_input])
    keyword_similarities = cosine_similarity(user_keyword_vector, keyword_vectors)
    keyword_match_idx = keyword_similarities.argmax()
    keyword_score = keyword_similarities[0][keyword_match_idx]

    # Calculate question similarity
    user_question_vector = question_vectorizer.transform([user_input])
    question_similarities = cosine_similarity(user_question_vector, question_vectors)
    question_match_idx = question_similarities.argmax()
    question_score = question_similarities[0][question_match_idx]

    # Fuzzy matching
    max_fuzzy_score = 0
    best_fuzzy_idx = -1
    for idx, keywords in enumerate(df['Keyword']):
        if pd.isna(keywords):
            continue
        score = fuzz.partial_ratio(user_input.lower(), str(keywords).lower())
        if score > max_fuzzy_score:
            max_fuzzy_score = score
            best_fuzzy_idx = idx

    # Decision logic
    if keyword_score > 0.4:
        return df.iloc[keyword_match_idx]['Answer']
    elif question_score > 0.5:
        return df.iloc[question_match_idx]['Answer']
    elif max_fuzzy_score > 75:
        return df.iloc[best_fuzzy_idx]['Answer']

    return "Sorry, I couldn't find a reliable answer. Please try rephrasing your question."

# Chat loop
while True:
    user_input = input("Ask me something (type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        break
    response = get_response(user_input)
    print(response)
    


In [None]:
from symspellpy import SymSpell, Verbosity

# Initialize SymSpell
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Load the base dictionary (already existing)
dictionary_path = "en-80k.txt"
if not sym_spell.load_dictionary(dictionary_path, 0, 1):
    raise FileNotFoundError(f"Base dictionary not found at {dictionary_path}")

# Create or load the custom dictionary
custom_dict_path = "university_custom_dict.txt"
custom_keywords = {
    "student strength": 100,
    "cse clubs": 90,
    "faculty details": 85,
    "admission process": 80,
    "library hours": 75,
    "event schedule": 70,
    "sports facilities": 65,
    "research opportunities": 60,
    "placement statistics": 55,
    "academic calendar": 50
}

# Write these keywords to the custom dictionary file
with open(custom_dict_path, "w") as f:
    for term, freq in custom_keywords.items():
        f.write(f"{term}\t{freq}\n")

# Load the custom dictionary
if not sym_spell.load_dictionary(custom_dict_path, term_index=0, count_index=1):
    raise FileNotFoundError(f"Custom dictionary file not found at {custom_dict_path}")

# Chatbot interaction
while True:
    user_input = input("Ask me something (type 'exit' to quit): ").strip().lower()
    if user_input == "exit":
        break

    suggestions = sym_spell.lookup(user_input, Verbosity.CLOSEST, max_edit_distance=2)
    if suggestions:
        corrected = suggestions[0].term
        print(f"Did you mean: {corrected}?")
    else:
        print("Sorry, I couldn't find any information about that.")


Ask me something (type 'exit' to quit):  StudDNT Strenht


Sorry, I couldn't find any information about that.


Ask me something (type 'exit' to quit):  cse department program


Sorry, I couldn't find any information about that.
