In [1]:
import pandas as pd

# Read the FAQ question/answer pairs into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this file exists at exactly this location.
qa_df = pd.read_csv(
    filepath_or_buffer='D:/um/nlp/FCSITCareerBuddy/dataset/faq_data.csv'
)

# Sanity check: show the first rows of the loaded frame
print(qa_df.head())

                                           question  \
0               What is intern, definition, meaning   
1                  Why do I need to intern, benefit   
2  Who do I need to seek approval for my internship   
3                    Who are department coordinator   
4                       Where can I intern location   

                                              answer  
0  An intern is a participant in an industrial tr...  
1  Internships provide invaluable industrial expe...  
2  Approval for internships is obtained from depa...  
3  - Artifical Intelligence: DR. LIEW WEI SHIUNG ...  
4  Interns can work at the physical location of a...  


In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Name of the pre-trained seq2seq checkpoint to load.
# Alternatives such as 't5-base' or 'facebook/bart-base' can be swapped in here.
model_name = "t5-small"

# The tokenizer and the model are loaded from the same checkpoint name so
# that they stay in sync.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download necessary resources for NLTK
nltk.download('punkt')
# Recent NLTK releases load the tokenizer data for word_tokenize from
# 'punkt_tab' rather than 'punkt'; fetching both keeps the cell working
# across NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Words that are in NLTK's English stop-word list by default but that we want
# to keep in the text (question words and temporal prepositions).
_WORDS_TO_KEEP = {'how', 'when', 'what', 'who', 'why', 'before', 'during', 'after'}

# Built once at cell execution instead of on every preprocess_text() call.
_FINAL_STOPWORDS = frozenset(stopwords.words('english')) - _WORDS_TO_KEEP

# Set of single punctuation characters, used to recognise punctuation tokens.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def preprocess_text(text):
    """Normalise a question string for TF-IDF matching.

    Steps, in order: lowercase, tokenize with NLTK ``word_tokenize``, drop
    tokens made up only of punctuation characters, drop English stop words
    (except the entries of ``_WORDS_TO_KEEP``), and lemmatize each remaining
    token with ``WordNetLemmatizer``.

    Args:
        text: The raw question string.

    Returns:
        The remaining lemmatized tokens joined by single spaces. This is an
        empty string when every token was filtered out.
    """
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Removing punctuation. A token is dropped when *all* of its characters
    # are punctuation, so multi-character tokens produced by the tokenizer
    # (e.g. "``", "''", "...") are removed as well. A plain
    # ``token in string.punctuation`` is a substring test against one long
    # string and lets those through.
    tokens = [
        token for token in tokens
        if not all(char in _PUNCTUATION_CHARS for char in token)
    ]

    # Removing stopwords
    tokens = [token for token in tokens if token not in _FINAL_STOPWORDS]

    # Lemmatization (tokens are already lowercase at this point)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into a single string
    return ' '.join(tokens)


vectorizer = TfidfVectorizer()

# Vectorize the questions in the dataset. The stored questions go through the
# same preprocessing as incoming queries, so both sides of the cosine
# similarity share one vocabulary (same stop-word removal, same lemmas).
# Row order matches qa_df, so positions in tfidf_matrix index qa_df directly.
tfidf_matrix = vectorizer.fit_transform(qa_df['question'].apply(preprocess_text))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SCSM11\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SCSM11\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SCSM11\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
def get_most_similar_question(input_question, threshold=0.001):
    """Look up the stored FAQ entry closest to ``input_question``.

    The question is projected into the fitted TF-IDF space of ``vectorizer``
    and compared against every row of ``tfidf_matrix`` by cosine similarity.

    Args:
        input_question: Query text to match.
        threshold: Minimum cosine similarity the best match must reach.

    Returns:
        The best-matching row of ``qa_df`` (selected with ``iloc``) when its
        score is at least ``threshold``. Otherwise a dict with an apology in
        ``"answer"`` and an empty ``"question"``. Both results can be indexed
        with ``['question']`` and ``['answer']``.
    """
    # Fallback returned when nothing in the dataset is close enough
    not_found = {
        "answer": "Sorry, the answer to that question is not available in the data. Please ask your internship coordinator for the answer.",
        "question": ""
    }

    # TF-IDF representation of the query (a single-row matrix)
    query_vector = vectorizer.transform([input_question])

    # One row of scores: query vs. each stored question
    scores = cosine_similarity(query_vector, tfidf_matrix)[0]

    # Position and value of the highest score
    best_index = scores.argmax()
    best_score = scores[best_index]

    if best_score < threshold:
        return not_found

    return qa_df.iloc[best_index]

In [27]:
# Example query, deliberately phrased differently from the stored question
question = "intern is what"

# Normalise the query before matching it against the dataset
preprocessed_question = preprocess_text(question)
print("Preprocessed Question:", preprocessed_question)

# Retrieve the closest FAQ entry and report it alongside the original input
most_similar_question = get_most_similar_question(preprocessed_question)
print(
    f"Input Question: {question}",
    f"Most Similar Question: {most_similar_question['question']}",
    f"Answer: {most_similar_question['answer']}\n",
    sep="\n",
)

Preprocessed Question: intern what
Input Question: intern is what
Most Similar Question: What is intern, definition, meaning
Answer: An intern is a participant in an industrial training course, typically involving 12 credit hours of coursework and practical experience within a company.

