In [7]:
import pandas as pd

# Load the CSV file
qa_df = pd.read_csv('D:/um/nlp/FCSITCareerBuddy/dataset/faq_data.csv')

# Preview the DataFrame
print(qa_df.head())

                                            question  \
0                                    What is intern?   
1                           Why do I need to intern?   
2  Who do I need to seek approval for my internship?   
3                                Where can I intern?   
4                    What is the duration of intern?   

                                              answer  
0  An intern is a participant in an industrial tr...  
1  Internships provide invaluable industrial expe...  
2  Approval for internships is obtained from depa...  
3  Interns can work at the physical location of a...  
4  Internships typically last between 24 to 26 we...  


In [12]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load your dataset
file_path = 'D:/um/nlp/FCSITCareerBuddy/dataset/faq_data.csv'
qa_df = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(qa_df.head())

# Load the T5 model and tokenizer
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)



                                            question  \
0                                    What is intern?   
1                           Why do I need to intern?   
2  Who do I need to seek approval for my internship?   
3                                Where can I intern?   
4                    What is the duration of intern?   

                                              answer  
0  An intern is a participant in an industrial tr...  
1  Internships provide invaluable industrial expe...  
2  Approval for internships is obtained from depa...  
3  Interns can work at the physical location of a...  
4  Internships typically last between 24 to 26 we...  




ImportError: 
T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
# Preprocess the dataset
def preprocess_function(examples):
    inputs = [f"paraphrase: {q1} </s> {q2}" for q1, q2 in zip(examples['sentence1'], examples['sentence2'])]
    targets = ["paraphrase" if label else "not paraphrase" for label in examples['label']]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine-tuned-t5")
tokenizer.save_pretrained("./fine-tuned-t5")

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer()

# Vectorize the questions in the dataset
tfidf_matrix = vectorizer.fit_transform(qa_df['question'])

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download necessary resources for NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    # Tokenization
    tokens = word_tokenize(text)
    
    # Removing punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Get the default stopwords list
    default_stopwords = set(stopwords.words('english'))

    # Words to remove from the stopwords list
    words_to_keep = {'how', 'when', 'what','who','why','before','during','after'}

# Customize stopwords by removing specific words
    final_stopwords = default_stopwords - words_to_keep

        # Removing stopwords
    tokens = [token for token in tokens if token not in final_stopwords]
    
    # Lemmatization
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]  # Apply lower() to each token
    
    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SCSM11\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SCSM11\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SCSM11\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
def get_most_similar_question(input_question, threshold=0.001):
    # Vectorize the input question
    input_vector = vectorizer.transform([input_question])
    
    # Calculate cosine similarity
    similarities = cosine_similarity(input_vector, tfidf_matrix)
    
    # Get the index of the most similar question
    most_similar_index = similarities.argmax()
    most_similar_score = similarities[0, most_similar_index]
    
    if most_similar_score < threshold:
        return {
            "answer": "Sorry, the answer to that question is not available in the data. Please ask your internship coordinator for the answer.",
            "question": ""
        }
    
    return qa_df.iloc[most_similar_index]

In [10]:
question = "intern is what"
preprocessed_question = preprocess_text(question)
print("Preprocessed Question:", preprocessed_question)

most_similar_question = get_most_similar_question(preprocessed_question )
print(f"Input Question: {question }")
print(f"Most Similar Question: {most_similar_question['question']}")
print(f"Answer: {most_similar_question['answer']}\n")

Preprocessed Question: intern what
Input Question: intern is what
Most Similar Question: What is intern?
Answer: An intern is a participant in an industrial training course, typically involving 12 credit hours of coursework and practical experience within a company.

