# Text Preprocessing

In [40]:
import pandas as pd
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import nltk
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

# Load the data
reviews = pd.read_csv('ai_data.csv')

# Drop the 'ID' column
reviews = reviews.drop(columns=['ID'])

# Function to preprocess text
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # Remove special characters and line breaks
    text = re.sub(r'([^\s\w_])+', ' ', text)
    text = re.sub(r'[\n\r]', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', ' ', text)
    return text

# Apply preprocessing to 'Question' column
reviews['Question_processed'] = reviews['Question'].apply(preprocess_text)

# Function to tokenize text
def tokenize_text(text):
    text = text.lower()
    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
    words = tokenizer.tokenize(text)
    return words

# Function to remove stopwords
def remove_stopwords(words):
    stop = set(stopwords.words('english'))
    filtered_words = [w for w in words if w not in stop]
    return filtered_words

# Function to perform lemmatization
def lemmatize_text(words):
    lemmatizer = WordNetLemmatizer()
    lem_words = [lemmatizer.lemmatize(word, get_part_of_speech_tags(word)) for word in words]
    return lem_words

# Function to get Part of Speech tags
def get_part_of_speech_tags(word):
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    tag = nltk.pos_tag([word])[0][1][0].upper()
    return tag_dict.get(tag, wordnet.NOUN)

# Preprocess 'Question' column
reviews['Question_processed'] = reviews['Question_processed'].apply(tokenize_text)
reviews['Question_processed'] = reviews['Question_processed'].apply(remove_stopwords)
reviews['Question_processed'] = reviews['Question_processed'].apply(lemmatize_text)

# Convert tokens back to string
reviews['Question_processed'] = reviews['Question_processed'].apply(lambda x: ' '.join(x))

reviews.head(4)


Unnamed: 0,Question,Answer,Question_processed
0,What is artificial intelligence?,Artificial intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think and learn like humans.,artificial intelligence
1,What are the two types of AI?,The two types of AI are narrow AI (also known as weak AI) and general AI (also known as strong AI).,two type ai
2,What is narrow AI?,Narrow AI is AI that is designed and trained for a particular task or set of tasks. It operates within a limited context and cannot perform tasks outside.,narrow ai
3,What is general AI?,"General AI is AI that has the ability to understand, learn, and apply its intelligence across a wide range of tasks, similar to human intelligence.",general ai


# Topic Name extraction

In [41]:

# Function to convert text to words
def text_to_words(texts):
    return [[word for word in simple_preprocess(str(text), deacc=True)]
            for text in texts]

# Function to extract topics using LDA model and return topic numbers
def extract_topics(text):
    text_words = text_to_words([text])
    # Create corpus
    corpus_vec = [dict_word.doc2bow(words) for words in text_words]
    # Get topic distribution
    topics = lda_model.get_document_topics(corpus_vec[0])
    #print("Topics inside extract topics is **************", topics)
    # Extract most probable topic
    topic_num = max(topics, key=lambda x: x[1])[0]
    return topic_num


# Function to get topic names based on representative words
def infer_topic_names(lda_model, dict_word, num_words=3):
    topic_names = {}
    for i in range(lda_model.num_topics):
        #print("i is", i)
        words = lda_model.show_topic(i, topn=num_words)
        #print("words is", words)
        topic_names[i] = ', '.join([word for word, _ in words])
        #print("topic names ",topic_names)
    return topic_names

# Training LDA model and getting dictionary

In [42]:

text = reviews.Question.values.tolist()
text_words = text_to_words(text)
dict_word = corpora.Dictionary(text_words)
dict_word.filter_extremes(no_below=2, no_above=0.1, keep_n=2000)
corpus_vec = [dict_word.doc2bow(text) for text in text_words]

# Train LDA model
lda_model = gensim.models.LdaModel(corpus=corpus_vec, id2word=dict_word, num_topics=3, iterations=20)

lda_model.save('lda_model')


# Displaying Output for trained LDA model

In [43]:
# Get inferred topic names
topic_names = infer_topic_names(lda_model, dict_word)

# Apply topic extraction function to each question
reviews['Topic_Num'] = reviews['Question_processed'].apply(extract_topics)

# Map topic numbers to inferred topic names
reviews['Topic'] = reviews['Topic_Num'].map(topic_names)

# Drop the 'Topic_Num' column if needed
reviews = reviews.drop(columns=['Topic_Num'])
pd.set_option('display.max_colwidth', 300)
reviews.head(20)

Unnamed: 0,Question,Answer,Question_processed,Topic
0,What is artificial intelligence?,Artificial intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think and learn like humans.,artificial intelligence,"purpose, regularization, intelligence"
1,What are the two types of AI?,The two types of AI are narrow AI (also known as weak AI) and general AI (also known as strong AI).,two type ai,"explain, challenges, concept"
2,What is narrow AI?,Narrow AI is AI that is designed and trained for a particular task or set of tasks. It operates within a limited context and cannot perform tasks outside.,narrow ai,"it, how, important"
3,What is general AI?,"General AI is AI that has the ability to understand, learn, and apply its intelligence across a wide range of tasks, similar to human intelligence.",general ai,"explain, challenges, concept"
4,What are some examples of narrow AI?,"Some examples of narrow AI include virtual personal assistants (e.g., Siri, Alexa), recommendation systems, and image recognition software.",example narrow ai,"it, how, important"
5,What are some challenges in AI?,"Some challenges in AI include ethical considerations, job displacement, and ensuring AI systems are transparent and accountable.",challenge ai,"explain, challenges, concept"
6,What is machine learning?,Machine learning is a subset of AI that allows computers to learn from data and improve over time without being explicitly programmed.,machine learn,"explain, challenges, concept"
7,What are some popular machine learning algorithms?,"Some popular machine learning algorithms include linear regression, logistic regression, decision trees, random forests, support vector machines (SVM), and neural networks.",popular machine learn algorithm,"purpose, regularization, intelligence"
8,What is deep learning?,Deep learning is a subset of machine learning that uses neural networks with many layers to learn complex patterns in large amounts of data.,deep learn,"purpose, regularization, intelligence"
9,What are some applications of AI?,"Some applications of AI include natural language processing (NLP), computer vision, autonomous vehicles, healthcare, and finance.",application ai,"explain, challenges, concept"


# Testing the user data

In [67]:
# Load the saved LDA model
lda_model = gensim.models.LdaModel.load('lda_model')

# Load the new data
user_input = "artificial intelligence?"

new_data = pd.DataFrame(data=[user_input], columns=['Question'])
new_data.head()


Unnamed: 0,Question
0,artificial intelligence?


In [68]:
# Preprocess the new data
# (Assuming you have defined preprocessing functions similar to the ones used for training)

# Tokenize, preprocess, and vectorize the new data using the same dictionary as used during training
new_data['Question_processed'] = new_data['Question'].apply(preprocess_text)
new_data['Question_processed'] = new_data['Question_processed'].apply(tokenize_text)
new_data['Question_processed'] = new_data['Question_processed'].apply(remove_stopwords)
new_data['Question_processed'] = new_data['Question_processed'].apply(lemmatize_text)
new_data['Question_processed'] = new_data['Question_processed'].apply(lambda x: ' '.join(x))

# Convert new data to bag-of-words representation using the same dictionary
new_text = new_data.Question.values.tolist()
new_text_words = text_to_words(new_text)
new_corpus_vec = [dict_word.doc2bow(text) for text in new_text_words]

# Infer topics for the new data
new_topics = [max(lda_model.get_document_topics(doc), key=lambda x: x[1])[0] for doc in new_corpus_vec]

# Interpret the inferred topics for the new data
new_data['Topic_Num'] = new_topics
new_data['Topic'] = new_data['Topic_Num'].map(topic_names)

user_output = new_data[['Question','Question_processed','Topic']]
# Display DataFrame with inferred topics for new data
user_output.head()

Unnamed: 0,Question,Question_processed,Topic
0,artificial intelligence?,artificial intelligence,"purpose, regularization, intelligence"


# Testing Similarity

In [72]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Function to compute cosine similarity
def compute_cosine_similarity(query_vector, corpus_vectors):
    similarities = cosine_similarity([query_vector], corpus_vectors)
    return similarities.flatten()

# Filter reviews based on the inferred topic from new_data
filtered_reviews = reviews[reviews['Topic'] == user_output['Topic'].iloc[0]]

# Convert new_data question to bag-of-words representation using the same dictionary
new_question_bow = dict_word.doc2bow(user_output['Question_processed'].iloc[0].split())

# Convert filtered reviews questions to bag-of-words representation using the same dictionary
filtered_reviews_bow = [dict_word.doc2bow(text.split()) for text in filtered_reviews['Question_processed']]



In [73]:
# Get vocabulary size
vocab_size = len(dict_word)

# Convert bag-of-words representations to dense vectors
new_question_vector = np.zeros(vocab_size)
for idx, count in new_question_bow:
    new_question_vector[idx] = count

filtered_reviews_vectors = np.zeros((len(filtered_reviews_bow), vocab_size))
for i, review_bow in enumerate(filtered_reviews_bow):
    for idx, count in review_bow:
        filtered_reviews_vectors[i][idx] = count

In [74]:
# Compute cosine similarity between new question and filtered reviews questions
cosine_similarities = compute_cosine_similarity(new_question_vector, filtered_reviews_vectors)

# Find the index of the review with the highest cosine similarity
max_similarity_index = np.argmax(cosine_similarities)

# Get the answer corresponding to the question with the highest similarity
best_matched_answer = filtered_reviews.iloc[max_similarity_index]['Answer']

print("Best matched answer:", best_matched_answer)

Best matched answer: Artificial intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think and learn like humans.
