In [1]:
from docx import Document
import pandas as pd

In [2]:
# Absolute, machine-specific location of the source Word document; adjust before running elsewhere.
docx_file_path = 'C:/Users/abc/OneDrive/Desktop/just1.docx'
# Open the document; `doc.paragraphs` is iterated by the text-extraction step below.
doc = Document(docx_file_path)

In [3]:
# Dump the document's paragraphs into a plain-text file, one paragraph per line
txt_file_path = 'C:/Users/abc/OneDrive/Desktop/just2.txt'
with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
    # Each paragraph's text is terminated with a newline, exactly one line per paragraph
    txt_file.writelines(paragraph.text + '\n' for paragraph in doc.paragraphs)

print(f'Text extracted from {docx_file_path} and saved to {txt_file_path}')


Text extracted from C:/Users/abc/OneDrive/Desktop/just1.docx and saved to C:/Users/abc/OneDrive/Desktop/just2.txt


In [11]:
# Accumulators for the three fields of each entry
questions = []
answers = []
tags = []

# Load the extracted text, keeping only lines that are non-empty after stripping whitespace
with open(txt_file_path, 'r', encoding='utf-8') as txt_file:
    lines = [line.strip() for line in txt_file if line.strip()]

    # The parser relies on a strict repeating layout: question, answer, tag.
    # Walk the lines three at a time and report a trailing group that is incomplete.
    for start in range(0, len(lines), 3):
        if start + 2 >= len(lines):
            # Fewer than three lines remain, so this group cannot form a full entry
            print(f"Skipping incomplete set at index {start}.")
            continue
        questions.append(lines[start])
        answers.append(lines[start + 1])
        tags.append(lines[start + 2])
            
def remove_numbers(text):
    """Return ``text`` with every digit character removed.

    All non-digit characters are kept in their original order; characters are
    classified with ``str.isdigit``.
    """
    kept = []
    for char in text:
        if char.isdigit():
            continue
        kept.append(char)
    return ''.join(kept)
# Build the DataFrame from the parsed lists first. The original cell applied
# remove_numbers to df['Question'] before `df` was created, which raises
# NameError on a fresh kernel and only worked with a leftover `df` in memory.
df = pd.DataFrame({'Question': questions, 'Answer': answers, 'Tag': tags})
# Strip the digits (the leading enumeration) from every question.
# NOTE(review): remove_numbers drops digits only, so the ". " that followed the
# number is still at the start of each question - confirm whether that is intended.
df['Question'] = df['Question'].apply(remove_numbers)
df.head()

Unnamed: 0,Question,Answer,Tag
0,. Is it possible to register for classes if I...,"Yes, you can still register as a non-degree st...",Education
1,. How can I register for classes if I've rece...,You can register online through myIIT by selec...,Education
2,. What is myIIT and how do I access it?,"myIIT is Illinois Tech's portal for students, ...","Technology, Education"
3,. Who should I contact for registration advice?,"For registration advice, contact the Office of...","Education, Administration"
4,. How can I lift a registration hold?,Identify the type of hold by reviewing the Reg...,"Education, Administration"


In [12]:
# Hand-written paraphrases of the parsed questions, listed in the same order as the rows of `df`.
# There are 15 entries; the list is assigned as a whole column below, so its length has to
# match the number of rows in `df`.
new_column_values = [
    "Can I still enroll in classes if I missed the application deadline?",
    "How do I enroll in classes after receiving a late admission?",
    "What is the process for accessing the university's online portal, and how do I obtain my login credentials?",
    "Who can provide guidance on registration-related queries?",
    "How can I resolve a hold on my registration?",
    "What steps should I take to modify my course registration, including adding or dropping courses?",
    "What options are available for managing tuition payments for graduate studies?",
    "Is financial assistance accessible to graduate students?",
    "How does one determine full-time status for graduate studies and its implications on financial aid eligibility?",
    "Does the university offer support for career placement and planning?",
    "How can I find part-time on-campus job opportunities?",
    "What academic policies should I be aware of as a graduate student?",
    "How long do I have to complete a degree program?",
    "Can I transfer credits from previous academic work?",
    "How do I apply for graduation?"
]

# Store the paraphrases next to the original questions as column 'similarQ1', then preview the frame
df['similarQ1'] = new_column_values
df.head()


Unnamed: 0,Question,Answer,Tag,similarQ1
0,. Is it possible to register for classes if I...,"Yes, you can still register as a non-degree st...",Education,Can I still enroll in classes if I missed the ...
1,. How can I register for classes if I've rece...,You can register online through myIIT by selec...,Education,How do I enroll in classes after receiving a l...
2,. What is myIIT and how do I access it?,"myIIT is Illinois Tech's portal for students, ...","Technology, Education",What is the process for accessing the universi...
3,. Who should I contact for registration advice?,"For registration advice, contact the Office of...","Education, Administration",Who can provide guidance on registration-relat...
4,. How can I lift a registration hold?,Identify the type of hold by reviewing the Reg...,"Education, Administration",How can I resolve a hold on my registration?


In [26]:
import numpy as np

# `hub` was used here without ever being imported in the notebook, so this cell
# raised NameError on a fresh kernel; import TensorFlow Hub explicitly.
import tensorflow_hub as hub

# Load the Universal Sentence Encoder module (fetched from TF Hub by URL).
# `embed` is called by encode_sentence with a list of sentences.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
embed = hub.load(module_url)

# Encode one sentence and return its embedding as a flat vector
def encode_sentence(sentence):
    """Embed a single sentence and return the result as a 1-D numpy array.

    The sentence is wrapped in a one-element list before being passed to the
    module-level ``embed`` callable; the returned batch is converted to a
    numpy array and flattened.
    """
    batch_embedding = embed([sentence])
    return np.array(batch_embedding).flatten()

# Embed both question columns, storing one vector per row:
#   'Question'  -> 'USE_vectors'
#   'similarQ1' -> 'USE_vectors1'
for text_column, vector_column in (('Question', 'USE_vectors'), ('similarQ1', 'USE_vectors1')):
    df[vector_column] = df[text_column].apply(encode_sentence)

# Preview the DataFrame, which now carries both embedding columns
df.head()


Unnamed: 0,Question,Answer,Tag,similarQ1,USE_vectors,USE_vectors1
0,. Is it possible to register for classes if I...,"Yes, you can still register as a non-degree st...",Education,Can I still enroll in classes if I missed the ...,"[0.008187435, 0.07064907, -0.027008953, -0.035...","[0.048688453, 0.04680164, -0.00091497676, -0.0..."
1,. How can I register for classes if I've rece...,You can register online through myIIT by selec...,Education,How do I enroll in classes after receiving a l...,"[0.0075786677, 0.083455324, -0.041204434, -0.0...","[0.06150413, 0.07692546, -0.03356737, -0.06582..."
2,. What is myIIT and how do I access it?,"myIIT is Illinois Tech's portal for students, ...","Technology, Education",What is the process for accessing the universi...,"[-0.0522011, 0.093501106, 0.0103371255, 0.0734...","[-0.006536571, 0.025202103, 0.014788504, -0.01..."
3,. Who should I contact for registration advice?,"For registration advice, contact the Office of...","Education, Administration",Who can provide guidance on registration-relat...,"[-0.06748423, -0.07666019, 0.006175797, 0.0080...","[-0.023623955, -0.03557584, 0.011671241, 0.025..."
4,. How can I lift a registration hold?,Identify the type of hold by reviewing the Reg...,"Education, Administration",How can I resolve a hold on my registration?,"[-0.00717575, -0.09230306, -0.038506247, 0.026...","[-0.02937295, -0.08214458, -0.033817723, -0.00..."


In [21]:
from sklearn.metrics.pairwise import cosine_similarity

# Stack the per-row embeddings into 2-D matrices with one row per question.
# The original iterated over each stored vector and called `.numpy()` on its
# elements; with the flattened numpy arrays produced by encode_sentence those
# elements are numpy scalars, which have no `.numpy()` method.
# np.asarray(...).reshape(1, -1) turns each stored embedding into a single row,
# whether it is a flat vector or a one-row batch.
USE_vectors_flat = np.vstack([np.asarray(vec).reshape(1, -1) for vec in df['USE_vectors']])
USE_vectors1_flat = np.vstack([np.asarray(vec).reshape(1, -1) for vec in df['USE_vectors1']])

# Pairwise cosine similarity: entry [i, j] compares Question i with similarQ1 j,
# so the score of each question against its own paraphrase is on the diagonal.
similarities = cosine_similarity(USE_vectors_flat, USE_vectors1_flat)

# Print the cosine similarity matrix
print(similarities)


[[0.8370929  0.75848764 0.29173374 0.07032148 0.4301744  0.41000473
  0.26744795 0.40035167 0.32672834 0.2744723  0.33734936 0.21484141
  0.31958973 0.4911772  0.38174474]
 [0.56726176 0.744249   0.5318858  0.12805827 0.52866346 0.47430924
  0.31355572 0.36282086 0.3413555  0.24427423 0.3507126  0.31073266
  0.27786985 0.31443337 0.5556874 ]
 [0.08283404 0.12407808 0.4016714  0.22640897 0.18774122 0.0916725
  0.0863776  0.17796247 0.19980797 0.05651904 0.14066006 0.18745027
  0.10939853 0.04836256 0.24453472]
 [0.25297135 0.3698479  0.29641518 0.4106421  0.64195955 0.43667614
  0.21774921 0.16340032 0.15726201 0.09156989 0.19881973 0.21735194
  0.17194805 0.11473186 0.32380965]
 [0.24925134 0.3542298  0.26030695 0.16360605 0.70473623 0.40842062
  0.12946092 0.13041237 0.1140867  0.0066739  0.11623564 0.118486
  0.13472018 0.12575659 0.29631877]
 [0.5344184  0.6448054  0.40927246 0.12901235 0.4888907  0.77547204
  0.36434704 0.21810351 0.31576186 0.23301417 0.29137155 0.29163468
  0.354