## Imports

In [1]:
import pandas as pd
import numpy as np
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yousinator/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/yousinator/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/yousinator/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Load Data

In [2]:
# Load the student records; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests the CSV
# was written with its index included — confirm whether that column is wanted.
df = pd.read_csv("../data/Final_students_data.csv")

In [3]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,Name,Year,Programming Languages,Certifications,Project Count,Extracurricular Activities,Career Interest,Bio,Cluster
0,Laila Mansour,2,Python; R; SQL,Data Science Professional Certificate (IBM),4,AI Club Mentor,Data Scientist,Laila has always been fascinated by the patter...,1
1,Omar Al-Hassan,4,Java; C++; HTML,CompTIA Security+,5,Cybersecurity Competition Participant,Cybersecurity Analyst,A forth-year student at Princess Sumaya Univer...,2
2,Rania Khoury,1,C; Python; JavaScript,Java Programming (Oracle),3,Peer Programming Tutor,Software Developer,Rania entered the University of Jordan's Compu...,0
3,Khaled Abbas,2,SQL; Python; R,Data Science Professional Certificate (IBM),4,Volunteer Tutor,Data Scientist,Khaled has a knack for translating data into m...,1
4,Nour Husseini,3,Python; Java; SQL,CEH,5,Tech Community Member,Cybersecurity Analyst,"In her Third year at Petra University, Nour is...",2


In [4]:
# Corpus for the text miner: one free-text bio per student, in row order.
docs = df["Bio"].tolist()

## TextMiner Class

In [5]:
class TextMiner:
    """Rank documents against a free-text query with TF-IDF and cosine similarity.

    The documents are tokenized, lower-cased, stripped of English stop words and
    non-alphabetic tokens, and stemmed once at construction time. Each call to
    ``search`` applies the same preprocessing to the query, fits a TF-IDF model
    on the documents plus the query, and returns the rows of ``df`` whose
    similarity to the query is close enough to the best match.

    Parameters
    ----------
    docs : list of str
        Raw documents, one per row of ``df`` and in the same order.
    df : pandas.DataFrame
        Frame holding at least the ``Name`` and ``Bio`` columns used when
        formatting results.
    """

    # Returned by rank()/search() when no document shares a term with the query.
    NO_MATCH_MESSAGE = "There are no matching results"
    # A document is kept when its similarity >= best similarity / SIMILARITY_RATIO.
    SIMILARITY_RATIO = 1.5

    def __init__(self, docs, df):
        self.df = df
        self.docs = self._preprocess(docs)

    def _preprocess(self, docs):
        """Run the full cleaning pipeline and return new token lists."""
        docs = self.tokenize_docs(docs)
        docs = self.lowercase(docs)
        docs = self.remove_stopwords(docs)
        docs = self.remove_punctuation(docs)
        return self.stemmer(docs)

    def tokenize_docs(self, docs):
        """Split each document string into a list of tokens."""
        return [word_tokenize(doc) for doc in docs]

    def lowercase(self, docs):
        """Lower-case every token of every document."""
        return [[word.lower() for word in doc] for doc in docs]

    def remove_stopwords(self, docs):
        """Drop tokens that appear in NLTK's English stop-word list."""
        stop_words = set(stopwords.words('english'))
        return [[word for word in doc if word not in stop_words] for doc in docs]

    def remove_punctuation(self, docs):
        """Keep only purely alphabetic tokens (drops punctuation and digits)."""
        return [[word for word in doc if word.isalpha()] for doc in docs]

    def stemmer(self, docs):
        """Reduce every token to its Porter stem."""
        porter = PorterStemmer()
        return [[porter.stem(word) for word in doc] for doc in docs]

    def tf_idf(self, docs):
        """Build a TF-IDF matrix from token lists.

        Returns a DataFrame with one row per document (positional index) and
        one column per vocabulary term. The input list is not modified.
        """
        joined = [" ".join(doc) for doc in docs]

        vectorizer = TfidfVectorizer()
        weights = vectorizer.fit_transform(joined)
        return pd.DataFrame(
            weights.toarray(),
            columns=vectorizer.get_feature_names_out(),
            index=range(len(joined)),
        )

    def process_query(self, query):
        """Preprocess a one-element list holding the query string.

        Stores the processed list on ``self.query`` and returns the token list
        of the (single) query.
        """
        self.query = self._preprocess(query)
        return self.query[0]

    def rank(self, df):
        """Score every document against the query held in the last row of ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            TF-IDF matrix whose last row is the query and whose other rows line
            up with ``self.df``.

        Returns
        -------
        pandas.DataFrame or str
            A copy of the matching rows of ``self.df`` with a ``Similarity``
            column, sorted from best to worst; or ``NO_MATCH_MESSAGE`` when
            there are no documents or every similarity is zero.
        """
        matrix = df.to_numpy()
        doc_vectors, query_vector = matrix[:-1], matrix[-1:]
        if doc_vectors.shape[0] == 0:
            return self.NO_MATCH_MESSAGE

        # One pairwise call scores all documents against the query at once.
        similarities = cosine_similarity(doc_vectors, query_vector).ravel()
        if (similarities == 0).all():
            return self.NO_MATCH_MESSAGE

        sim_df = self.df.copy()
        sim_df["Similarity"] = similarities
        sim_df = sim_df[sim_df["Similarity"] != 0]

        # Keep everything within a fixed ratio of the best score.
        threshold = sim_df["Similarity"].max() / self.SIMILARITY_RATIO
        final = sim_df[sim_df["Similarity"] >= threshold]

        return final.sort_values(by="Similarity", ascending=False, kind='heapsort')

    def structure_output(self, final):
        """Format ranked rows as ``"[similarity] label- name: bio"`` strings."""
        output = []
        for label, row in final.iterrows():
            # iterrows() yields index *labels*, so look them up with .loc;
            # positional .iloc is only correct for a default 0..n-1 index.
            name = self.df.loc[label, "Name"]
            bio = self.df.loc[label, "Bio"]
            similarity = row["Similarity"]
            output.append(f"[{round(similarity,3)}] {label}- {name}: {bio}")
        return output

    def print_output(self, output):
        """Print each formatted result, or the no-match message as one line."""
        if isinstance(output, str):
            # Iterating a string would print it one character per line.
            print(output)
            return
        for result in output:
            print(result)

    def search(self, query):
        """Search the documents for ``query``.

        Returns
        -------
        tuple
            ``(output, ranks)``: the formatted result strings and the ranked
            DataFrame, or the no-match message in both positions when nothing
            matches.
        """
        query_tokens = self.process_query([query])
        # The query is appended as the last document so rank() can find it.
        corpus = self.docs + [query_tokens]
        ranks = self.rank(self.tf_idf(corpus))
        if isinstance(ranks, str):
            return self.NO_MATCH_MESSAGE, ranks
        return self.structure_output(ranks), ranks



## Task 1

### Query 1

In [6]:
# Build the miner once; later cells reuse this instance.
# Copies are passed so the notebook-level `docs` and `df` are left untouched.
miner = TextMiner(docs.copy(), df.copy())

output, ranks = miner.search("Web developer with computer Science degree")

miner.print_output(output)
# A successful search returns a DataFrame; rank() returns a string when nothing matches.
type(ranks)

[0.36] 199- Sara Matar: Sara is a second-year Computer Science student at Petra University. She is part of Women in Computing and aspires to be a Web Developer.
[0.329] 184- Layla Qasim: Layla is a first-year Computer Science student at the University of Jordan, focusing on front-end web development. She is a member of Women in Tech and aims to work as a Front-End Developer.
[0.323] 151- Yousef Talal: Yousef is a second-year Computer Science student at Princess Sumaya University for Technology and a certified web developer. He mentors students in tech skills and plans to be a Software Engineer, developing scalable web applications.
[0.306] 193- Nadia Fares: Nadia is a first-year Computer Science student at Petra University, focusing on front-end development. She is part of the Student Coding Society and aspires to become a Web Developer.
[0.299] 227- Lina Fares: Lina is a first-year Computer Science student at the University of Jordan with an interest in web development and a member of

pandas.core.frame.DataFrame

### Query 2

In [7]:
# Query 2: a short query combining a study year and a major.
output, ranks = miner.search("third-year Data Science student")

miner.print_output(output)

[0.633] 239- Rana Fadel: Rana is a senior Data Science student at Al Hussein Technical University, focusing on data science and a member of the Data Science Network.
[0.628] 203- Mahmoud Khaled: Mahmoud is a second-year Data Science student at Jordan University of Science & Technology. He is part of the Student Data Society and aims to work as a Data Scientist.
[0.597] 191- Salma Hasan: Salma is a third-year Data Science student at the University of Jordan. She is part of the Data Science and Innovation club and aims to work as a Data Scientist.
[0.583] 219- Tamer Fares: Tamer is a third-year Data Science student at Petra University with an interest in data engineering. He is part of the Data Science Society.
[0.582] 200- Ali Quraishi: Ali is a third-year Data Science student at the University of Jordan, specializing in data analytics. He is part of the Data Analysis Club and aims to become a Data Analyst.
[0.565] 183- Majid Rashid: Majid is a third-year Data Science student at Petra U

### Query 3

In [8]:
# Query 3: a study year combined with a university name.
output, ranks = miner.search("second-year student at Al Hussein Technical University")

miner.print_output(output)

[0.473] 165- Salem Hadi: Salem is a second-year Data Science student at Al Hussein Technical University. He is part of the AI Club and aims to work as a Data Analyst.
[0.467] 254- Dalia Samir: Dalia is a second-year Data Science student at Al Hussein Technical University with an interest in AI development, active in the Data Science Society.
[0.465] 195- Lina Jamal: Lina is a third-year Cyber Security student at Al Hussein Technical University, aiming to become a Cybersecurity Analyst. She is part of the Women in Security club.
[0.463] 215- Ibrahim Sameer: Ibrahim is a second-year Computer Science student at Al Hussein Technical University. He is interested in web development and active in the Code Club.
[0.462] 201- Rana Omar: Rana is a forth-year Cyber Security student at Al Hussein Technical University with an interest in security certifications. She aims to work as a Cybersecurity Specialist.
[0.457] 249- Fadi Ayman: Fadi is a second-year Computer Science student at Al Hussein Tech

### Query 4

In [9]:
# Query 4: a longer query mixing year, major, university, club and interest.
output, ranks = miner.search("second-year Data Science student at Al Hussein Technical University, in the Data Science Club, and interested in Data Visualization")

miner.print_output(output)

[0.663] 159- Nabil Hassan: Nabil is a second-year Data Science student at Al Hussein Technical University, active in the Data Enthusiasts Club. He is interested in data analytics and aims to work as a Data Analyst.
[0.64] 239- Rana Fadel: Rana is a senior Data Science student at Al Hussein Technical University, focusing on data science and a member of the Data Science Network.
[0.624] 162- Dalia Sayegh: Dalia is a second-year Data Science student at Petra University. She is active in the Data Visualization Society and wants to be a Data Scientist, focusing on data visualizations.
[0.604] 231- Salma Ziad: Salma is a second-year Data Science student at Al Hussein Technical University, interested in data analysis and part of the Data Insights Society.
[0.602] 254- Dalia Samir: Dalia is a second-year Data Science student at Al Hussein Technical University with an interest in AI development, active in the Data Science Society.
[0.597] 222- Alaa Rami: Alaa is a senior Data Science student at

### Query 5

In [10]:
# Query 5: the query text is an existing bio verbatim, so that bio scores 1.0
# and is the only result shown below.
output, ranks = miner.search("Nabil is a second-year Data Science student at Al Hussein Technical University, active in the Data Enthusiasts Club. He is interested in data analytics and aims to work as a Data Analyst.")

miner.print_output(output)

[1.0] 159- Nabil Hassan: Nabil is a second-year Data Science student at Al Hussein Technical University, active in the Data Enthusiasts Club. He is interested in data analytics and aims to work as a Data Analyst.


## Task 2

### User input

In [11]:
# Interactive search: read a query from the user and show the ranked results.
# NOTE(review): the miner was built from the "Bio" column, so this matches the
# text of bios rather than the "Certifications" column — confirm this is intended.
print("Hello User! Enter a certification name to get the associated students")
query = input("Enter a Certificate name: ")
print("Processing Query..")
print("Results:")
output, ranks = miner.search(query)
output

Hello User! Enter a certification name to get the associated students
Processing Query..
Results:


['[0.269] 239- Rana Fadel: Rana is a senior Data Science student at Al Hussein Technical University, focusing on data science and a member of the Data Science Network.',
 '[0.252] 191- Salma Hasan: Salma is a third-year Data Science student at the University of Jordan. She is part of the Data Science and Innovation club and aims to work as a Data Scientist.',
 '[0.248] 200- Ali Quraishi: Ali is a third-year Data Science student at the University of Jordan, specializing in data analytics. He is part of the Data Analysis Club and aims to become a Data Analyst.',
 '[0.247] 203- Mahmoud Khaled: Mahmoud is a second-year Data Science student at Jordan University of Science & Technology. He is part of the Student Data Society and aims to work as a Data Scientist.',
 '[0.246] 219- Tamer Fares: Tamer is a third-year Data Science student at Petra University with an interest in data engineering. He is part of the Data Science Society.',
 '[0.24] 183- Majid Rashid: Majid is a third-year Data Scien

### Test Case 1

In [12]:
def test_no_matching_bios():
    """Check that a query with no matching bio yields the no-match message."""
    query = "Quantum"
    output, ranks = miner.search(query)
    nothing_found = output == "There are no matching results"
    if not nothing_found:
        print(f"Test Failed: Expected 'There are no matching results', but got {ranks}")
        return
    print("Test Passed: 'There are no matching results' returned for query:", query)

test_no_matching_bios()

Test Passed: 'There are no matching results' returned for query: Quantum


### Test Case 2

In [13]:
def test_low_similarity_bios():
    """Check that a long bio-like query returns results and report each score.

    Every returned row is compared with the same cut-off that
    ``TextMiner.rank`` applies (best similarity / 1.5, inclusive).
    """
    query = "Khaled has a knack for translating data into meaningful insights. As a senior at Philadelphia University, he balances academics with his volunteer tutoring. Known for his analytical mindset and knowledge of SQL and Python, Khaled plans to further his expertise as a Data Scientist, focusing on innovative solutions for data challenges."
    output, ranks = miner.search(query)
    if output != "There are no matching results":
        print(f"Test Passed: Related bios returned for query '{query}'")
        # Compute the cut-off once; rank() keeps rows with similarity >= threshold.
        threshold = ranks["Similarity"].max() / 1.5
        for i, row in ranks.iterrows():
            if row["Similarity"] >= threshold:
                print(f"Bio similarity ({row['Similarity']}) is above threshold.")
            else:
                print(f"Bio similarity ({row['Similarity']}) is below threshold.")
    else:
        # Reaching this branch means nothing matched, which is the failure case.
        print(f"Test Failed: Expected related bios, but got '{ranks}'")

test_low_similarity_bios()

Test Passed: Related bios returned for query 'Khaled has a knack for translating data into meaningful insights. As a senior at Philadelphia University, he balances academics with his volunteer tutoring. Known for his analytical mindset and knowledge of SQL and Python, Khaled plans to further his expertise as a Data Scientist, focusing on innovative solutions for data challenges.'
Bio similarity (1.0) is above threshold.


### Test Case 3

In [14]:
def test_multiple_matching_bios():
    """Check that a broad query returns more than one matching bio."""
    query = "second-year student at Al Hussein Technical University"
    output, ranks = miner.search(query)

    # search() returns the no-match message as a string; its len() is > 1,
    # so the string case must be excluded before counting rows.
    if not isinstance(ranks, str) and len(ranks) > 1:
        print(f"Test Passed: Multiple matching bios returned for query '{query}'")
    else:
        print("Test Failed: Expected multiple matching bios but found fewer.")

test_multiple_matching_bios()

Test Passed: Multiple matching bios returned for query 'second-year student at Al Hussein Technical University'


### Test Case 4

In [15]:
def test_exact_match_bios():
    """Check that at least one returned bio contains the query text verbatim."""
    query = "Data Science"
    output, ranks = miner.search(query)

    if output == "There are no matching results":
        print("Test Failed: Expected related bios but got 'No related bios.'")
        return

    # Collect the bios that contain the query as a literal substring.
    exact_matches = []
    for _, row in ranks.iterrows():
        if query in row["Bio"]:
            exact_matches.append(row["Bio"])

    if exact_matches:
        print(f"Test Passed: Found {len(exact_matches)} exact matches for 'Data Science'.")
    else:
        print("Test Failed: No exact matches found for 'Data Science'.")

test_exact_match_bios()

Test Passed: Found 32 exact matches for 'Data Science'.


### Test Case 5

In [16]:
def test_ranking():
    """Check that the returned rows are ordered by descending similarity."""
    query = "third-year Data Science student"
    output, ranks = miner.search(query)

    similarities = []
    for _, bio in ranks.iterrows():
        similarities.append(bio["Similarity"])

    in_descending_order = sorted(similarities, reverse=True) == similarities
    if not in_descending_order:
        print("Test Failed: Bios are not sorted by similarity.")
    else:
        print("Bios are sorted correctly by similarity.")
test_ranking()



Bios are sorted correctly by similarity.
