## Importing libraries

In [16]:
import pandas as pd
import numpy as np

In [17]:
# Load the raw dataset; the path is relative to the notebook's working directory.
raw_csv_path = '../data/raw/resume_data_1.csv'
df = pd.read_csv(raw_csv_path)

### Dropping irrelevant columns:

In [18]:
# Columns treated as irrelevant for the matching steps below.
irrelevant_columns = [
    "responsibilities",
    "address",
    "locations",
    "extra_curricular_organization_links",
    "online_links",
    "issue_dates",
    "expiry_dates",
    "company_urls",
]
df = df.drop(columns=irrelevant_columns)

In [19]:
# Summarise the remaining columns: per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9544 entries, 0 to 9543
Data columns (total 27 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   career_objective                     4740 non-null   object 
 1   skills                               9488 non-null   object 
 2   educational_institution_name         9460 non-null   object 
 3   degree_names                         9460 non-null   object 
 4   passing_years                        9460 non-null   object 
 5   educational_results                  9460 non-null   object 
 6   result_types                         9460 non-null   object 
 7   major_field_of_studies               9460 non-null   object 
 8   professional_company_names           9460 non-null   object 
 9   start_dates                          9460 non-null   object 
 10  end_dates                            9460 non-null   object 
 11  related_skils_in_job          

Separating features into resume-related features and job-related features:

In [20]:
# Positional split: the first 20 columns go to `resumes`, the rest to `jobs`.
n_resume_columns = 20
resumes, jobs = df.iloc[:, :n_resume_columns], df.iloc[:, n_resume_columns:]

In [21]:
# Keep one row per distinct resume. reset_index() is called without drop=True,
# so the previous row labels are retained as an extra leading "index" column
# (which is why the shape below reports 21 columns rather than 20). A later
# cell strips that value from the front of the joined strings.
resumes = resumes.drop_duplicates().reset_index()
resumes.shape

(344, 21)

In [22]:
# The matched score is removed from the job-side frame before de-duplication.
jobs = jobs.drop("matched_score", axis="columns")

In [23]:
# Keep one row per distinct job. As reset_index() has no drop=True, the old
# row labels are kept as a leading "index" column in the result.
jobs = jobs.drop_duplicates().reset_index()

In [24]:
# (rows, columns) of the de-duplicated jobs frame.
jobs.shape

(28, 7)

We have 28 unique jobs and 344 unique resumes.

### Creating a long format string for each resume and job - to use for similarity matching:

In [25]:
def _join_row(row):
    """Return every field of ``row``, cast to text, joined by single spaces."""
    return " ".join(row.astype(str).values)


# One long text string per resume row (all columns, including the index column).
resume_strings = resumes.apply(_join_row, axis=1)
resume_strings.head()

0    0 Big data analytics working and database ware...
1    1 Fresher looking to join as a data analyst an...
2    2 nan ['Software Development', 'Machine Learni...
3    3 To obtain a position in a fast-paced busines...
4    4 Professional accountant with an outstanding ...
dtype: object

In [26]:
# One space-separated text string per job row; each field is cast to str first.
job_strings = jobs.apply(
    lambda job_row: " ".join(job_row.astype(str).tolist()),
    axis=1,
)
job_strings.head()

0    0 Senior Software Engineer B.Sc in Computer Sc...
1    1 Machine Learning (ML) Engineer M.Sc in Compu...
2    2 Executive/ Senior Executive- Trade Marketing...
3    3 Business Development Executive Bachelor/Hono...
4    4 Senior iOS Engineer Bachelor of Science (BSc...
dtype: object

Removing the leading index value (introduced by `reset_index()`) from the front of each string:

In [27]:
# Strip the leading index value and its separating space from each string.
# A fixed two-character slice is only correct for one-digit indices; the
# prefix here is the pre-deduplication row label, which can have several
# digits, so match the whole leading number instead.
resume_strings = resume_strings.str.replace(r"^\d+ ", "", regex=True)

In [28]:
# Assign sequential ids from the actual number of strings rather than a
# hard-coded count, so this cell keeps working when the input data changes.
resumes = pd.DataFrame({'res_id': range(len(resume_strings)), 'data': resume_strings.values})
resumes.head()

Unnamed: 0,res_id,data
0,0,Big data analytics working and database wareho...
1,1,Fresher looking to join as a data analyst and ...
2,2,"nan ['Software Development', 'Machine Learning..."
3,3,To obtain a position in a fast-paced business ...
4,4,Professional accountant with an outstanding wo...


In [29]:
# Strip the leading index value and its separating space from each string.
# Slicing off exactly two characters breaks for indices of two or more
# digits, so remove the whole leading number instead.
job_strings = job_strings.str.replace(r"^\d+ ", "", regex=True)

In [30]:
# Assign sequential ids from the actual number of strings rather than a
# hard-coded count, so this cell keeps working when the input data changes.
jobs = pd.DataFrame({'job_id': range(len(job_strings)), 'data': job_strings.values})
jobs.head()

Unnamed: 0,job_id,data
0,0,Senior Software Engineer B.Sc in Computer Scie...
1,1,Machine Learning (ML) Engineer M.Sc in Compute...
2,2,"Executive/ Senior Executive- Trade Marketing, ..."
3,3,Business Development Executive Bachelor/Honors...
4,4,Senior iOS Engineer Bachelor of Science (BSc) ...


In [33]:
# resume_strings = pd.read_csv('...')
# job_strings = pd.read_csv('...')

### Using tfidf to vectorize resume and job strings:

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
# Vectorizer with library defaults; no tokenizer, stop-word or n-gram options are passed.
tfidf = TfidfVectorizer()

In [36]:
# The vocabulary and IDF weights are learned from the resumes only; the jobs
# are then projected into that same term space so both matrices share columns.
resume_tfidf = tfidf.fit_transform(resume_strings.to_numpy())
job_tfidf = tfidf.transform(job_strings.to_numpy())

# (documents, terms) for each matrix, one per line.
print(resume_tfidf.shape, job_tfidf.shape, sep="\n")

(344, 4961)
(28, 4961)


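The TF-IDF vocabulary contains 4961 terms. It is learned from the resumes only, so any term that appears only in job descriptions is ignored when the jobs are transformed.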

### Using Cosine Similarity to recommend best jobs for a resume:

In [37]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the resume vectors (first argument) and
# the job vectors (second argument). The cell below indexes the result by
# resume position first and ranks job positions within that row.
sim_matrix = cosine_similarity(resume_tfidf, job_tfidf)

In [38]:
top_n = 3
recommendations = []

# Build recommendations for the first five resumes only, as a small sample.
for resume_pos in range(5):
    # Job positions ordered from highest to lowest similarity, cut to top_n.
    ranked_jobs = np.argsort(sim_matrix[resume_pos])[::-1]
    best_jobs = ranked_jobs[:top_n]
    entry = {"resume_id": resumes.loc[resume_pos, "res_id"]}
    entry["top_recommendations"] = [jobs.loc[job_pos, "job_id"] for job_pos in best_jobs]
    recommendations.append(entry)

In [39]:
# Render the sample as a table. The heading interpolates top_n so it cannot
# drift from the number of recommendations computed above.
recommendations_df = pd.DataFrame(recommendations)
print(f"Sample Resumes and Their Top-{top_n} Job Recommendations:\n{recommendations_df}")

Sample Resumes and Their Top-3 Job Recommendations:
   resume_id top_recommendations
0          0         [20, 19, 1]
1          1          [20, 1, 0]
2          2          [24, 0, 8]
3          3         [8, 11, 23]
4          4        [15, 21, 16]


For example, for resume_id = 0 the recommended jobs, ordered from most to least similar, are job_id = 20, 19 and 1.