# Job Recommendation System

In [294]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import os
import csv
import sys

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000)

sb.set_style('darkgrid')

In [295]:
!pip install --upgrade openpyxl

Defaulting to user installation because normal site-packages is not writeable






In [296]:
cols = ['date_posted', 'company', 'linkedln_profile', 'location', 'country_code', 'industry', 'position', 'job_description', 'link', 'salary']
df = pd.read_csv(r"D:\Open Classroom\Datasets\Job Recommendation\Job Suggestions.csv", names=cols, encoding="latin1")
df.head()


Unnamed: 0,date_posted,company,linkedln_profile,location,country_code,industry,position,job_description,link,salary
0,"May 13, 2015 at 11:20AM",TalentOne.in (Formerly HireAtEase),http://www.linkedin.com/company/201192,"Bengaluru Area, India",IN,Computer Software,Java + UI Developer/Lead/Architect,Summary The candidate will be involved in requ...,https://www.linkedin.com/jobs?viewJob=&jobId=4...,No salary provided
1,"May 12, 2015 at 02:15PM",Crossover.,http://www.linkedin.com/company/9308035,"Bengaluru Area, India",IN,Human Resources,Global IT Sourcer - $30k,If you are an energetic and dynamic individual...,https://www.linkedin.com/jobs?viewJob=&jobId=4...,No salary provided
2,"May 06, 2015 at 11:50AM",7C Studio,http://www.linkedin.com/company/3765462,"Bengaluru Area, India",IN,Computer Software,Mobile Application Developer,7C Studio is looking for a Senior Android/iOS ...,https://www.linkedin.com/jobs?viewJob=&jobId=6...,No salary provided
3,"May 15, 2015 at 11:00PM",Planetsurf Creations,http://www.linkedin.com/company/2532948,"Bengaluru Area, India",IN,"Internet, Marketing and Advertising",Junior Web Developer,"Candidate be involved in development, testing,...",https://www.linkedin.com/jobs?viewJob=&jobId=4...,No salary provided
4,"May 06, 2015 at 11:50AM",7C Studio,http://www.linkedin.com/company/3765462,"Bengaluru Area, India",IN,Computer Software,Mobile Application Developer,7C Studio is looking for a Senior Android/iOS ...,https://www.linkedin.com/jobs?viewJob=&jobId=6...,No salary provided


In [297]:

# Path to your folder containing the files
folder_path = r"D:/Open Classroom/Datasets/Job Recommendation/"

# List all CSV files in the directory
files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]
print("Files found:", files)

# Initialize an empty DataFrame for concatenation
all_job_files = pd.DataFrame()

# Iterate through each file and append to the DataFrame
for file in files:
    file_path = os.path.join(folder_path, file)  # Construct full file path
    df = pd.read_csv(file_path, names=cols, encoding='latin1')  # Read CSV
    all_job_files = pd.concat([all_job_files, df], ignore_index=True)  # Concatenate with existing data

# Save the combined DataFrame to a new CSV file
output_path = r"D:/Open Classroom/Datasets/Job Recommendation/all_jobs_listings.csv"
all_job_files.to_csv(output_path, index=False)
print(f"Combined DataFrame saved to {output_path}")

# Display the first few rows of the combined DataFrame
all_job_files.head()


Files found: ['all_jobs_listings.csv', 'Job Suggestions(1).csv', 'Job Suggestions(2).csv', 'Job Suggestions(3).csv', 'Job Suggestions.csv']
Combined DataFrame saved to D:/Open Classroom/Datasets/Job Recommendation/all_jobs_listings.csv


Unnamed: 0,date_posted,company,linkedln_profile,location,country_code,industry,position,job_description,link,salary
0,date_posted,company,linkedln_profile,location,country_code,industry,position,job_description,link,salary
1,"April 03, 2015 at 02:11PM",The Better India,http://www.linkedin.com/company/702633,"Bengaluru Area, India",IN,Internet,Associate Editor,Responsibilities: Work with our staff writers ...,https://www.linkedin.com/jobs?viewJob=&jobId=4...,No salary provided
2,"April 11, 2015 at 05:20PM",Teritree Technologies,http://www.linkedin.com/company/2518710,"Bengaluru Area, India",IN,Computer Software,Sales and Marketing Superstar,- Formulation and execution of sales strategyÂ...,https://www.linkedin.com/jobs?viewJob=&jobId=4...,No salary provided
3,"April 28, 2015 at 03:00PM",Zoomo India,http://www.linkedin.com/company/9195690,"Bengaluru Area, India",IN,Internet,Visual Designer,Work with design and product management to cre...,https://www.linkedin.com/jobs?viewJob=&jobId=4...,No salary provided
4,"April 28, 2015 at 12:52PM",Symmetrical Global Search Pvt Ltd,http://www.linkedin.com/company/233698,"Bengaluru Area, India",IN,"Telecommunications, Information Technology and...",Senior Solution Architect - Voice SME,Senior Solution Architect - Voice SME Â will b...,https://www.linkedin.com/jobs?viewJob=&jobId=4...,No salary provided


In [298]:
all_job_files.shape

(8477, 10)

In [299]:
df.isnull().sum()

date_posted         0
company             0
linkedln_profile    0
location            0
country_code        0
industry            0
position            0
job_description     0
link                0
salary              0
dtype: int64

In [300]:

def clean_data(x):
    return str.lower(x.replace(" ", ""))

def create_soup(x):
    return x['company'] + ' ' + x['location'] + ' ' + x['industry'] + ' ' + x['position'] + ' ' + x['salary']

In [301]:
def recommend_job_general(job_post, cosine_sim):
    if job_post not in indices:
        raise ValueError(f'the job post {job_post} is not in the list')
    
    global result
    job_post=job_post.replace(' ','').lower()
    idx = indices[job_post]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[:10]
    job_indices = [i[0] for i in sim_scores]
    result =  all_job_files[['date_posted','company', 'linkedln_profile', 'location', 'country_code', 'industry', 'position', 'job_desc', 'link', 'salary']].iloc[job_indices]
    result.reset_index(drop=True, inplace=True)
    
    print(f"Shape of cosine_sim: {cosine_sim.shape}")
    print(f"Index obtained: {idx}")
    print(f"Row of cosine_sim[idx]: {cosine_sim[idx]}")

    return result


In [302]:
all_job_files = all_job_files.fillna('')

new_features = ['company', 'location', 'industry', 'position', 'salary']
new_data = all_job_files[new_features]

for i in new_data:
    new_data[i] = all_job_files[i].apply(clean_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data[i] = all_job_files[i].apply(clean_data)


In [303]:
new_data['soup'] = new_data.apply(create_soup, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['soup'] = new_data.apply(create_soup, axis=1)


In [304]:
new_data.head()

Unnamed: 0,company,location,industry,position,salary,soup
0,company,location,industry,position,salary,company location industry position salary
1,thebetterindia,"bengaluruarea,india",internet,associateeditor,nosalaryprovided,"thebetterindia bengaluruarea,india internet as..."
2,teritreetechnologies,"bengaluruarea,india",computersoftware,salesandmarketingsuperstar,nosalaryprovided,"teritreetechnologies bengaluruarea,india compu..."
3,zoomoindia,"bengaluruarea,india",internet,visualdesigner,nosalaryprovided,"zoomoindia bengaluruarea,india internet visual..."
4,symmetricalglobalsearchpvtltd,"bengaluruarea,india","telecommunications,informationtechnologyandser...",seniorsolutionarchitect-voicesme,nosalaryprovided,"symmetricalglobalsearchpvtltd bengaluruarea,in..."


In [305]:
new_data['soup'].iloc()[0]

'company location industry position salary'

In [306]:
df['salary'].value_counts()

salary
No salary provided      619
50-60 Lacs                6
Best In Industry          6
INR Rs 1.5 Lakh / Yr      5
Name: count, dtype: int64

In [307]:
new_data.shape

(8477, 6)

In [308]:
count_vect = CountVectorizer(stop_words='english')

count_soup_matrix = count_vect.fit_transform(new_data['soup'])

In [309]:
count_soup_matrix

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 62017 stored elements and shape (8477, 715)>

In [310]:
cos_sim = cosine_similarity(count_soup_matrix, count_soup_matrix)

In [311]:
cos_sim[0]

array([1., 0., 0., ..., 0., 0., 0.])

In [312]:
# new_data = new_data.reset_index()

In [313]:
# new_data.head()

In [314]:
indices = pd.Series(new_data.index, index=new_data['position'])

In [315]:
indices['salesandmarketingsuperstar']

position
salesandmarketingsuperstar       2
salesandmarketingsuperstar     127
salesandmarketingsuperstar     179
salesandmarketingsuperstar     183
salesandmarketingsuperstar     433
salesandmarketingsuperstar     698
salesandmarketingsuperstar     700
salesandmarketingsuperstar    1708
salesandmarketingsuperstar    2469
salesandmarketingsuperstar    2470
salesandmarketingsuperstar    2560
salesandmarketingsuperstar    2562
salesandmarketingsuperstar    2693
salesandmarketingsuperstar    2696
salesandmarketingsuperstar    2794
salesandmarketingsuperstar    2798
salesandmarketingsuperstar    2961
salesandmarketingsuperstar    2963
salesandmarketingsuperstar    3077
salesandmarketingsuperstar    3079
salesandmarketingsuperstar    3193
salesandmarketingsuperstar    3348
salesandmarketingsuperstar    3432
salesandmarketingsuperstar    3435
salesandmarketingsuperstar    3539
salesandmarketingsuperstar    3993
salesandmarketingsuperstar    3997
salesandmarketingsuperstar    4090
salesandmar

In [316]:
cos_sim[4093]

array([0.        , 0.46291005, 0.46291005, ..., 0.29277002, 0.28347335,
       0.46291005])

In [317]:
sorted(list(enumerate(cos_sim[indices['associateeditor']])))[:10]

[(0,
  array([0.        , 1.        , 0.5       , ..., 0.31622777, 0.30618622,
         0.5       ])),
 (1,
  array([0.        , 1.        , 0.5       , ..., 0.31622777, 0.30618622,
         0.5       ])),
 (2,
  array([0.        , 1.        , 0.5       , ..., 0.31622777, 0.30618622,
         0.5       ])),
 (3,
  array([0.        , 1.        , 0.5       , ..., 0.31622777, 0.30618622,
         0.5       ])),
 (4,
  array([0.        , 1.        , 0.5       , ..., 0.31622777, 0.30618622,
         0.5       ])),
 (5,
  array([0.        , 1.        , 0.5       , ..., 0.31622777, 0.30618622,
         0.5       ])),
 (6,
  array([0.        , 1.        , 0.5       , ..., 0.31622777, 0.30618622,
         0.5       ])),
 (7,
  array([0.        , 1.        , 0.5       , ..., 0.31622777, 0.30618622,
         0.5       ])),
 (8,
  array([0.        , 1.        , 0.5       , ..., 0.31622777, 0.30618622,
         0.5       ])),
 (9,
  array([0.        , 1.        , 0.5       , ..., 0.31622777, 0.3061

In [320]:
recommend_job_general('associateeditor', cos_sim)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()