In [1]:
import os
import PyPDF2
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Function to extract text from PDF files
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
        return text

In [3]:
# Preprocess   
def preprocess_text(text):
    # Remove stop words and apply stemming
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)

In [4]:
# Load job descriptions and extract text from resumes
job_descriptions = pd.read_csv("job_description.csv")
resume_folder = "New folder/"
resumes = []
for filename in os.listdir(resume_folder):
    if filename.endswith(".pdf"):
        resume_text = extract_text_from_pdf(os.path.join(resume_folder, filename))
        resumes.append(resume_text)

In [5]:
job_descriptions

Unnamed: 0,Job Title,Job Description
0,Accountant,Accountants are responsible for preparing and ...
1,Accountant,The role of an accountant includes analyzing f...
2,Accountant,Accountants play a vital role in ensuring that...
3,Accountant,"Working closely with clients, accountants asse..."
4,Accountant,Accountants are tasked with maintaining accura...
...,...,...
101,Software Engineer,"Develop software for autonomous systems, inclu..."
102,Software Engineer,"Contribute to open-source software projects, c..."
103,Software Engineer,Write scripts and automation tools to improve ...
104,Software Engineer,Work with scientific and research teams to dev...


In [6]:
resumes

['ACCOUNTANT\nSummary\nFinancial Accountant specializing in financial planning, reporting and analysis within the Department of Defense.\nHighlights\nAccount reconciliations\nResults-oriented\nFinancial reporting\nCritical thinking\nAccounting operations professional\nAnalysis of financial systems\nERP (Enterprise Resource Planning) software.\nExcellent facilitator\nAccomplishments\nServed on a tiger team which identified and resolved General Ledger postings in DEAMS totaling $360B in accounting adjustments. This allowed\nfor the first successful fiscal year-end close for 2012.\nIn collaboration with DFAS Europe, developed an automated tool that identified duplicate obligations. This tool allowed HQ USAFE to\ndeobligate over $5M in duplicate obligations.\nExperience\nCompany Name\n \nJuly 2011\n \nto \nNovember 2012\n \nAccountant\n \nCity\n \n, \nState\nEnterprise Resource Planning Office (ERO)\nIn this position as an Accountant assigned to the Defense Enterprise Accounting and Manage

In [7]:
# Create DataFrame from resumes
resumes_df = pd.DataFrame({'resume': resumes})

In [8]:
resumes_df

Unnamed: 0,resume
0,ACCOUNTANT\nSummary\nFinancial Accountant spec...
1,STAFF ACCOUNTANT\nSummary\nHighly analytical a...
2,ACCOUNTANT\nProfessional Summary\nTo obtain a ...
3,SENIOR ACCOUNTANT\nExperience\nCompany Name\n ...
4,SENIOR ACCOUNTANT\nProfessional Summary\nSenio...
...,...
113,ACCOUNTANT\nSummary\nSelf-motivated accountant...
114,GENERAL ACCOUNTANT\nCareer Focus\nTo obtain a ...
115,SENIOR ACCOUNTANT\nSummary\nA highly competent...
116,PRINCIPAL ACCOUNTANT\nSummary\nCapable Account...


In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\santhoshs.s\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# Preprocess data
job_descriptions["processed_text"] = job_descriptions["Job Description"].apply(preprocess_text)
resumes_df["processed_text"] = resumes_df["resume"].apply(preprocess_text)

In [11]:
job_descriptions["processed_text"]

0      account respons prepar examin financi records,...
1      the role account includ analyz financi data, p...
2      account play vital role ensur organ oper effic...
3      work close clients, account assess financi ope...
4      account task maintain accur financi record rep...
                             ...                        
101    develop softwar autonom systems, includ self-d...
102    contribut open-sourc softwar projects, collabo...
103    write script autom tool improv softwar develop...
104    work scientif research team develop softwar co...
105    develop softwar medic devic healthcar systems....
Name: processed_text, Length: 106, dtype: object

In [12]:
resumes_df["processed_text"]

0      account summari financi account special financ...
1      staff account summari highli analyt detail-ori...
2      account profession summari to obtain posit fas...
3      senior account experi compani name june 2011 c...
4      senior account profession summari senior accou...
                             ...                        
113    account summari self-motiv account offer stron...
114    gener account career focu to obtain posit allo...
115    senior account summari a highli competent, mot...
116    princip account summari capabl account success...
117    payrol account summari ha strong work ethic 7+...
Name: processed_text, Length: 118, dtype: object

In [13]:
# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit vectorizer to job descriptions and transform resumes
job_descriptions_vectors = vectorizer.fit_transform(job_descriptions["processed_text"])
resumes_vectors = vectorizer.transform(resumes_df["processed_text"])

In [14]:
# Assuming no labels are available, you can create a dummy label for initial testing
resumes_df['qualified'] = 1  # Replace with actual labels if available

In [15]:
resumes_df

Unnamed: 0,resume,processed_text,qualified
0,ACCOUNTANT\nSummary\nFinancial Accountant spec...,account summari financi account special financ...,1
1,STAFF ACCOUNTANT\nSummary\nHighly analytical a...,staff account summari highli analyt detail-ori...,1
2,ACCOUNTANT\nProfessional Summary\nTo obtain a ...,account profession summari to obtain posit fas...,1
3,SENIOR ACCOUNTANT\nExperience\nCompany Name\n ...,senior account experi compani name june 2011 c...,1
4,SENIOR ACCOUNTANT\nProfessional Summary\nSenio...,senior account profession summari senior accou...,1
...,...,...,...
113,ACCOUNTANT\nSummary\nSelf-motivated accountant...,account summari self-motiv account offer stron...,1
114,GENERAL ACCOUNTANT\nCareer Focus\nTo obtain a ...,gener account career focu to obtain posit allo...,1
115,SENIOR ACCOUNTANT\nSummary\nA highly competent...,"senior account summari a highli competent, mot...",1
116,PRINCIPAL ACCOUNTANT\nSummary\nCapable Account...,princip account summari capabl account success...,1


In [16]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(resumes_vectors, resumes_df['qualified'], test_size=0.2)

In [17]:
# Create and train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)