#  **Install the required packages**

In [None]:
!pip install PyPDF2 python-docx
!pip install python-docx pymupdf
!pip install docx2txt
!pip install pdfminer.six
!pip install pymupdf
!pip install spacy==2.3.5
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
!pip install pyresparser
!pip install tabula-py
!pip install resume-parser

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/232.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-docx
  Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/239.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.6/239.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx, PyPDF2
Successfully installed PyPDF2-3.0.1 python-docx-1.1.0
Collecting pymupdf
  Downloading PyMuPDF-1.23.7-cp310-none-manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━

# **Import Natural Language Toolkit**

In [None]:
import nltk
# NLTK resources fetched by this cell, in download order.
nltk_resources = (
    'stopwords',
    'punkt',
    'averaged_perceptron_tagger',
    'universal_tagset',
    'wordnet',
    'brown',
    'maxent_ne_chunker',
    'words',
)
for resource_name in nltk_resources:
    download_status = nltk.download(resource_name)
# The value returned by the final download stays the cell's displayed result.
download_status

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

# **Import ResumeParser**

In [None]:
from pyresparser import ResumeParser
import warnings

In [None]:
# Install an "ignore" filter for the UserWarning category so those warnings
# are no longer shown by later cells.
warnings.filterwarnings("ignore", category=UserWarning)

## **Extracting data from resumes**

## 15 resumes for testing

In [None]:
import os
import csv
from resume_parser import resumeparse

def extract_name_from_filename(filename):
    """Return ``filename`` with its last extension removed.

    This is deliberately simplistic: the whole stem is used as the name.
    If your files follow a pattern such as "FirstName_LastName_Resume.docx",
    adapt this function (for example by splitting the stem on underscores
    and keeping the first and last parts).
    """
    stem, _extension = os.path.splitext(filename)
    return stem

def save_to_csv(output_file, data, filename):
    """Append one parsed resume to ``output_file`` as a CSV row.

    Args:
        output_file: Path of the CSV file. It is opened in append mode (and
            therefore created when missing). The header row is written when
            the file does not exist yet or exists but is empty.
        data: Mapping returned by the resume parser. The keys read are
            "name", "email", "phone", "skills", "university", "degree",
            "designition", "Companies worked at" and "total_exp". A missing
            key or a ``None`` value produces an empty cell instead of an
            exception.
        filename: Name of the resume file. Stored in the "Filename" column
            and, via ``extract_name_from_filename``, in the "FullName" column.
    """
    fieldnames = ["Filename", "FullName", "Name", "Email", "Mobile Number", "Skills", "College Name", "Degree", "Designation", "Company Names", "Total Experience"]

    def _joined(key):
        # The joined columns are expected to be lists of strings, but
        # str.join raises on None and on non-string items, so normalise first.
        value = data.get(key)
        if value is None:
            return ""
        if isinstance(value, str):
            return value
        return ", ".join(str(item) for item in value)

    # An existing zero-byte file still needs a header, so test the size too.
    needs_header = not os.path.isfile(output_file) or os.path.getsize(output_file) == 0

    with open(output_file, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        if needs_header:
            writer.writeheader()

        # data.get() yields None for absent keys; csv writes None as an empty cell.
        writer.writerow({
            "Filename": filename,
            "FullName": extract_name_from_filename(filename),
            "Name": data.get("name"),
            "Email": data.get("email"),
            "Mobile Number": data.get("phone"),
            "Skills": _joined("skills"),
            "College Name": data.get("university"),
            "Degree": data.get("degree"),
            "Designation": data.get("designition"),
            "Company Names": _joined("Companies worked at"),
            "Total Experience": data.get("total_exp")
        })

def process_resume_folder(folder_path, output_csv_file, num_resumes=15):
    """Parse resume files from ``folder_path`` and append them to a CSV.

    Args:
        folder_path: Directory to scan. Only regular files directly inside
            it are considered; sub-directories are skipped.
        output_csv_file: CSV path handed to ``save_to_csv`` for every
            parsed resume.
        num_resumes: Stop after this many resumes have been parsed and
            saved successfully. Files that fail do not count toward it.

    Any exception raised while parsing or saving a file is printed together
    with the file name, and processing continues with the next file.
    """
    count = 0
    # os.listdir() returns entries in arbitrary order; sorting makes
    # "the first N resumes" the same selection on every run.
    for file_name in sorted(os.listdir(folder_path)):
        if count >= num_resumes:
            break

        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            continue

        try:
            resume_info = resumeparse.read_file(file_path)
            save_to_csv(output_csv_file, resume_info, file_name)
        except Exception as e:
            print(f"Error processing {file_name}: {e}")
        else:
            count += 1

# Example usage: the folder holding the resume files and the CSV that collects the parsed fields.
resume_folder_path = "/content/drive/MyDrive/Resumes/"
# Relative path, so the CSV lands in the current working directory.
# NOTE(review): a later cell reads '/content/resume_data.csv', so this presumably runs with /content as the working directory - confirm.
output_csv_file = "resume_data.csv"

# Parse at most 15 resumes from the folder. save_to_csv opens the CSV in append mode,
# so running this cell again adds rows to the existing file rather than replacing it.
process_resume_folder(resume_folder_path, output_csv_file, num_resumes=15)

# Printed once the call above returns; per-file errors are reported by process_resume_folder itself.
print(f"Data appended to {output_csv_file}")


2023-12-04 05:50:05,841 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar to /tmp/tika-server.jar.
INFO:tika.tika:Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar to /tmp/tika-server.jar.
2023-12-04 05:50:06,072 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar.md5 to /tmp/tika-server.jar.md5.
INFO:tika.tika:Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar.md5 to /tmp/tika-server.jar.md5.
2023-12-04 05:50:06,304 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
ERROR:root:invalid literal for int() with base 10: 'date'
ERROR:root:'NoneType' object has no attribute 'group'
ERROR:root:'NoneType' o

Data appended to resume_data.csv


## Entire dataset

In [None]:
import os
import csv
from resume_parser import resumeparse

def extract_name_from_filename(filename):
    """Return ``filename`` without its last extension."""
    base_name, _suffix = os.path.splitext(filename)
    return base_name

def save_to_csv(output_file, data, filename):
    """Append one parsed resume to ``output_file`` as a CSV row.

    Args:
        output_file: Path of the CSV file. It is opened in append mode (and
            therefore created when missing). The header row is written when
            the file does not exist yet or exists but is empty.
        data: Mapping returned by the resume parser. The keys read are
            "name", "email", "phone", "skills", "university", "degree",
            "designition", "Companies worked at" and "total_exp". A missing
            key or a ``None`` value produces an empty cell instead of an
            exception.
        filename: Name of the resume file. Stored in the "Filename" column
            and, via ``extract_name_from_filename``, in the "FullName" column.
    """
    fieldnames = ["Filename", "FullName", "Name", "Email", "Mobile Number", "Skills", "College Name", "Degree", "Designation", "Company Names", "Total Experience"]

    def _joined(key):
        # The joined columns are expected to be lists of strings, but
        # str.join raises on None and on non-string items, so normalise first.
        value = data.get(key)
        if value is None:
            return ""
        if isinstance(value, str):
            return value
        return ", ".join(str(item) for item in value)

    # An existing zero-byte file still needs a header, so test the size too.
    needs_header = not os.path.isfile(output_file) or os.path.getsize(output_file) == 0

    with open(output_file, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        if needs_header:
            writer.writeheader()

        # data.get() yields None for absent keys; csv writes None as an empty cell.
        writer.writerow({
            "Filename": filename,
            "FullName": extract_name_from_filename(filename),
            "Name": data.get("name"),
            "Email": data.get("email"),
            "Mobile Number": data.get("phone"),
            "Skills": _joined("skills"),
            "College Name": data.get("university"),
            "Degree": data.get("degree"),
            "Designation": data.get("designition"),
            "Company Names": _joined("Companies worked at"),
            "Total Experience": data.get("total_exp")
        })

def process_resume_folder(folder_path, output_csv_file, num_resumes=None):
    """Parse resume files from ``folder_path`` and append them to a CSV.

    Args:
        folder_path: Directory to scan. Only regular files directly inside
            it are considered; sub-directories are skipped.
        output_csv_file: CSV path handed to ``save_to_csv`` for every
            parsed resume.
        num_resumes: Optional cap on the number of successfully saved
            resumes. ``None`` (the default) processes every file, which is
            what a call without this argument has always done.

    Any exception raised while parsing or saving a file is printed together
    with the file name, and processing continues with the next file.
    """
    saved_count = 0
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # row order of the CSV (and any capped selection) stable between runs.
    for file_name in sorted(os.listdir(folder_path)):
        if num_resumes is not None and saved_count >= num_resumes:
            break

        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            continue

        try:
            resume_info = resumeparse.read_file(file_path)
            save_to_csv(output_csv_file, resume_info, file_name)
        except Exception as e:
            print(f"Error processing {file_name}: {e}")
        else:
            saved_count += 1

# Example usage: the folder holding the resume files and the CSV that collects the parsed fields.
resume_folder_path = "/content/drive/MyDrive/Resumes/"
# Relative path, so the CSV lands in the current working directory.
output_csv_file = "all_resume_data.csv"

# Parse every file in the folder. save_to_csv opens the CSV in append mode,
# so running this cell again adds rows to the existing file rather than replacing it.
process_resume_folder(resume_folder_path, output_csv_file)

# Printed once the call above returns; per-file errors are reported by process_resume_folder itself.
print(f"Data appended to {output_csv_file}")

In [None]:
# Upgrade to the newest openai release; the tagging cell imports the `OpenAI` client class from this package.
!pip install --upgrade openai

Collecting openai
  Downloading openai-1.3.7-py3-none-any.whl (221 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/221.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/221.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.4/221.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.25.2-py3-none-any.whl (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.2-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-p

# **Using OpenAI to generate tags**

In [None]:
import os
from getpass import getpass

import pandas as pd
from openai import OpenAI

# Keep the API key out of the notebook source: take it from the
# OPENAI_API_KEY environment variable, or prompt for it without echoing.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Load the CSV file into a DataFrame. Phone numbers are read as text so that
# digit-only values are not parsed as numbers and rewritten as floats
# when the frame is saved back below.
df = pd.read_csv('/content/resume_data.csv', dtype={"Mobile Number": str})

# Initialize the OpenAI client with your API key
client = OpenAI(api_key=openai_api_key)


def generate_skill_tags(skills_text):
    """Ask the chat model for cleaned-up skill tags for one resume.

    Args:
        skills_text: Content of the resume's 'Skills' cell.

    Returns:
        The message content of the first completion choice.
    """
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a recruitment assistant"},
            {"role": "user", "content": f"Generate skills tags from this content I provide, please remove extra spaces and unnecessary tags, only useful tags should be: {skills_text}"}
        ]
    )
    return completion.choices[0].message.content


# One entry per row, in row order.
generated_tags = []
for user_content in df['Skills']:
    # An empty Skills cell is read back as NaN; skip the API call instead of
    # sending the literal text "nan" to the model.
    if pd.isna(user_content) or not str(user_content).strip():
        generated_tags.append(None)
        continue
    generated_tags.append(generate_skill_tags(user_content))

# Store the generated content in the 'generated_tags' column
df['generated_tags'] = generated_tags

# Save the updated DataFrame back to the CSV file (this overwrites the input file)
df.to_csv('/content/resume_data.csv', index=False)


# **Converting the output CSV file to a JSON file**

In [None]:
import pandas as pd

# Load the final output CSV file into a DataFrame. Phone numbers are read as
# text so digit-only values do not end up as numbers (or floats such as
# 9876543210.0 when some rows are blank) in the JSON.
df = pd.read_csv('/content/resume_data.csv', dtype={"Mobile Number": str})

# Convert the DataFrame to a JSON array with one object per row.
# force_ascii=False keeps non-ASCII characters (e.g. accented names) readable
# instead of \uXXXX escapes; adjust the indentation level as needed.
json_output = df.to_json(orient='records', indent=2, force_ascii=False)

# Save the JSON output to a file, with an explicit encoding so the result
# does not depend on the platform's default.
output_json_file = '/content/resume_data.json'
with open(output_json_file, 'w', encoding='utf-8') as json_file:
    json_file.write(json_output)

print(f"JSON data saved to {output_json_file}")


JSON data saved to /content/resume_data.json
