# Importing Dependencies


In [1]:
import pandas as pd
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
 from sklearn.model_selection import GridSearchCV`a
from sklearn.metrics import accuracy_score,classification_report

# Reading the file

In [2]:
df=pd.read_csv("UpdatedResumeDataSet.csv")

In [3]:
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [4]:
df["Category"].value_counts() # we can see that the data is imbalanced this can effect the performance of the model.

Category
Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
HR                           44
Hadoop                       42
Blockchain                   40
ETL Developer                40
Operations Manager           40
Data Science                 40
Sales                        40
Mechanical Engineer          40
Arts                         36
Database                     33
Electrical Engineering       30
Health and fitness           30
PMO                          30
Business Analyst             28
DotNet Developer             28
Automation Testing           26
Network Security Engineer    25
SAP Developer                24
Civil Engineer               24
Advocate                     20
Name: count, dtype: int64

In [5]:
df.isnull().sum()

Category    0
Resume      0
dtype: int64

# Resampling to Balance the Data 

In [6]:
df_new=[]

In [7]:
from sklearn.utils import resample
max_count=df["Category"].value_counts().max()
for category in df['Category'].unique():
    categorical_data=df[df['Category']==category]
    if len(categorical_data)<max_count:
        balanced_df=resample(categorical_data,n_samples=max_count,replace=True,random_state=42)
    else:
        balanced_df=resample(categorical_data,n_samples=max_count,replace=False,random_state=42)
    df_new.append(balanced_df)
    
df=pd.concat(df_new)

In [8]:
df["Category"].value_counts()

Category
Data Science                 84
Electrical Engineering       84
Blockchain                   84
DotNet Developer             84
ETL Developer                84
Hadoop                       84
Database                     84
PMO                          84
Network Security Engineer    84
DevOps Engineer              84
Python Developer             84
Operations Manager           84
Automation Testing           84
HR                           84
SAP Developer                84
Business Analyst             84
Java Developer               84
Civil Engineer               84
Health and fitness           84
Sales                        84
Mechanical Engineer          84
Web Designing                84
Arts                         84
Advocate                     84
Testing                      84
Name: count, dtype: int64

# Cleaning The Text Data

In [9]:
def cleaned_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert to lowercase
    text = text.lower()
    return text

In [10]:
df["Resume"]=df["Resume"].apply(lambda x:cleaned_text(x))

In [11]:
df

Unnamed: 0,Category,Resume
38,Data Science,personal skills â ability to quickly grasp tec...
28,Data Science,personal skills â ability to quickly grasp tec...
14,Data Science,education details mca ymcaust faridabad haryan...
7,Data Science,education details btech rayat and bahra instit...
20,Data Science,skills programming languages python pandas num...
...,...,...
899,Testing,â good logical and analytical skills â positiv...
926,Testing,skill set os windows xp database mysql sql ser...
926,Testing,skill set os windows xp database mysql sql ser...
924,Testing,personal skills â quick learner â eagerness to...


In [13]:
text='personal skills â\x9e¢ ability to quickly grasp technical aspects and willingness to learn â\x9e¢ high energy levels  result oriented. education details january 2018 master of engineering computer technology  application bhopal, madhya pradesh truba institute of engineering  information technologyjanuary 2010 b.e. computer science bhopal, madhya pradesh rkdf institute of science and technology college of engineeringjanuary 2006 polytechnic information technology vidisha, madhya pradesh sati engineering college in vidishajanuary 2003 m.tech thesis detail  bmch school in ganj basodadata science i have six month experience in data science. key skills - experience in machine learning, deep leaning, nlp, python, sql, web scraping good knowledge in computer subjects and ability to updateskill details experience in machine learning, deep learning, nlp, python, sql, web crawling, html,css.- exprience - less than 1 year monthscompany details company - rnt.ai technology solutiondescription - text classification using machine learning algorithms with python.practical knowledge of deep learning algorithms such as â\xa0recurrent neural networks(rnn).develop custom data models and algorithms to apply to datasetexperience with python packages like pandas, scikit-learn, tensor flow, numpy, matplotliv, nltk.comfort with sql, â\xa0mysqlsentiment analysis.â\xa0apply leave dataset using classification technique like tf--idf , lsa with cosine similarity using machine learning algorithms.web crawling using selenium web driver and beautiful soup with python.company - life insurance corporation of india bhopaldescription - ã¼â\xa0explaining policy features and the benefitsã¼ updated knowledge of life insurance products and shared with customers'

In [14]:
cleaned_text(text)# we can see the data is cleaned well.

'personal skills â ability to quickly grasp technical aspects and willingness to learn â high energy levels result oriented education details january master of engineering computer technology application bhopal madhya pradesh truba institute of engineering information technologyjanuary be computer science bhopal madhya pradesh rkdf institute of science and technology college of engineeringjanuary polytechnic information technology vidisha madhya pradesh sati engineering college in vidishajanuary mtech thesis detail bmch school in ganj basodadata science i have six month experience in data science key skills experience in machine learning deep leaning nlp python sql web scraping good knowledge in computer subjects and ability to updateskill details experience in machine learning deep learning nlp python sql web crawling htmlcss exprience less than year monthscompany details company rntai technology solutiondescription text classification using machine learning algorithms with pythonprac

# Vectorizing the text data using tfidf vectorizer

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
X=df["Resume"]
y=df["Category"]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=32)

In [16]:
vect=TfidfVectorizer()
X_train=vect.fit_transform(X_train)
X_test=vect.transform(X_test)

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Training the Data with RF Classifier

In [18]:
rf=RandomForestClassifier()
model=rf.fit(X_train,y_train)
pred=model.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score,classification_report

In [20]:
print(accuracy_score(y_test,pred))

1.0


In [21]:
print(classification_report(y_test,pred))

                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00        14
                     Arts       1.00      1.00      1.00        15
       Automation Testing       1.00      1.00      1.00        19
               Blockchain       1.00      1.00      1.00        28
         Business Analyst       1.00      1.00      1.00        12
           Civil Engineer       1.00      1.00      1.00        22
             Data Science       1.00      1.00      1.00        19
                 Database       1.00      1.00      1.00        11
          DevOps Engineer       1.00      1.00      1.00        18
         DotNet Developer       1.00      1.00      1.00         8
            ETL Developer       1.00      1.00      1.00        17
   Electrical Engineering       1.00      1.00      1.00        21
                       HR       1.00      1.00      1.00        18
                   Hadoop       1.00      1.00      1.00     

# Performing The Prediction On a Random Resume

In [27]:
test='''Professional Summary
Innovative and results-driven Data Scientist with expertise in applying advanced statistical techniques and machine learning algorithms, including
regression, classification, clustering, time series analysis, and natural language processing (NLP). Proficient in Python and R, with hands-on
experience in designing neural networks and deep learning models. Adept at leveraging data-driven insights to drive strategic decisions, with
strong communication, consulting, and project management skills. Holds a B.Tech in Mechatronics and a Post Graduation Diploma In Data
Science.
Experience
Wahy Lab Solutions Kochi
Data Scientist 12/2022 - 12/2023
• Analyzed large datasets to identify patterns and actionable insights that informed business strategy, improving decision-making .
• Developed and validated predictive models to forecast key business metrics, achieving 80% accuracy and providing valuable foresight
for strategic planning.
• Designed and conducted A/B tests to evaluate and optimize marketing and operational strategies, increasing conversion rates.
• Created and maintained interactive dashboards in Power BI to visualize key metrics, enabling stakeholders to monitor business
performance and insights in real-time.
• Documented data processes, model architecture, and code for transparency and to ensure reproducibility across teams.
Skill Vertex Bangalore
Data science Intern 06/2021 - 08/2021
• Utilized Python, SQL, and R to process and clean data from multiple sources, ensuring data quality and readiness for analysis.
• Built statistical and machine learning models to understand customer behavior, achieving improvement in targeted campaign
effectiveness.
• Collaborated with cross-functional teams to translate analytical insights into business strategies that improved customer engagement.
• Created visualizations using Tableau and matplotlib, simplifying complex findings for non-technical audiences.
Education
Prist University Chennai
Bachelor of Technology in Mechatronics 08/2018 - 06/2022
Ims Proschool Mumbai
Post Graduation Diploma in Data Science 09/2021 - 09/2022
Technical skills
• Data Visualization • Statistical Analytics
• Python • Scikit-learn
• R programming • Tableau
• TensorFlow • Microsoft azure
• Power BI • SQL
• OpenCV • Data Extraction
• Data Manipulation • Predictive Modeling
• Communication • Collaborative
• Critical Thinking
CERTIFICATIONS
PGDM IN DATASCIENCE
01/2023
DATA SCIENCE TRAINING
05/2021
PROJECTS
Sales Forecasting with Time Series Analysis
• Built and deployed a time series model to forecast monthly sales for a retail business, enabling data-driven inventory and pricing
decisions.
• Implemented feature engineering to extract trends, seasonality, and additional temporal features, improving model performance.
• Leveraged Python libraries like Prophet and ARIMA, achieving a 20% improvement in forecast accuracy over the previous model.
Document Digitization and Analysis with OCR and NLP
• Objective: Built a pipeline to digitize and analyze data from scanned documents (PDFs and images) using Optical Character Recognition
(OCR) and Natural Language Processing (NLP).
• Skills Used: Used Tesseract OCR to extract text from images and PDFs, then applied NLP techniques to categorize and summarize
information.
• Outcome: Created an organized and searchable database, making document retrieval faster and supporting business decision-making
processes.
Sales Enablement Dashboard
• Created a Power BI dashboard to visualize and track sales performance against targets, enhancing client reporting and driving data-based
decision-making.'''

In [29]:
test=cleaned_text(test)

In [33]:
test

'professional summary innovative and resultsdriven data scientist with expertise in applying advanced statistical techniques and machine learning algorithms including regression classification clustering time series analysis and natural language processing nlp proficient in python and r with handson experience in designing neural networks and deep learning models adept at leveraging datadriven insights to drive strategic decisions with strong communication consulting and project management skills holds a btech in mechatronics and a post graduation diploma in data science experience wahy lab solutions kochi data scientist analyzed large datasets to identify patterns and actionable insights that informed business strategy improving decisionmaking developed and validated predictive models to forecast key business metrics achieving accuracy and providing valuable foresight for strategic planning designed and conducted ab tests to evaluate and optimize marketing and operational strategies i

In [34]:
test=['professional summary innovative and resultsdriven data scientist with expertise in applying advanced statistical techniques and machine learning algorithms including regression classification clustering time series analysis and natural language processing nlp proficient in python and r with handson experience in designing neural networks and deep learning models adept at leveraging datadriven insights to drive strategic decisions with strong communication consulting and project management skills holds a btech in mechatronics and a post graduation diploma in data science experience wahy lab solutions kochi data scientist analyzed large datasets to identify patterns and actionable insights that informed business strategy improving decisionmaking developed and validated predictive models to forecast key business metrics achieving accuracy and providing valuable foresight for strategic planning designed and conducted ab tests to evaluate and optimize marketing and operational strategies increasing conversion rates created and maintained interactive dashboards in power bi to visualize key metrics enabling stakeholders to monitor business performance and insights in realtime documented data processes model architecture and code for transparency and to ensure reproducibility across teams skill vertex bangalore data science intern utilized python sql and r to process and clean data from multiple sources ensuring data quality and readiness for analysis built statistical and machine learning models to understand customer behavior achieving improvement in targeted campaign effectiveness collaborated with crossfunctional teams to translate analytical insights into business strategies that improved customer engagement created visualizations using tableau and matplotlib simplifying complex findings for nontechnical audiences education prist university chennai bachelor of technology in mechatronics ims proschool mumbai post graduation diploma in data science technical skills data visualization statistical analytics python scikitlearn r programming tableau tensorflow microsoft azure power bi sql opencv data extraction data manipulation predictive modeling communication collaborative critical thinking certifications pgdm in datascience data science training projects sales forecasting with time series analysis built and deployed a time series model to forecast monthly sales for a retail business enabling datadriven inventory and pricing decisions implemented feature engineering to extract trends seasonality and additional temporal features improving model performance leveraged python libraries like prophet and arima achieving a improvement in forecast accuracy over the previous model document digitization and analysis with ocr and nlp objective built a pipeline to digitize and analyze data from scanned documents pdfs and images using optical character recognition ocr and natural language processing nlp skills used used tesseract ocr to extract text from images and pdfs then applied nlp techniques to categorize and summarize information outcome created an organized and searchable database making document retrieval faster and supporting business decisionmaking processes sales enablement dashboard created a power bi dashboard to visualize and track sales performance against targets enhancing client reporting and driving databased decisionmaking']

In [35]:
test=vect.transform(test)

In [36]:
model.predict(test)

array(['Data Science'], dtype=object)

# Preforming a Grid Search to improve the model

In [44]:
 from sklearn.model_selection import GridSearchCV

In [45]:
param_grid = {
    'n_estimators': [10, 50, 100],    # Number of trees
    'max_depth': [None, 10, 20],      # Maximum depth of trees
    'min_samples_split': [2, 5, 10],  # Minimum samples to split
    'min_samples_leaf': [1, 2, 4]     # Minimum samples per leaf
}

In [46]:
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,              # 5-fold cross-validation
    scoring='accuracy', # Metric for evaluation
    verbose=2,          # Verbosity level
    n_jobs=-1           # Use all available processors
)

In [47]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [48]:
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Best Cross-Validation Accuracy: 0.9994047619047619


In [49]:
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 1.0


# Downloading the model as a picke file

In [50]:
import joblib
joblib.dump(best_model,"resume screening model.pkl")

['resume screening model.pkl']

In [52]:
joblib.dump(vect,"tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [53]:
from joblib import load

# Load the model and vectorizer
model = load('resume screening model.pkl')
vectorizer = load('tfidf_vectorizer.pkl')

# Test with example text
example_text = ["Experienced data scientist with strong Python skills"]
features = vectorizer.transform(example_text)
print(model.predict(features)) 


['Data Science']
