<a href="https://colab.research.google.com/github/ananyavrm04/AI-Resume-Matcher-With-Feedback/blob/main/resume_matcher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
#data collected

import pandas as pd

# Load the dataset (make sure you uploaded it in Colab)
try:
    df = pd.read_csv('/content/UpdatedResumeDataSetnew.csv', encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv('/content/UpdatedResumeDataSetnew.csv', encoding='latin1')


# View basic info
print("Shape:", df.shape)
print("\nColumns:\n", df.columns)

# Preview first few rows
df.head()

Shape: (962, 4)

Columns:
 Index(['Category', 'Resume', 'Unnamed: 2', 'Unnamed: 3'], dtype='object')


Unnamed: 0,Category,Resume,Unnamed: 2,Unnamed: 3
0,Data Science,Skills * Programming Languages: Python (pandas...,,
1,Data Science,Education Details \nMay 2013 to May 2017 B.E ...,,
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",,
3,Data Science,Skills â?¢ R â?¢ Python â?¢ SAP HANA â?¢ Table...,,
4,Data Science,"Education Details \n MCA YMCAUST, Faridabad...",,


In [20]:
#data cleaning

df = df[['Category','Resume']]  #keeping only relevant resume
df.dropna(inplace = True)
df.reset_index(drop=True, inplace = True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)


Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \nMay 2013 to May 2017 B.E ...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â?¢ R â?¢ Python â?¢ SAP HANA â?¢ Table...
4,Data Science,"Education Details \n MCA YMCAUST, Faridabad..."


In [21]:
#text preprocessing

import re  #regular expression library to search for patterns in text for removing punctuations ,special characters or matching specific words
import nltk #NL toolkit for text processing like tokenization , lemmatization , removing stopwords
nltk.download('stopwords')  #removing is , the , in , etc
nltk.download('punkt')      #tokenizer , breaking setences into words
nltk.download('wordnet')    #huge dict of english words with meaning , forms , etc.....needed in lemmatization where we reduce words to their root form
nltk.download('punkt_tab') # Download the missing punkt_tab resource

from nltk.corpus import stopwords     #using list of stop words
from nltk.tokenize import word_tokenize   #breaking text to words
from nltk.stem import WordNetLemmatizer  #converting words to root form #running - run

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [22]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
  text = text.lower()
  text = re.sub(r'[^a-zA-Z]',' ',text)
  tokens = word_tokenize(text)
  tokens = [word for word in tokens if word not in stop_words and len(word)>2]
  tokens = [lemmatizer.lemmatize(word) for word in tokens]
  return ''.join(tokens)

In [24]:
df['Cleaned_Resume'] = df['Resume'].apply(preprocess_text)

In [25]:
df[['Resume','Cleaned_Resume']].head()

Unnamed: 0,Resume,Cleaned_Resume
0,Skills * Programming Languages: Python (pandas...,skillprogramminglanguagepythonpandanumpyscipys...
1,Education Details \nMay 2013 to May 2017 B.E ...,educationdetailmaymayuitrgpvdatascientistdatas...
2,"Areas of Interest Deep Learning, Control Syste...",areainterestdeeplearningcontrolsystemdesignpro...
3,Skills â?¢ R â?¢ Python â?¢ SAP HANA â?¢ Table...,skillpythonsaphanatableausaphanasqlsaphanapals...
4,"Education Details \n MCA YMCAUST, Faridabad...",educationdetailmcaymcaustfaridabadharyanadatas...


In [30]:
#cleaned text to numerical features for ML or DL model

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features = 3000)
X = tfidf.fit_transform(df['Cleaned_Resume']).toarray()

In [33]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df['Category'])

In [35]:
# Training SVM model
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

X_train , X_test , y_train , y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)
model = SVC(kernel='linear')   #linear kernel works best for text
model.fit(X_train , y_train)
y_pred = model.predict(X_test)

print('Accuracy is : ', accuracy_score(y_test,y_pred))
print("\nClassification Report:\n" , classification_report(y_test,y_pred))

Accuracy is :  0.9844559585492227

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         7
           4       1.00      1.00      1.00         4
           5       1.00      1.00      1.00         9
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00         8
           8       1.00      0.93      0.96        14
           9       1.00      1.00      1.00         5
          10       1.00      1.00      1.00         7
          11       1.00      1.00      1.00         6
          12       1.00      1.00      1.00        12
          13       1.00      1.00      1.00         4
          14       1.00      1.00      1.00         7
          15       0.83      1.00      0.91        15
          16       1.0

In [44]:
#simulation of feedback logging system
feedback_log = pd.DataFrame(columns = ['resume_text','predicted_category','corrected_category'])
resume_text = 'Experienced in building ML models and data analysis using Python'
predicted_category = 'HR'
corrected_category = 'Data Science'   #User corrected it manually

feedback_log.loc[len(feedback_log)] = {
    'resume_text': resume_text,
    'predicted_category': predicted_category,
    'corrected_category': corrected_category
}

In [45]:
print(feedback_log)

                                         resume_text predicted_category  \
0  Experienced in building ML models and data ana...                 HR   

  corrected_category  
0       Data Science  


In [47]:
#simulating multiple feedback entries

new_feedbacks = [{
    'resume_text' : 'Proficient in Java , REST APIs , and Spring Boot',
    'predicted_category' : 'Data Science',
    'corrected_category' : 'Software Engineer'
},
{
     'resume_text' : 'Created UI using Figma and Adobe XD',
     'predicted_category' : 'Software Engineer',
     'corrected_category' : 'Design'
     },
                 {
                     'resume_text' : 'Handled large datasets using SQL and Tableau',
                     'predicted_category' : 'HR',
                     'corrected_category' : 'Data Analyst'
                 }]

for feedback in new_feedbacks:
  feedback_log.loc[len(feedback_log)] = feedback

print(feedback_log)

                                         resume_text predicted_category  \
0  Experienced in building ML models and data ana...                 HR   
1   Proficient in Java , REST APIs , and Spring Boot       Data Science   
2                Created UI using Figma and Adobe XD  Software Engineer   
3       Handled large datasets using SQL and Tableau                 HR   

  corrected_category  
0       Data Science  
1  Software Engineer  
2             Design  
3       Data Analyst  


In [48]:
feedback_log.to_csv('feedback_log.csv',index = False)  # for future model retraining