# Resume Classification

## Importing Libraries

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

## Data Collection

In [None]:
# Location of the dataset CSV. Set the RESUME_DATA_PATH environment variable to
# run the notebook on another machine; the default is the original local path.
DATA_PATH = os.environ.get(
    'RESUME_DATA_PATH',
    'C:/Users/kumar/OneDrive/Desktop/Machine Learning Project/datasets/resume classification.csv',
)

df = pd.read_csv(DATA_PATH)
df.shape

(200, 10)

In [3]:
# Preview the first rows of the loaded frame (head() defaults to five rows).
df.head()

Unnamed: 0,candidate_id,degree,years_experience,projects_count,certifications,skills_count,internship,github_portfolio,resume_score,interview_calls
0,C001,BTech,0,11,2,11,Yes,Yes,89,6
1,C002,BTech,8,1,4,17,Yes,Yes,100,6
2,C003,BSc,3,8,4,4,Yes,No,89,4
3,C004,MCA,4,12,6,4,Yes,No,100,7
4,C005,MTech,2,3,6,14,Yes,Yes,96,6


In [4]:
# Show column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   candidate_id      200 non-null    object
 1   degree            200 non-null    object
 2   years_experience  200 non-null    int64 
 3   projects_count    200 non-null    int64 
 4   certifications    200 non-null    int64 
 5   skills_count      200 non-null    int64 
 6   internship        200 non-null    object
 7   github_portfolio  200 non-null    object
 8   resume_score      200 non-null    int64 
 9   interview_calls   200 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 15.8+ KB


In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,years_experience,projects_count,certifications,skills_count,resume_score,interview_calls
count,200.0,200.0,200.0,200.0,200.0,200.0
mean,4.12,6.14,2.94,11.38,90.565,5.795
std,2.677958,3.571298,1.963588,4.794846,10.733209,1.611197
min,0.0,0.0,0.0,4.0,54.0,0.0
25%,2.0,3.0,1.0,7.0,84.0,5.0
50%,4.0,6.0,3.0,11.0,94.0,6.0
75%,6.0,9.0,5.0,15.0,100.0,7.0
max,8.0,12.0,6.0,20.0,100.0,8.0


## Prepare Target Variable




In [6]:
# Candidates with at least this many interview calls are labelled "High".
# 6 equals the median of interview_calls reported by df.describe().
HIGH_CALLS_THRESHOLD = 6

# Binarise the raw call count into the two-class target used for classification.
has_high_calls = df['interview_calls'] >= HIGH_CALLS_THRESHOLD
df['interview_calls_category'] = np.where(
    has_high_calls, 'High Interview Calls', 'Low Interview Calls'
)
df.head()

Unnamed: 0,candidate_id,degree,years_experience,projects_count,certifications,skills_count,internship,github_portfolio,resume_score,interview_calls,interview_calls_category
0,C001,BTech,0,11,2,11,Yes,Yes,89,6,High Interview Calls
1,C002,BTech,8,1,4,17,Yes,Yes,100,6,High Interview Calls
2,C003,BSc,3,8,4,4,Yes,No,89,4,Low Interview Calls
3,C004,MCA,4,12,6,4,Yes,No,100,7,High Interview Calls
4,C005,MTech,2,3,6,14,Yes,Yes,96,6,High Interview Calls


## Encode Categorical Features



In [7]:
# Text-valued columns that are expanded into indicator (dummy) columns.
categorical_cols = ['degree', 'internship', 'github_portfolio']

# Remove the row identifier and the raw call count (the target column was
# derived from it), then one-hot encode. drop_first=True omits the first level
# of each encoded column.
df_encoded = pd.get_dummies(
    df.drop(columns=['candidate_id', 'interview_calls']),
    columns=categorical_cols,
    drop_first=True,
)

print("DataFrame after one-hot encoding and dropping columns:")
df_encoded.head()

DataFrame after one-hot encoding and dropping columns:


Unnamed: 0,years_experience,projects_count,certifications,skills_count,resume_score,interview_calls_category,degree_BTech,degree_MCA,degree_MTech,internship_Yes,github_portfolio_Yes
0,0,11,2,11,89,High Interview Calls,True,False,False,True,True
1,8,1,4,17,100,High Interview Calls,True,False,False,True,True
2,3,8,4,4,89,Low Interview Calls,False,False,False,True,False
3,4,12,6,4,100,High Interview Calls,False,True,False,True,False
4,2,3,6,14,96,High Interview Calls,False,False,True,True,True


In [8]:
# Confirm the dtypes after encoding: the dummy columns are bool, the target is object.
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   years_experience          200 non-null    int64 
 1   projects_count            200 non-null    int64 
 2   certifications            200 non-null    int64 
 3   skills_count              200 non-null    int64 
 4   resume_score              200 non-null    int64 
 5   interview_calls_category  200 non-null    object
 6   degree_BTech              200 non-null    bool  
 7   degree_MCA                200 non-null    bool  
 8   degree_MTech              200 non-null    bool  
 9   internship_Yes            200 non-null    bool  
 10  github_portfolio_Yes      200 non-null    bool  
dtypes: bool(5), int64(5), object(1)
memory usage: 10.5+ KB


## Split Data




In [9]:
# Separate the label column from the feature matrix.
target_col = 'interview_calls_category'
y = df_encoded[target_col]
X = df_encoded.drop(columns=target_col)

# Cast every boolean indicator column to integers (0 or 1) in one astype call.
bool_cols = X.select_dtypes(include='bool').columns
X = X.astype({name: int for name in bool_cols})

print("Features (X) head:")
print(X.head())
print("\nTarget (y) head:")
print(y.head())

Features (X) head:
   years_experience  projects_count  certifications  skills_count  \
0                 0              11               2            11   
1                 8               1               4            17   
2                 3               8               4             4   
3                 4              12               6             4   
4                 2               3               6            14   

   resume_score  degree_BTech  degree_MCA  degree_MTech  internship_Yes  \
0            89             1           0             0               1   
1           100             1           0             0               1   
2            89             0           0             0               1   
3           100             0           1             0               1   
4            96             0           0             1               1   

   github_portfolio_Yes  
0                     1  
1                     1  
2                     0  
3          

In [10]:
# Hold out 20% of the rows for testing. stratify=y keeps the High/Low class
# proportions the same in both partitions; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Report the size of each partition.
for part_name, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{part_name} shape:", part.shape)

X_train shape: (160, 10)
X_test shape: (40, 10)
y_train shape: (160,)
y_test shape: (40,)


## Train Classification Model


In [11]:
# The solver is pinned explicitly so the fit does not depend on the default
# solver of the installed scikit-learn version; random_state keeps it reproducible.
log_reg_model = LogisticRegression(random_state=42, solver='liblinear')

# Fit on the training partition only; the test partition stays unseen.
log_reg_model.fit(X_train, y_train)

print("Logistic Regression model trained successfully.")

Logistic Regression model trained successfully.


## Evaluate Model Performance



In [12]:
# Class treated as "positive" for precision and recall. The labels are strings,
# so pos_label must be passed explicitly.
POSITIVE_LABEL = 'High Interview Calls'

# 1. Predict the category for every row of the held-out test set
y_pred = log_reg_model.predict(X_test)

# 2. Accuracy: fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# 3. Precision: of the rows predicted as the positive label, the fraction that truly are
precision = precision_score(y_test, y_pred, pos_label=POSITIVE_LABEL)
print(f"Precision ({POSITIVE_LABEL}): {precision:.2f}")

# 4. Recall: of the rows that truly carry the positive label, the fraction predicted as such
recall = recall_score(y_test, y_pred, pos_label=POSITIVE_LABEL)
print(f"Recall ({POSITIVE_LABEL}): {recall:.2f}")

Accuracy: 0.88
Precision (High Interview Calls): 0.88
Recall (High Interview Calls): 0.92
