In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [29]:
datasetPath = "../../datasets/UpdatedResumeDataSet.csv"

In [30]:
dataset = pd.read_csv(datasetPath)

In [31]:
dataset.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [32]:
dataset.describe()

Unnamed: 0,Category,Resume
count,962,962
unique,25,166
top,Java Developer,"Technical Skills Web Technologies: Angular JS,..."
freq,84,18


In [33]:
X = dataset['Resume'].values
y = dataset['Category'].values

In [34]:
type(X[0])

str

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_tfidf, y_train)

# Predict on the testing set
y_pred = rf_classifier.predict(X_test_tfidf)

In [37]:
import os

save_dir = '../../models/classification'

# Create the directory if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

# Define the filename for the saved model
model_filename = 'rf_classifier_model.pkl'

save_path = os.path.join(save_dir, model_filename)

joblib.dump(rf_classifier, save_path)

vectorizer_filename = 'tfidf_vectorizer.pkl'
vectorizer_path = os.path.join(save_dir, vectorizer_filename)
joblib.dump(vectorizer, vectorizer_path)


['../../models/classification/tfidf_vectorizer.pkl']

In [38]:
new_data = "Zia Ur Rehman \nMERN DEVELOPER \nziar4556@gmail.com \n03174557957 \nLahore,54000, Pakistan \nEDUCATION \nInter in Computer Science \nPunjab College of Information and Technology \n08/2018 - 04/2020\n, \n \nLahore,Pakistan \nBachelor In Computer Science \nUniversity of Management and Technology \n09/2020 - Present\n, \n \nlahore,Pakistan' 3.56 cgp \nGame Design & Development \\\nReplica of Hill Climb \nSoftware Engineering \\ Quick\nSolution (website) \nWORK EXPERIENCE \nIntern as Web Developer \nSmart Seo \n09/2022 - 02/2023\n, \n \nAccomplish Tasks\\ projects given by head \nLearn New Frameworks \nSKILLS \nTeamwork \nProject Leading \nCommunication Skills \nFrontEnd Developer \nGame Design and Development \nDatabase \nPython 3 Programming \nProgramming in C++ \nPERSONAL PROJECTS \nAuto Mobile Portfolio\n (07/2022 - Present)\n \nTensiz Game (React-js)\n (02/2023 - Present)\n \nCERTIFICATES \nCertiﬁcate of Participation (SOFTEC)\n (04/2023 - Present)\n \nParticipation in Programming Competition \nFront End Web Development\n (10/2022 - Present)\n \nCertiﬁcate by Pakistan freelancing training program in Batch 5\nand get A grade \nCertiﬁcate of Participation (GDSC)\n (01/2023 - Present)\n \nCertiﬁcate of Participation into \"The New era of Internet\" by\nGoogle Developer Student Club \nPython Programming\n (10/2022 - Present)\n \nHacker Rank \nLANGUAGES \nUrdu \nNative or Bilingual Proﬁciency \nEnglish \nProfessional Working Proﬁciency \nPunjabi \nNative or Bilingual Proﬁciency \nChinese \nElementary Proﬁciency \nProjects \nAchievements/Tasks"
saved_model = joblib.load(save_path)

# Assuming you have new textual data stored in a variable 'new_data'
new_data_tfidf = vectorizer.transform([new_data])

# Make predictions using the loaded model
predictions = saved_model.predict(new_data_tfidf)
predictions[0]

'Python Developer'

In [39]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9896373056994818
Classification Report:
                           precision    recall  f1-score   support

                 Advocate       1.00      0.33      0.50         3
                     Arts       1.00      1.00      1.00         6
       Automation Testing       1.00      1.00      1.00         5
               Blockchain       1.00      1.00      1.00         7
         Business Analyst       1.00      1.00      1.00         4
           Civil Engineer       1.00      1.00      1.00         9
             Data Science       1.00      1.00      1.00         5
                 Database       1.00      1.00      1.00         8
          DevOps Engineer       1.00      1.00      1.00        14
         DotNet Developer       1.00      1.00      1.00         5
            ETL Developer       1.00      1.00      1.00         7
   Electrical Engineering       1.00      1.00      1.00         6
                       HR       0.86      1.00      0.92        12
         