In [None]:
import pickle
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
import pandas as pd
from collections import Counter
from xgboost import XGBClassifier
from sklearn.utils import shuffle
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics
import pandas as pd
import re
import string
import logging
import sys

In [None]:
MODEL_OUT_PATH = 'trained_model.pkl'

In [None]:
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

In [None]:
def clean_resume(resume_text):
    resume_text = re.sub(
        r"http\S+|#\S+|@\S+|[%s]" % re.escape(string.punctuation),
        " ",
        resume_text,
        flags=re.IGNORECASE,
    )

    resume_text = re.sub(r"[^\x00-\x7f]", " ", resume_text)
    resume_text = re.sub("\s+", " ", resume_text)
    return resume_text

In [None]:
def train_resume_model(resumes_df: list):




    cleaned_resumes = resumes_df['Resume_str'].apply(clean_resume)
    y = resumes_df['Category']


    vectorizer = CountVectorizer(stop_words="english")
    x = vectorizer.fit_transform(cleaned_resumes)



    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)



    clf_1 = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=4, min_samples_leaf=2)
    clf_2 = AdaBoostClassifier(random_state=1, n_estimators=100, learning_rate=1)
    clf_3 = XGBClassifier(n_estimators=196, max_depth=4, learning_rate=0.03)
    clf_4 = LogisticRegression(max_iter=1000)
    clf_5 = DecisionTreeClassifier()


    ensemble = VotingClassifier(estimators=[
        ('rfc', clf_1),
        ('abc', clf_2),
        ('xbc', clf_3),
        ('lrc', clf_4),
        ('dcc', clf_5)
    ], voting='soft')


    ensemble.fit(x_train, y_train)


    with open(MODEL_OUT_PATH, "wb") as f:
        pickle.dump((ensemble, vectorizer), f)


    predictions = ensemble.predict(x_test)


    classification_report = metrics.classification_report(y_test, predictions, zero_division=1, output_dict=True)

    logging.info("Training of model has finished")

    return classification_report

In [None]:
df = pd.read_csv('/content/Resume.zip', encoding='utf-8')
df = shuffle(df)
trained_model = train_resume_model(df)



In [32]:
accuracy = trained_model['accuracy']
precision = trained_model["weighted avg"]["precision"]
recall = trained_model["weighted avg"]["recall"]

assert accuracy and precision and recall > 0.7


In [38]:
import pandas as pd
import pickle

def classify_resumes(resumes_df: pd.DataFrame) -> pd.DataFrame:
    with open(MODEL_OUT_PATH, "rb") as f:
        model, vectorizer = pickle.load(f)

    categories = model.classes_
    x = vectorizer.transform(resumes_df["Resume_str"].apply(clean_resume))
    predictions = model.predict_proba(x)
    predicted_categories = model.predict(x)

    # List to hold new rows
    prediction_data = []

    for resume_id, probas, pred_cat in zip(resumes_df["ID"], predictions, predicted_categories):
        top_3_categories = probas.argsort()[-3:][::-1]
        top_3_probas = probas[top_3_categories]

        # Append the new row data as a dictionary
        prediction_data.append({
            "ID": resume_id,
            "category_1": categories[top_3_categories[0]],
            "proba_1": top_3_probas[0],
            "category_2": categories[top_3_categories[1]],
            "proba_2": top_3_probas[1],
            "category_3": categories[top_3_categories[2]],
            "proba_3": top_3_probas[2],
        })

    # Create a DataFrame from the list of dictionaries
    prediction = pd.DataFrame(prediction_data)

    return prediction


In [39]:
classified_resumes = classify_resumes(df)
print(classified_resumes.describe())


                 ID      proba_1      proba_2      proba_3
count  2.484000e+03  2484.000000  2484.000000  2484.000000
mean   3.182616e+07     0.644028     0.052673     0.028451
std    2.145735e+07     0.107131     0.055247     0.019683
min    3.547447e+06     0.211388     0.014404     0.013101
25%    1.754430e+07     0.636853     0.025026     0.019718
50%    2.521031e+07     0.675584     0.033092     0.023185
75%    3.611444e+07     0.701757     0.050650     0.028731
max    9.980612e+07     0.850151     0.337507     0.218359


In [40]:
def add_original_category_and_fp(resumes):


    categories = df[['ID', 'Category']]

    resumes = resumes.merge(categories, on='ID', how='left')
    resumes = resumes.assign(fp=False)
    for index, row in resumes.iterrows():
        if not row['Category'] == row['category_1']:
            resumes.at[index, 'fp'] = True
    return resumes

classified_resumes = add_original_category_and_fp(classified_resumes)


false_positives = classified_resumes[classified_resumes['fp'] == True]


print(f"The amount of false positives (fp) is {false_positives.count()['ID']}")
print(f"The percentage of false positives (fp) is: {false_positives.count()['ID'] / classified_resumes.count()['ID'] * 100} %")

false_positives.describe()

The amount of false positives (fp) is 134
The percentage of false positives (fp) is: 5.394524959742351 %


Unnamed: 0,ID,proba_1,proba_2,proba_3
count,134.0,134.0,134.0,134.0
mean,29577510.0,0.370729,0.172457,0.072825
std,21259020.0,0.134713,0.07856,0.044874
min,10265060.0,0.211388,0.03033,0.017428
25%,15522300.0,0.245629,0.103432,0.038378
50%,23410150.0,0.341879,0.183122,0.058666
75%,31898600.0,0.440122,0.22981,0.095765
max,99561380.0,0.689139,0.337507,0.218359
