In [None]:
## Loading data
import os
import re

import pandas as pd

# Read the raw resume dataset and list the columns it provides.
df = pd.read_csv(filepath_or_buffer=os.path.join("raw_data.csv"))
print(df.columns)

In [None]:
## Selecting columns
# Keep only the resume text and its category; every other column is dropped.
df = df.loc[:, ["Resume_str", "Category"]]
df.head()

In [None]:
## Counting how many rows each category has
df.loc[:, "Category"].value_counts()

In [None]:
## Filtering records
# Keep only the rows whose category is one of the selected sectors.
sectors_to_keep = ["INFORMATION-TECHNOLOGY", "HEALTHCARE", "CONSTRUCTION"]
# .copy() makes filtered_df an independent frame, so assigning to its columns
# afterwards does not raise pandas' SettingWithCopyWarning.
filtered_df = df[df["Category"].isin(sectors_to_keep)].copy()
filtered_df.head()


In [None]:
## Normalizing category labels (lowercase, hyphens -> spaces) and analyzing category distribution
filtered_df.loc[:, "Category"] = (
    filtered_df.loc[:, "Category"]
    .str.lower()
    .str.replace("-", " ")
)

# Rows per (normalized) category, most frequent first.
category_counts = filtered_df["Category"].value_counts()
print("length of dataset", len(filtered_df))
print(category_counts)

In [None]:
## Undersampling category distribution
# Every category is cut down to the size of the smallest one.
min_size = int(category_counts.min())

# Each category is sampled on its own with the same fixed seed, then the
# per-category samples are stacked in the order of category_counts.
sampled_filtered_df = pd.concat(
    [
        filtered_df.loc[filtered_df["Category"].eq(category)].sample(
            n=min_size, random_state=100
        )
        for category in category_counts.index
    ]
)

sampled_category_counts = sampled_filtered_df["Category"].value_counts()
print("length of dataset", len(sampled_filtered_df))
print(sampled_category_counts)

In [None]:
## Simplify 'Resume_str' structure

# First resume before cleaning, followed by an empty line.
print(sampled_filtered_df["Resume_str"].iloc[0] + "\n")

resume_text = sampled_filtered_df["Resume_str"]
resume_text = resume_text.str.replace(r"[^a-zA-Z\s\.]", "", regex=True)  ## Drop everything except ASCII letters, whitespace and periods
resume_text = resume_text.str.replace(r"\s+", " ", regex=True)  ## Collapse every whitespace run into one space
resume_text = resume_text.str.strip().str.lower()
sampled_filtered_df.loc[:, "Resume_str"] = resume_text  ## .loc[<row_indexer>, <column_indexer>]

# Same resume after cleaning.
print(sampled_filtered_df["Resume_str"].iloc[0])

In [None]:
## Neutralizing cv's by removing unwanted words (too specific or too unspecific)
# Maps each unwanted word/phrase to the text that replaces it ("" = remove).
replacement_mapping = {
    "information technology" : "",
    "healthcare" : "",
    "construction" : "",
    "summary" : "",
    "epic" : "",
    "good" : "",
    "specialist" : "",
    "professional" : ""
}

def burn(text: str) -> str:
    """Replace whole-word occurrences of every `replacement_mapping` key.

    Matching is anchored on word boundaries, so a key is only replaced when
    it stands alone: "good" is removed from "a good fit", while "goodwill",
    "goods" or "reconstruction" are left untouched instead of being cut down
    to fragments. Matching is case-sensitive and the keys are applied one
    after another in mapping order.

    Args:
        text: The text to neutralize.

    Returns:
        The text with every matched key replaced by its mapped value. The
        whitespace around removed words is left in place.
    """
    for old, new in replacement_mapping.items():
        # re.escape keeps the key literal; the callable replacement keeps
        # `new` literal as well (no backslash/group-reference processing).
        text = re.sub(rf"\b{re.escape(old)}\b", lambda _match: new, text)
    return text

# Remove the unwanted words, then tidy up the whitespace they leave behind.
sampled_filtered_df["Resume_str"] = (
    sampled_filtered_df["Resume_str"]
    .apply(burn)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

print(sampled_filtered_df["Resume_str"].iloc[0])

In [None]:
## View result of preprocessing
# Rename the target column to "Label" and renumber the rows from 0; the old
# index is discarded. Chained reassignment replaces the inplace=True calls.
sampled_filtered_df = (
    sampled_filtered_df
    .rename(columns={"Category": "Label"})
    .reset_index(drop=True)
)
print("length of dataset", len(sampled_filtered_df))
# Preview up to 20 random rows. Capping at the frame length keeps .sample()
# from raising a ValueError when fewer than 20 rows are available.
sampled_filtered_df.sample(min(20, len(sampled_filtered_df)))


In [None]:
## Saving the preprocessed data to test_data.csv (index column omitted)
sampled_filtered_df.to_csv(path_or_buf=os.path.join("test_data.csv"), index=False)