In [None]:
## Loading data
import pandas as pd

# Path is relative to the directory the notebook kernel is started from.
file_path = "../../model/data/raw_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# List the available columns before choosing which ones to keep.
print(df.columns)

In [None]:
## Selecting columns
# Keep only the resume text and its category label; all other columns are dropped.
columns_to_keep = ["Resume_str", "Category"]
df = df[columns_to_keep]
df.head()

In [None]:
## Filtering records
# Only rows whose Category is one of these sectors are retained.
sectors_to_keep = ["INFORMATION-TECHNOLOGY", "HEALTHCARE", "CONSTRUCTION"]
is_kept_sector = df["Category"].isin(sectors_to_keep)
filtered_df = df[is_kept_sector]
filtered_df.head()


In [None]:
## Analyzing category distribution
# Number of rows per category (value_counts orders them from most to least frequent).
category_counts = filtered_df["Category"].value_counts()

filtered_row_count = len(filtered_df)
print("length of dataset", filtered_row_count)
print(category_counts)

In [None]:
## Undersampling: shrink every category to the size of the smallest one
min_size = int(category_counts.min())

# Draw min_size rows from each category; the fixed random_state makes the draw repeatable.
balanced_parts = []
for category in category_counts.index:
    rows_in_category = filtered_df[filtered_df['Category'] == category]
    balanced_parts.append(rows_in_category.sample(min_size, random_state=100))
sampled_filtered_df = pd.concat(balanced_parts)

# Verify the result: every category should now have min_size rows.
sampled_category_counts = sampled_filtered_df['Category'].value_counts()
print("length of dataset", len(sampled_filtered_df))
print(sampled_category_counts)

In [None]:
## Clean 'Resume_str' fields

# Show the first resume before cleaning, for comparison with the print at the end of the cell.
print(sampled_filtered_df["Resume_str"].iloc[0])

resume_text = sampled_filtered_df["Resume_str"]
# Remove every character that is not an ASCII letter (either case), whitespace, or a period.
resume_text = resume_text.str.replace(r"[^a-zA-Z\s\.]", "", regex=True)
# Collapse each run of one or more whitespace characters into a single space.
resume_text = resume_text.str.replace(r"\s+", " ", regex=True)
# Trim leading/trailing whitespace, then lowercase everything.
resume_text = resume_text.str.strip().str.lower()

# .loc[<row_indexer>, <column_indexer>]: overwrite the column for all rows.
sampled_filtered_df.loc[:, "Resume_str"] = resume_text

print(sampled_filtered_df["Resume_str"].iloc[0])

In [7]:
## Saving raw data to test_data.csv
# NOTE(review): this writes filtered_df (sector-filtered, but neither undersampled nor
# cleaned), not sampled_filtered_df -- confirm that is the intended frame.
# The DataFrame index is written as the first CSV column (to_csv default index=True).
filtered_df.to_csv(path_or_buf="test_data.csv")