In [1]:
## Loading data
## Read the raw resume CSV into `df` and list its columns so the
## available fields are visible before any selection happens.
import pandas as pd

file_path = "../../model/data/raw_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
print(df.columns)

Index(['ID', 'Resume_str', 'Resume_html', 'Category'], dtype='object')


In [2]:
## Selecting columns
## Keep only the resume text and its category label; `df` is rebound to
## the narrowed frame, so the other columns are no longer reachable via `df`.
df = df.loc[:, ["Resume_str", "Category"]]
df.head()

Unnamed: 0,Resume_str,Category
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR
1,"HR SPECIALIST, US HR OPERATIONS ...",HR
2,HR DIRECTOR Summary Over 2...,HR
3,HR SPECIALIST Summary Dedica...,HR
4,HR MANAGER Skill Highlights ...,HR


In [3]:
## Filtering records
## Restrict the dataset to the sectors of interest. Matching is an exact
## string comparison against the values in the "Category" column.
sectors_to_keep = ["INFORMATION-TECHNOLOGY", "HEALTHCARE", "CONSTRUCTION"]
filtered_df = df.loc[df["Category"].isin(sectors_to_keep)]
filtered_df.head()


Unnamed: 0,Resume_str,Category
217,INFORMATION TECHNOLOGY Summar...,INFORMATION-TECHNOLOGY
218,INFORMATION TECHNOLOGY SPECIALIST\tGS...,INFORMATION-TECHNOLOGY
219,INFORMATION TECHNOLOGY SUPERVISOR ...,INFORMATION-TECHNOLOGY
220,INFORMATION TECHNOLOGY INSTRUCTOR ...,INFORMATION-TECHNOLOGY
221,INFORMATION TECHNOLOGY MANAGER/ANALYS...,INFORMATION-TECHNOLOGY


In [4]:
## Analyzing category distribution
## Count the rows per category in the filtered data, then report the total
## row count followed by the per-category breakdown.
category_counts = filtered_df["Category"].value_counts()
print("length of dataset", filtered_df.shape[0])
print(category_counts)

length of dataset 347
Category
INFORMATION-TECHNOLOGY    120
HEALTHCARE                115
CONSTRUCTION              112
Name: count, dtype: int64


In [5]:
## Undersampling category distribution
## Balance the classes by drawing, from every category, as many rows as the
## smallest category contains. Each category is sampled independently with
## the same fixed seed, so the selection is reproducible across runs.
min_size = int(category_counts.min())
sampled_filtered_df = pd.concat([
    filtered_df.loc[filtered_df["Category"].eq(category)].sample(
        n=min_size, random_state=100
    )
    for category in category_counts.index
])

## Report the size and per-category counts of the balanced frame.
sampled_category_counts = sampled_filtered_df["Category"].value_counts()
print("length of dataset", sampled_filtered_df.shape[0])
print(sampled_category_counts)

length of dataset 336
Category
INFORMATION-TECHNOLOGY    112
HEALTHCARE                112
CONSTRUCTION              112
Name: count, dtype: int64


In [6]:
## Clean 'Resume_str' fields


def _normalize_resume_text(resume_text):
    """Return a cleaned copy of a string Series of resume text.

    Steps, applied in this order:
      1. Delete every character that is not an ASCII letter (either case),
         a whitespace character, or a period. Deleted characters are removed
         outright, not replaced by a space, so neighbouring fragments join.
      2. Collapse every run of whitespace (spaces, tabs, newlines) into a
         single space.
      3. Trim leading and trailing whitespace.
      4. Lowercase the result.
    """
    letters_and_periods = resume_text.str.replace(r"[^a-zA-Z\s\.]", "", regex=True)
    single_spaced = letters_and_periods.str.replace(r"\s+", " ", regex=True)
    return single_spaced.str.strip().str.lower()


## Show the first resume before cleaning, for comparison.
print(sampled_filtered_df["Resume_str"].iat[0])

## .loc[<row_indexer>, <column_indexer>]: overwrite the column on the existing frame.
sampled_filtered_df.loc[:, "Resume_str"] = _normalize_resume_text(
    sampled_filtered_df["Resume_str"]
)

## Show the same resume after cleaning.
print(sampled_filtered_df["Resume_str"].iat[0])

         INFORMATION TECHNOLOGY CONSULTANT, MANAGING MEMBER           Summary    A versatile, analytic IT Specialist with a proven record of success within large institutions as well as entrepreneurial organizations. Thrives on challenge and solves problems with creativity and persistence. A data-driven team leader skilled in both producing and communicating results.      Experience      Information Technology Consultant, Managing Member    January 2017   to   Current     Company Name   －   City  ,   State       Worked with product designers and product managers to design user interactions in applications.    Envisioned inspired new products, features and flows.   Answered user inquiries regarding computer software and hardware operation.    Installed and performed repairs to hardware, software and peripheral equipment, following design and installation specifications.  Set up equipment for employee use.   Conducted computer diagnostics to investigate and resolve problems and provide t

In [7]:
## Saving raw data to test_data.csv
## This writes `filtered_df`: the rows restricted to the kept sectors, with
## the original (uncleaned) resume text and without undersampling.
## NOTE(review): confirm that `filtered_df`, and not the balanced and cleaned
## `sampled_filtered_df`, is the frame intended for this file.
## The DataFrame index is written as the first CSV column because `index`
## is left at its default.
filtered_df.to_csv(path_or_buf="test_data.csv")