## Data cleaning and processing

### Importing packages

In [19]:
import pandas as pd
import re

### Loading datasets

In [None]:
# NOTE: this file holds job descriptions (JD), not resumes.
# TODO: decide whether it should be used alongside the resume datasets.
df1 = pd.read_csv(filepath_or_buffer='data/job_title_des.csv')

# Preview the first rows, then list the distinct job titles.
print(df1.head(n=5))
print(df1['Job Title'].unique())

   Unnamed: 0             Job Title  \
0           0     Flutter Developer   
1           1      Django Developer   
2           2      Machine Learning   
3           3         iOS Developer   
4           4  Full Stack Developer   

                                     Job Description  
0  We are looking for hire experts flutter develo...  
1  PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...  
2  Data Scientist (Contractor)\n\nBangalore, IN\n...  
3  JOB DESCRIPTION:\n\nStrong framework outside o...  
4  job responsibility full stack engineer – react...  
['Flutter Developer' 'Django Developer' 'Machine Learning' 'iOS Developer'
 'Full Stack Developer' 'Java Developer' 'JavaScript Developer'
 'DevOps Engineer' 'Software Engineer' 'Database Administrator'
 'Wordpress Developer' 'PHP Developer' 'Backend Developer'
 'Network Administrator' 'Node js developer']


In [13]:
# Resume dataset stored with a .txt extension; the file is comma-separated,
# so it is parsed as CSV. Judging by the file name and the preview, the text
# appears to be preprocessed already (lowercase, no punctuation).
# pandas is imported once in the imports cell at the top of the notebook,
# so it is not re-imported here.
df2 = pd.read_csv('data/Preprocessed_Data.txt', sep=',')

# Preview the first rows, then list the distinct category labels.
print(df2.head())
print(df2['Category'].unique())

     Category                                               Text
0  Accountant  education omba executive leadership university...
1  Accountant  howard gerrard accountant deyjobcom birmingham...
2  Accountant  kevin frank senior accountant inforesumekraftc...
3  Accountant  place birth nationality olivia ogilvy accounta...
4  Accountant  stephen greet cpa senior accountant 9 year exp...
['Accountant' 'Advocate' 'Agriculture' 'Apparel' 'Architecture' 'Arts'
 'Automobile' 'Aviation' 'Banking' 'Blockchain' 'BPO'
 'Building and Construction' 'Business Analyst' 'Civil Engineer'
 'Consultant' 'Data Science' 'Database' 'Designing' 'DevOps'
 'Digital Media' 'DotNet Developer' 'Education' 'Electrical Engineering'
 'ETL Developer' 'Finance' 'Food and Beverages' 'Health and Fitness'
 'Human Resources' 'Information Technology' 'Java Developer' 'Management'
 'Mechanical Engineer' 'Network Security Engineer' 'Operations Manager'
 'PMO' 'Public Relations' 'Python Developer' 'React Developer' 'Sales'


In [12]:
# Raw resume dataset, decoded explicitly as UTF-8.
df3 = pd.read_csv(
    'data/UpdatedResumeDataSet.csv',
    encoding='utf-8',
)

# Preview the first rows, then list the distinct category labels.
print(df3.head(n=5))
print(df3['Category'].unique())

       Category                                             Resume
0  Data Science  Skills * Programming Languages: Python (pandas...
1  Data Science  Education Details \r\nMay 2013 to May 2017 B.E...
2  Data Science  Areas of Interest Deep Learning, Control Syste...
3  Data Science  Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4  Data Science  Education Details \r\n MCA   YMCAUST,  Faridab...
['Data Science' 'HR' 'Advocate' 'Arts' 'Web Designing'
 'Mechanical Engineer' 'Sales' 'Health and fitness' 'Civil Engineer'
 'Java Developer' 'Business Analyst' 'SAP Developer' 'Automation Testing'
 'Electrical Engineering' 'Operations Manager' 'Python Developer'
 'DevOps Engineer' 'Network Security Engineer' 'PMO' 'Database' 'Hadoop'
 'ETL Developer' 'DotNet Developer' 'Blockchain' 'Testing']


In [24]:
def preprocess_resume(text):
    """
    Clean raw resume text.

    The following are each replaced by a space, in this order:
      1. tokens starting with ``http`` (web links),
      2. standalone ``RT`` / ``cc`` tokens,
      3. hashtags (``#word``) and mentions (``@word``),
      4. ASCII punctuation, including the backslash,
      5. runs of non-ASCII characters,
      6. runs of whitespace (spaces, tabs, newlines).
    Leading and trailing whitespace is then trimmed.

    Parameters
    ----------
    text : str
        Raw resume text. Any non-string value (e.g. ``None`` or the float
        ``NaN`` that pandas uses for missing cells) is treated as empty text.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # re.sub raises TypeError on non-strings, which would abort a whole
    # Series.apply() call on the first missing value.
    if not isinstance(text, str):
        return ''

    # Order matters: links, hashtags and mentions must be stripped while
    # their marker characters (':', '/', '#', '@') are still in the text,
    # i.e. before the punctuation pass.
    patterns = (
        r'http\S+',          # web links
        r'\b(RT|cc)\b',      # 'RT' (retweet) and 'cc' tokens
        r'#\w+',             # hashtags
        r'@\w+',             # mentions
        # ASCII punctuation. '-' is escaped so it is a literal rather than
        # a ',' .. '.' range, and '\\' is listed explicitly: in the class
        # '[\]' the backslash only escapes ']', so a backslash in the text
        # would otherwise never be matched.
        r"[!\"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]",
        r'[^\x00-\x7F]+',    # non-ASCII characters
        r'\s+',              # collapse whitespace runs to a single space
    )
    for pattern in patterns:
        text = re.sub(pattern, ' ', text)

    return text.strip()


# Align the text column name of df2 with df3's 'Resume' column so the two
# frames stack into a single text column.
df2 = df2.rename(columns={'Text': 'Resume'})
combined_df = pd.concat([df2, df3], ignore_index=True)

# Drop rows that have no resume text (they cannot be cleaned), remove
# exact-duplicate resumes, and reset the index.
combined_df = (
    combined_df
    .dropna(subset=['Resume'])
    .drop_duplicates(subset='Resume')
    .reset_index(drop=True)
)

# NOTE(review): category labels are not harmonized between the two sources;
# the printed label lists show e.g. 'HR' vs 'Human Resources' and
# 'Health and fitness' vs 'Health and Fitness'. TODO: confirm whether these
# should be merged before the dataset is used for modelling.

# Apply cleaning
combined_df['clean_text'] = combined_df['Resume'].apply(preprocess_resume)

print(combined_df.head())
print(combined_df['Category'].unique())

     Category                                             Resume  \
0  Accountant  education omba executive leadership university...   
1  Accountant  howard gerrard accountant deyjobcom birmingham...   
2  Accountant  kevin frank senior accountant inforesumekraftc...   
3  Accountant  place birth nationality olivia ogilvy accounta...   
4  Accountant  stephen greet cpa senior accountant 9 year exp...   

                                          clean_text  
0  education omba executive leadership university...  
1  howard gerrard accountant deyjobcom birmingham...  
2  kevin frank senior accountant inforesumekraftc...  
3  place birth nationality olivia ogilvy accounta...  
4  stephen greet cpa senior accountant 9 year exp...  
['Accountant' 'Advocate' 'Agriculture' 'Apparel' 'Architecture' 'Arts'
 'Automobile' 'Aviation' 'Banking' 'Blockchain' 'BPO'
 'Building and Construction' 'Business Analyst' 'Civil Engineer'
 'Consultant' 'Data Science' 'Database' 'Designing' 'DevOps'
 'Digital 

In [25]:
# Persist the combined, cleaned dataset; index=False leaves the row index
# out of the file.
combined_df.to_csv(
    path_or_buf='data/clean_resume_dataset.csv',
    index=False,
)
