In [None]:
all_upwork_jobs_2024-02-07-2024-03-24 (2)

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
print("Downloading NLTK data...")
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
print(" NLTK data downloaded successfully")

Downloading NLTK data...
 NLTK data downloaded successfully


In [3]:
# Load the data
print("Loading data...")
df = pd.read_csv('all_upwork_jobs_2024-02-07-2024-03-24 (2).csv')  
print(f"Data loaded successfully: {df.shape[0]} rows, {df.shape[1]} columns")

# Display basic info
print("\n" + "="*50)
print("DATASET OVERVIEW")
print("="*50)
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

Loading data...
Data loaded successfully: 244828 rows, 8 columns

DATASET OVERVIEW
Shape: (244828, 8)
Columns: ['title', 'link', 'published_date', 'is_hourly', 'hourly_low', 'hourly_high', 'budget', 'country']
Memory usage: 93.19 MB


In [5]:
# Check missing values
print("\n" + "="*50)
print("MISSING VALUES ANALYSIS")
print("="*50)
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100
missing_df = pd.DataFrame({
    'Column': missing_data.index,
    'Missing Count': missing_data.values,
    'Missing %': missing_percent.values
})
print(missing_df)


MISSING VALUES ANALYSIS
           Column  Missing Count  Missing %
0           title              1   0.000408
1            link              1   0.000408
2  published_date              0   0.000000
3       is_hourly              0   0.000000
4      hourly_low         142406  58.165733
5     hourly_high         146053  59.655350
6          budget         140937  57.565720
7         country           5077   2.073701


In [9]:
# Data types
print("\n" + "="*50)
print("DATA TYPES")
print("="*50)
print(df.dtypes)




DATA TYPES
title              object
link               object
published_date     object
is_hourly            bool
hourly_low        float64
hourly_high       float64
budget            float64
country            object
dtype: object


In [15]:
print("\n" + "="*50)
print("STARTING DATA CLEANING PROCESS")
print("="*50)
# Step 1: Date cleaning
print("\n Step 1: Cleaning date columns...")
df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')
df['year'] = df['published_date'].dt.year
df['month'] = df['published_date'].dt.month
df['day_of_week'] = df['published_date'].dt.day_name()
df['hour'] = df['published_date'].dt.hour
df['day_of_month'] = df['published_date'].dt.day
print(" Date cleaning completed")

# Step 2: Numeric data cleaning
print("\n Step 2: Cleaning numeric columns...")

# Clean hourly rates
df['hourly_low'] = pd.to_numeric(df['hourly_low'], errors='coerce')
df['hourly_high'] = pd.to_numeric(df['hourly_high'], errors='coerce')
df['budget'] = pd.to_numeric(df['budget'], errors='coerce')
# Calculate average hourly rate
df['avg_hourly_rate'] = np.where(
    df['is_hourly'] == True,
    (df['hourly_low'].fillna(0) + df['hourly_high'].fillna(0)) / 2,
    np.nan
)

# Remove outliers (rates above $200/hour or budgets above $50000)
df.loc[df['avg_hourly_rate'] > 200, 'avg_hourly_rate'] = np.nan
df.loc[df['budget'] > 50000, 'budget'] = np.nan

print(" Numeric data cleaning completed")



STARTING DATA CLEANING PROCESS

 Step 1: Cleaning date columns...
 Date cleaning completed

 Step 2: Cleaning numeric columns...
 Numeric data cleaning completed


In [21]:
# Step 3: Text cleaning and preprocessing
print("\n Step 3 Text preprocessing and stopword removal...")

# Get English stopwords
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Additional stopwords specific to job postings
custom_stopwords = {
    'job', 'work', 'position', 'role', 'opportunity', 'candidate', 
    'experience', 'skill', 'requirement', 'looking', 'needed', 'want',
    'seeking', 'hiring', 'required', 'preferred', 'must', 'should',
    'will', 'able', 'new', 'project', 'company', 'team', 'client',
    'service', 'business', 'help', 'get', 'make', 'time', 'good',
    'one', 'two', 'first', 'last', 'also', 'well', 'way', 'come',
    'go', 'see', 'know', 'take', 'use', 'find', 'give', 'tell',
    'ask', 'seem', 'feel', 'try', 'leave', 'call', 'move', 'live',
    'believe', 'hold', 'bring', 'happen', 'write', 'sit', 'stand',
    'hear', 'let', 'begin', 'seem', 'turn', 'start', 'might', 'show',
    'every', 'great', 'small', 'public', 'able'
}
stop_words.update(custom_stopwords)
# Clean job titles
df['title_original'] = df['title'].copy()
df['title_clean'] = df['title'].str.lower()

# Remove special characters and numbers
df['title_clean'] = df['title_clean'].str.replace(r'[^a-zA-Z\s]', ' ', regex=True)
df['title_clean'] = df['title_clean'].str.replace(r'\s+', ' ', regex=True)
df['title_clean'] = df['title_clean'].str.strip()

# Tokenize and remove stopwords
df['title_tokens'] = df['title_clean'].apply(
    lambda x: [word for word in word_tokenize(str(x)) if word not in stop_words and len(word) > 2]
)

# Apply stemming
df['title_stemmed'] = df['title_tokens'].apply(
    lambda x: [stemmer.stem(word) for word in x]
)

# Create processed title
df['title_processed'] = df['title_stemmed'].apply(lambda x: ' '.join(x))

print(" Text preprocessing completed")


 Step 3 Text preprocessing and stopword removal...
 Text preprocessing completed


In [43]:
# Step 4: DATA-DRIVEN CATEGORY DISCOVERY
print("\n Step 4: Discovering job categories from data...")

def extract_key_terms_from_titles():
    """Extract most common meaningful terms from job titles"""
    # Get all processed tokens
    all_tokens = []
    for tokens in df['title_stemmed']:
        all_tokens.extend(tokens)
    
    # Count frequency of terms
    term_counts = Counter(all_tokens)
    
    # Get most common terms (excluding very rare ones)
    min_frequency = max(2, len(df) // 100)  # At least 2 or 1% of dataset
    common_terms = {term: count for term, count in term_counts.items() 
                   if count >= min_frequency}
    
    return common_terms

def create_data_driven_categories():
    """Create categories based on actual data patterns"""
    
    # Extract key terms
    key_terms = extract_key_terms_from_titles()
    print(f"Found {len(key_terms)} key terms in job titles")
    
    # Display top terms
    top_terms = Counter(key_terms).most_common(20)
    print("\nTop 20 most frequent terms:")
    for term, count in top_terms:
        print(f"  {term}: {count} occurrences")
    
    # Create category mapping based on common patterns
    categories = {}
    
    # Development related terms
    dev_terms = [term for term, count in key_terms.items() 
                if any(keyword in term.lower() for keyword in 
                      ['develop', 'program', 'cod', 'app', 'web', 'mobil', 'softwar', 
                       'frontend', 'backend', 'fullstack', 'full', 'stack', 'engin'])]
    
    # Design related terms
    design_terms = [term for term, count in key_terms.items() 
                   if any(keyword in term.lower() for keyword in 
                         ['design', 'ui', 'ux', 'graphic', 'logo', 'brand', 'creativ', 'visual'])]
    
    # Data related terms
    data_terms = [term for term, count in key_terms.items() 
                 if any(keyword in term.lower() for keyword in 
                       ['data', 'analyst', 'analyt', 'scienc', 'research', 'databas', 'sql'])]
    
    # Marketing related terms
    marketing_terms = [term for term, count in key_terms.items() 
                      if any(keyword in term.lower() for keyword in 
                            ['market', 'seo', 'social', 'media', 'advertis', 'campaign', 'content'])]
    
    # Writing related terms
    writing_terms = [term for term, count in key_terms.items() 
                    if any(keyword in term.lower() for keyword in 
                          ['writ', 'content', 'blog', 'copywr', 'editor', 'translat'])]
    
    # Support related terms
    support_terms = [term for term, count in key_terms.items() 
                    if any(keyword in term.lower() for keyword in 
                          ['support', 'custom', 'servic', 'help', 'assist'])]
    
    # Management related terms
    mgmt_terms = [term for term, count in key_terms.items() 
                 if any(keyword in term.lower() for keyword in 
                       ['manag', 'project', 'product', 'coordinat', 'lead', 'director'])]
    
    # Finance related terms
    finance_terms = [term for term, count in key_terms.items() 
                    if any(keyword in term.lower() for keyword in 
                          ['financ', 'account', 'bookkeep', 'budget', 'tax', 'payrol'])]
    
    # Store discovered categories
    discovered_categories = {
        'Software Development': dev_terms,
        'Design & Creative': design_terms,
        'Data & Analytics': data_terms,
        'Marketing & Sales': marketing_terms,
        'Writing & Content': writing_terms,
        'Customer Support': support_terms,
        'Business & Management': mgmt_terms,
        'Finance & Accounting': finance_terms
    }
    
    print("\nDiscovered categories and their key terms:")
    for category, terms in discovered_categories.items():
        if terms:  # Only show categories with terms
            print(f"\n{category}:")
            print(f"  Key terms: {', '.join(terms[:10])}")  # Show first 10 terms
            print(f"  Total terms: {len(terms)}")
    
    return discovered_categories, key_terms

# Discover categories from data
discovered_categories, all_key_terms = create_data_driven_categories()



 Step 4: Discovering job categories from data...
Found 73 key terms in job titles

Top 20 most frequent terms:
  design: 27455 occurrences
  develop: 23490 occurrences
  websit: 17725 occurrences
  video: 16499 occurrences
  expert: 15388 occurrences
  need: 10681 occurrences
  manag: 10586 occurrences
  market: 10562 occurrences
  amp: 9690 occurrences
  specialist: 9148 occurrences
  app: 8843 occurrences
  editor: 8314 occurrences
  assist: 8312 occurrences
  media: 8268 occurrences
  creat: 7890 occurrences
  social: 7718 occurrences
  youtub: 7697 occurrences
  data: 7163 occurrences
  content: 6889 occurrences
  wordpress: 6787 occurrences

Discovered categories and their key terms:

Software Development:
  Key terms: full, develop, app, engin, websit, web, mobil, applic
  Total terms: 8

Design & Creative:
  Key terms: design, logo, build, graphic, brand
  Total terms: 5

Data & Analytics:
  Key terms: data, research
  Total terms: 2

Marketing & Sales:
  Key terms: media, mark

In [47]:
# Step 5: Apply data-driven categorization
print("\n Step 5: Applying data-driven job categorization...")

def categorize_job_title(processed_title, categories):
    """Categorize a job title based on discovered patterns"""
    title_lower = processed_title.lower()
    
    # Score each category
    category_scores = {}
    for category, terms in categories.items():
        score = sum(1 for term in terms if term in title_lower)
        if score > 0:
            category_scores[category] = score
    
    # Return category with highest score, or 'Other' if no match
    if category_scores:
        return max(category_scores, key=category_scores.get)
    else:
        return 'Other'

# Apply enhanced categorization
df['job_category'] = 'Other'  # Initialize

for idx, row in df.iterrows():
    if pd.notna(row['title_processed']) and len(str(row['title_processed']).strip()) > 0:
        df.loc[idx, 'job_category'] = categorize_job_title(row['title_processed'], discovered_categories)

# Additional pass: Check for uncategorized jobs and try alternative matching
uncategorized = df[df['job_category'] == 'Other']
print(f"\nFirst pass: {len(uncategorized)} jobs remain uncategorized")

if len(uncategorized) > 0:
    print("\nAnalyzing uncategorized jobs for patterns:")
    sample_uncategorized = uncategorized['title_original'].head(10).tolist()
    for i, title in enumerate(sample_uncategorized, 1):
        print(f"  {i:2d}. {title}")
    
    # Try to capture more with relaxed criteria
    for idx, row in uncategorized.iterrows():
        title_orig = str(row['title_original']).lower()
        
        # Manual rules for common patterns not caught
        if any(word in title_orig for word in ['media', 'buyer', 'ads', 'campaign', 'marketing', 'seo', 'social']):
            df.loc[idx, 'job_category'] = 'Marketing & Advertising'
        elif any(word in title_orig for word in ['app', 'web', 'development', 'developer', 'programming', 'coding']):
            df.loc[idx, 'job_category'] = 'Software Development'
        elif any(word in title_orig for word in ['design', 'designer', 'creative', 'visual', 'logo', '3d', 'graphic']):
            df.loc[idx, 'job_category'] = 'Design & Creative'
        elif any(word in title_orig for word in ['writer', 'writing', 'content', 'translation', 'portuguese', 'blog']):
            df.loc[idx, 'job_category'] = 'Writing & Content'
        elif any(word in title_orig for word in ['data', 'analyst', 'dashboard', 'report', 'looker', 'studio']):
            df.loc[idx, 'job_category'] = 'Data & Analytics'
        elif any(word in title_orig for word in ['support', 'customer', 'service', 'help', 'chat']):
            df.loc[idx, 'job_category'] = 'Customer Support'
        elif any(word in title_orig for word in ['manager', 'management', 'coordinator', 'assistant', 'executive']):
            df.loc[idx, 'job_category'] = 'Business & Management'
        elif any(word in title_orig for word in ['sales', 'hunter', 'talent', 'recruit', 'business development']):
            df.loc[idx, 'job_category'] = 'Sales & Business Development'
        elif any(word in title_orig for word in ['store', 'shop', 'ecommerce', 'shopify', 'product', 'optimization']):
            df.loc[idx, 'job_category'] = 'E-commerce & Retail'

final_uncategorized = len(df[df['job_category'] == 'Other'])
print(f"After second pass: {final_uncategorized} jobs remain as 'Other'")

print(" Enhanced job categorization completed")


 Step 5: Applying data-driven job categorization...

First pass: 93258 jobs remain uncategorized

Analyzing uncategorized jobs for patterns:
   1. Want to fix the WordPress Plugin
   2. URGENT: Fix Emails Not Working on Discourse Installation
   3. Shopify Store speed Optimization
   4. Promo Video for Game
   5. Report Analysis
   6. I'm looking for a person who knows how to write algorithms in the Chrome browser extension
   7. Linkedin Ads Coach
   8. Shopify Shop Implementation
   9. CapCut genius
  10. Convert single page figma to Nextjs - Urgently
After second pass: 70026 jobs remain as 'Other'
 Enhanced job categorization completed


In [49]:
# Step 6: Create specific subcategories using clustering
print("\n Step 6: Creating specific subcategories using text clustering...")

# Use TF-IDF to create more specific categories
vectorizer = TfidfVectorizer(
    max_features=100,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2
)

# Only process non-empty titles
valid_titles = df[df['title_processed'].str.len() > 0]['title_processed']

if len(valid_titles) > 10:  # Only if we have enough data
    try:
        tfidf_matrix = vectorizer.fit_transform(valid_titles)
        
        # Determine optimal number of clusters (max 15 or 10% of data)
        n_clusters = min(15, max(3, len(valid_titles) // 10))
        
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(tfidf_matrix)
        
        # Create cluster labels
        feature_names = vectorizer.get_feature_names_out()
        cluster_labels = []
        
        for i in range(n_clusters):
            # Get top terms for this cluster
            cluster_center = kmeans.cluster_centers_[i]
            top_indices = cluster_center.argsort()[-3:][::-1]  # Top 3 terms
            top_terms = [feature_names[idx] for idx in top_indices]
            
            # Create readable label
            label = ' '.join(top_terms).title()
            cluster_labels.append(f"Cluster_{i+1}_{label.replace(' ', '_')}")
        
        # Apply cluster labels to valid titles
        df['specific_category'] = 'Uncategorized'
        valid_indices = df[df['title_processed'].str.len() > 0].index
        
        for idx, cluster_id in zip(valid_indices, clusters):
            df.loc[idx, 'specific_category'] = cluster_labels[cluster_id]
            
        print(f" Created {n_clusters} specific subcategories using clustering")
        
    except Exception as e:
        print(f"  Clustering failed: {e}")
        df['specific_category'] = df['job_category']  # Fallback to broad categories
else:
    df['specific_category'] = df['job_category']  # Fallback to broad categories
    print("  Too few valid titles for clustering, using broad categories")


 Step 6: Creating specific subcategories using text clustering...
 Created 15 specific subcategories using clustering


In [53]:
# Step 7: Country data cleaning
print("\n Step 7: Cleaning country data...")

# Standardize country names based on actual data
country_counts = df['country'].value_counts()
print("\nCountries in dataset:")
for country, count in country_counts.head(10).items():
    print(f"  {country}: {count} jobs")

# Simple country cleaning (you can extend this based on your data)
country_mapping = {
    'United States': 'USA',
    'United Kingdom': 'UK',
    'Deutschland': 'Germany',
    'Brasil': 'Brazil',
    'España': 'Spain',
    'United Arab Emirates':'UAE'
}

df['country_clean'] = df['country'].replace(country_mapping)
df['country_clean'] = df['country_clean'].fillna('Unknown')

print(" Country data cleaning completed")


 Step 7: Cleaning country data...

Countries in dataset:
  United States: 99797 jobs
  United Kingdom: 19129 jobs
  India: 15825 jobs
  Australia: 12617 jobs
  Canada: 11655 jobs
  Pakistan: 5289 jobs
  Germany: 4838 jobs
  Netherlands: 4435 jobs
  United Arab Emirates: 4038 jobs
  France: 3161 jobs
 Country data cleaning completed


In [55]:
# Step 8: Create additional features
print("\n Step 8: Creating additional features...")

# Is weekend posting
df['is_weekend'] = df['day_of_week'].isin(['Saturday', 'Sunday'])

# Is evening posting (after 6 PM)
df['is_evening_post'] = df['hour'] >= 18

# Title length features
df['title_length'] = df['title_original'].str.len()
df['title_word_count'] = df['title_original'].str.split().str.len()

# Has specific keywords (discovered from data)
top_keywords = Counter(all_key_terms).most_common(20)
urgent_keywords = [kw for kw, count in top_keywords if 'urgent' in kw.lower()]
remote_keywords = [kw for kw, count in top_keywords if any(term in kw.lower() for term in ['remote', 'home'])]
senior_keywords = [kw for kw, count in top_keywords if any(term in kw.lower() for term in ['senior', 'lead', 'principal'])]
junior_keywords = [kw for kw, count in top_keywords if any(term in kw.lower() for term in ['junior', 'entry', 'intern'])]

df['has_urgent'] = df['title_original'].str.contains('urgent|asap|immediate', case=False, na=False)
df['has_remote'] = df['title_original'].str.contains('remote|work from home|wfh', case=False, na=False)
df['has_senior'] = df['title_original'].str.contains('senior|sr\.|lead|principal', case=False, na=False)
df['has_junior'] = df['title_original'].str.contains('junior|jr\.|entry|intern', case=False, na=False)

print(" Feature engineering completed")


 Step 8: Creating additional features...
 Feature engineering completed


In [57]:
# Step 9: Final data validation
print("\n Step 9: Final data validation...")

# Remove rows with critical missing data
initial_rows = len(df)
df = df.dropna(subset=['title_processed', 'published_date'])
df = df[df['title_processed'].str.len() > 0]  # Remove empty processed titles

print(f"Removed {initial_rows - len(df)} rows with critical missing data")
print(f"Final dataset size: {len(df)} rows")

# Data quality summary
print("\n" + "="*60)
print("DATA-DRIVEN CLEANING SUMMARY")
print("="*60)
print(f" Original rows: {initial_rows}")
print(f" Final rows: {len(df)}")
print(f" Total unique terms discovered: {len(all_key_terms)}")
print(f" Job categories created: {df['job_category'].nunique()}")
print(f" Specific subcategories: {df['specific_category'].nunique()}")
print(f" Countries found: {df['country_clean'].nunique()}")
print(f" Date range: {df['published_date'].min()} to {df['published_date'].max()}")


 Step 9: Final data validation...
Removed 0 rows with critical missing data
Final dataset size: 244656 rows

DATA-DRIVEN CLEANING SUMMARY
 Original rows: 244656
 Final rows: 244656
 Total unique terms discovered: 73
 Job categories created: 12
 Specific subcategories: 15
 Countries found: 213
 Date range: 2023-11-02 09:22:02+00:00 to 2024-03-24 14:16:47+00:00


In [59]:
# Save cleaned data
output_file = 'cleaned_job_data_driven.csv'
df.to_csv(output_file, index=False)
print(f"\n Cleaned data saved to: {output_file}")



 Cleaned data saved to: cleaned_job_data_driven.csv


In [61]:
# Display discovered categories summary
print("\n" + "="*60)
print("DATA-DRIVEN JOB CATEGORIES SUMMARY")
print("="*60)
category_summary = df['job_category'].value_counts()
print("Broad Categories (discovered from data):")
for category, count in category_summary.items():
    percentage = count / len(df) * 100
    print(f"  {category:<25} {count:>6} jobs ({percentage:>5.1f}%)")

print(f"\nSpecific Subcategories (Top 10):")
specific_summary = df['specific_category'].value_counts().head(10)
for category, count in specific_summary.items():
    percentage = count / len(df) * 100
    print(f"  {category:<35} {count:>6} jobs ({percentage:>5.1f}%)")



DATA-DRIVEN JOB CATEGORIES SUMMARY
Broad Categories (discovered from data):
  Other                      70026 jobs ( 28.6%)
  Software Development       56842 jobs ( 23.2%)
  Design & Creative          31152 jobs ( 12.7%)
  Marketing & Sales          20286 jobs (  8.3%)
  Writing & Content          18035 jobs (  7.4%)
  Customer Support           13278 jobs (  5.4%)
  Business & Management      11539 jobs (  4.7%)
  Data & Analytics           10117 jobs (  4.1%)
  Marketing & Advertising     3798 jobs (  1.6%)
  Sales & Business Development   3452 jobs (  1.4%)
  E-commerce & Retail         3377 jobs (  1.4%)
  Finance & Accounting        2754 jobs (  1.1%)

Specific Subcategories (Top 10):
  Cluster_1_Assist_Need_Data          128400 jobs ( 52.5%)
  Cluster_11_Develop_Stack_Web         14075 jobs (  5.8%)
  Cluster_15_Websit_Develop_Design     12530 jobs (  5.1%)
  Cluster_4_Expert_Googl_Seo           12279 jobs (  5.0%)
  Cluster_8_Design_Web_Product         11481 jobs (  4.7%)
  C

In [63]:
# Show most common terms by category
print("\n" + "="*60)
print("TOP TERMS BY DISCOVERED CATEGORY")
print("="*60)
for category, terms in discovered_categories.items():
    if terms and category in df['job_category'].values:
        job_count = len(df[df['job_category'] == category])
        print(f"\n{category} ({job_count} jobs):")
        # Show terms sorted by frequency
        category_terms = [(term, all_key_terms[term]) for term in terms[:10]]
        category_terms.sort(key=lambda x: x[1], reverse=True)
        print(f"  Top terms: {', '.join([f'{term}({count})' for term, count in category_terms[:5]])}")



TOP TERMS BY DISCOVERED CATEGORY

Software Development (56842 jobs):
  Top terms: develop(23490), websit(17725), app(8843), web(6764), engin(3657)

Design & Creative (31152 jobs):
  Top terms: design(27455), graphic(6290), logo(5869), build(5508), brand(4610)

Data & Analytics (10117 jobs):
  Top terms: data(7163), research(3723)

Marketing & Sales (20286 jobs):
  Top terms: market(10562), media(8268), social(7718), content(6889), seo(5084)

Writing & Content (18035 jobs):
  Top terms: editor(8314), content(6889), writer(5625), translat(3933)

Customer Support (13278 jobs):
  Top terms: assist(8312), custom(4453)

Business & Management (11539 jobs):
  Top terms: manag(10586), product(6707), lead(4800)

Finance & Accounting (2754 jobs):
  Top terms: account(4279)


In [41]:
# Display sample of cleaned data
print("\n" + "="*60)
print("SAMPLE OF CLEANED DATA")
print("="*60)
sample_columns = ['title_original', 'title_processed', 'job_category', 'specific_category', 
                 'country_clean', 'avg_hourly_rate', 'budget', 'is_hourly', 'year', 'month']
print(df[sample_columns].head(10))

print("\n Data-driven cleaning completed successfully!")
print(f"Next step: Run your EDA analysis with: {output_file}")
print("\nKey improvements in this data-driven approach:")
print("1. Categories discovered from actual job titles in your dataset")
print("2. Term frequency analysis to identify most relevant keywords")
print("3.  Automatic clustering for specific subcategories")
print("4. Adaptive to your specific dataset characteristics")


SAMPLE OF CLEANED DATA
                                      title_original  \
0  Experienced Media Buyer For Solar Pannel and R...   
1                               Full Stack Developer   
2                                    SMMA Bubble App   
3             Talent Hunter Specialized in Marketing   
4                                      Data Engineer   
5               SEO for Portuguese Psychologist site   
6                   Want to fix the WordPress Plugin   
7  need Portuguese writers who can understand and...   
8  Looker Studio Dashboard for Leadgen and E-Comm...   
9  PHP/HTML/CSS WordPress Developer Needed for We...   

                                     title_processed           job_category  \
0  experienc media buyer solar pannel roof instal...      Marketing & Sales   
1                                 full stack develop   Software Development   
2                                     smma bubbl app   Software Development   
3                       talent hunter speci