In [27]:
# ------------------------------------------------------
#Load Dataset
# ------------------------------------------------------

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv(r"C:\Users\SAYA\OneDrive\Desktop\WhatsApp\global_freelancers_raw.csv")
print(df.shape)

(1000, 12)


In [28]:
# =========================================
# 3.1 CLEANING STEPS
# =========================================

# -------- Inconsistent Numeric Formats --------
# hourly_rate (USD): remove symbols like $, USD, commas
df.loc[:, 'hourly_rate (USD)'] = (
    df['hourly_rate (USD)']
    .astype(str)
    .str.replace(r'[^0-9.]', '', regex=True)
)

df.loc[:, 'hourly_rate (USD)'] = pd.to_numeric(
    df['hourly_rate (USD)'], errors='coerce'
)

# -------- Inconsistent Categorical Formats --------

# Standardize is_active
df.loc[:, 'is_active'] = (
    df['is_active']
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({
        'yes': 1, 'y': 1, 'true': 1, '1': 1,
        'no': 0, 'n': 0, 'false': 0, '0': 0
    })
)

# Standardize gender → Female / Male
df.loc[:, 'gender'] = (
    df['gender']
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({
        'f': 'Female',
        'female': 'Female',
        'm': 'Male',
        'male': 'Male'
    })
)

# -------- Missing Values Handling --------

# Numerical columns → median
numerical_cols = ['age', 'years_of_experience', 'hourly_rate (USD)', 'rating']

for col in numerical_cols:
    df.loc[:, col] = df[col].astype(float)
    df.loc[:, col] = df[col].fillna(df[col].median())

# Categorical columns → mode
df.loc[:, 'is_active'] = df['is_active'].fillna(df['is_active'].mode()[0])
df.loc[:, 'gender'] = df['gender'].fillna(df['gender'].mode()[0])

# -------- Target Variable Handling --------
# Remove records with missing client_satisfaction
df = df.dropna(subset=['client_satisfaction'])

print("\nAfter Cleaning Steps (3.1)")
print(df.head())
print("\nAfter Cleaning Steps Dataset Shape:", df.shape)
print(df.info())

# =========================================
# 3.2 TRANSFORMATION TECHNIQUES
# =========================================

# -------- Target Variable Formatting --------
# Remove % and convert to numeric
df.loc[:, 'client_satisfaction'] = (
    df['client_satisfaction']
    .astype(str)
    .str.replace('%', '')
    .astype(float)
)

# -------- Standardization --------
scaler = StandardScaler()

num_features = ['years_of_experience', 'hourly_rate (USD)', 'rating']

df.loc[:, num_features] = scaler.fit_transform(df[num_features])

# -------- One-Hot Encoding --------
df = pd.get_dummies(
    df,
    columns=['country', 'language', 'primary_skill'],
    drop_first=True
)

print("Dataset Shape After Encoding:", df.shape)

# =========================================================
# 3.3 FEATURE PREPARATION METHODS
# =========================================================

# Separate target variable
y = df['client_satisfaction']

# Separate predictor variables
X = df.drop(columns=['client_satisfaction'])

# Final cleaned dataset ready for regression
print("Feature matrix shape:", X.shape)
print("Target vector shape:", y.shape)

print("Data preprocessing completed successfully.")


After Cleaning Steps (3.1)
  freelancer_ID             name  gender   age    country    language  \
1      FL250002   Vanessa Garcia  Female  52.0  Australia     English   
2      FL250003      Juan Nelson    Male  53.0    Germany      German   
3      FL250004   Amanda Spencer  Female  38.0  Australia     English   
4      FL250005  Lynn Curtis DDS  Female  53.0    Germany      German   
6      FL250007       Eric Myers    Male  52.0  Indonesia  Indonesian   

     primary_skill  years_of_experience hourly_rate (USD)  rating is_active  \
1      Mobile Apps                 34.0             100.0     3.3         1   
2   Graphic Design                 31.0              50.0     0.0         0   
3  Web Development                  4.0              40.0     1.5         0   
4  Web Development                 27.0              30.0     4.8         0   
6    Data Analysis                 10.0              75.0     3.1         0   

  client_satisfaction  
1                 84%  
2         

In [29]:
# ------------------------------------------------------
# FIX: Drop unique identifiers that cannot be converted to numbers
# ------------------------------------------------------

# If 'freelancer_ID' or 'name' are still in X, drop them
cols_to_drop = ['freelancer_ID', 'name']
for col in cols_to_drop:
    if col in X.columns:
        X = X.drop(columns=[col])

In [30]:
# =========================================================
# 4. PREDICTIVE MODELLING
# =========================================================
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor  # New Library

# --- Safety Check: Ensure no text columns remain in X ---
X = X.select_dtypes(include=[np.number, 'bool'])
if 'is_active' in X.columns:
    X['is_active'] = pd.to_numeric(X['is_active'], errors='coerce').fillna(0).astype(int)

# 4.1 Define Three Train-Test Split Configurations
X_train80, X_test80, y_train80, y_test80 = train_test_split(X, y, test_size=0.2, random_state=42)
X_train70, X_test70, y_train70, y_test70 = train_test_split(X, y, test_size=0.3, random_state=42)
X_train60, X_test60, y_train60, y_test60 = train_test_split(X, y, test_size=0.4, random_state=42)

model_LR = LinearRegression()
model_DT = DecisionTreeRegressor(max_depth=5, random_state=42)
model_RF = RandomForestRegressor(n_estimators=100, random_state=42) # 100 trees

print("Part 4: Linear Regression, Decision Tree, and Random Forest models initialized.")

Part 4: Linear Regression, Decision Tree, and Random Forest models initialized.


In [31]:
# =========================================================
# 5. MODEL PERFORMANCE EVALUATION
# =========================================================
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

performance_results = []

def evaluate_splits(X_train, X_test, y_train, y_test, label):
    # Dictionary of models to loop through
    current_models = {
        "Linear Regression": model_LR,
        "Decision Tree": model_DT,
        "Random Forest": model_RF
    }
    
    for name, model in current_models.items():
        # Train
        model.fit(X_train, y_train)
        # Predict
        preds = model.predict(X_test)
        
        # Metrics
        mae = mean_absolute_error(y_test, preds)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        r2 = r2_score(y_test, preds)
        
        performance_results.append({
            "Split Strategy": label, 
            "Model": name, 
            "MAE": round(mae, 4), 
            "RMSE": round(rmse, 4), 
            "R2": round(r2, 4)
        })

# Run evaluation
evaluate_splits(X_train80, X_test80, y_train80, y_test80, "80:20 Split")
evaluate_splits(X_train70, X_test70, y_train70, y_test70, "70:30 Split")
evaluate_splits(X_train60, X_test60, y_train60, y_test60, "60:40 Split")

# Display Table
df_performance = pd.DataFrame(performance_results)
print("\n--- Performance Comparison Table ---")
print(df_performance)


--- Performance Comparison Table ---
  Split Strategy              Model      MAE     RMSE      R2
0    80:20 Split  Linear Regression  10.9913  12.6473 -0.1090
1    80:20 Split      Decision Tree  10.9905  12.7365 -0.1247
2    80:20 Split      Random Forest  11.3336  13.1142 -0.1924
3    70:30 Split  Linear Regression  10.4849  12.1761 -0.0841
4    70:30 Split      Decision Tree  10.5466  12.3955 -0.1235
5    70:30 Split      Random Forest  10.6615  12.4901 -0.1407
6    60:40 Split  Linear Regression  10.3394  11.9911 -0.0546
7    60:40 Split      Decision Tree  10.6224  12.3484 -0.1184
8    60:40 Split      Random Forest  10.4171  12.1655 -0.0855


In [35]:
# =========================================================
# 6. BEST MODEL SELECTION
# =========================================================

best_model_info = df_performance.loc[df_performance['MAE'].idxmin()]

print("--- Best Model Results ---")
print(f"Recommended Model: {best_model_info['Model']}")
print(f"Optimal Split Ratio: {best_model_info['Split Strategy']}")
print(f"Lowest MAE: {best_model_info['MAE']}")
print(f"The {best_model_info['Model']} was identified as the best performer. ")


--- Best Model Results ---
Recommended Model: Linear Regression
Optimal Split Ratio: 60:40 Split
Lowest MAE: 10.3394
The Linear Regression was identified as the best performer. 


In [36]:
import pickle

filename = 'best_linear_regression_model.sav'

model_LR.fit(X_train60, y_train60)

with open(filename, 'wb') as file:
    pickle.dump(model_LR, file)

print(f"Success! Best model saved as: {filename}")

Success! Best model saved as: best_linear_regression_model.sav
