In [1]:
import pandas as pd
import constants as cs
import numpy as np
import matplotlib.pyplot as plt
import zipfile
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
from sklearn.neural_network import MLPRegressor
import joblib
import pickle

In [2]:
def load_data(filename: str) -> pd.DataFrame:
    """Read ``<filename>.csv`` out of the archive ``../<filename>.zip``.

    Args:
        filename: Base name shared by the zip archive (looked up one directory
            above the current working directory) and the CSV member inside it.

    Returns:
        The CSV contents parsed by ``pd.read_csv`` with default options.
    """
    archive_path = f"../{filename}.zip"
    member_name = f"{filename}.csv"
    # The frame is fully parsed before either handle is closed.
    with zipfile.ZipFile(archive_path) as archive, archive.open(member_name) as csv_file:
        frame = pd.read_csv(csv_file)
    return frame

In [3]:
na_data = load_data(cs.NA_TRAIN_DATA)

# Split the frame into the regression target and the feature columns.
compensation = na_data[cs.COMPENSATION]
X_data = na_data.drop(columns=[cs.COMPENSATION])

print(X_data.columns)
# Create indicator (one-hot) columns for job title and organisation size
X_data = pd.get_dummies(X_data)
print(X_data.columns)
print(X_data.shape)

# Fixed seed: without it every run draws a different train/validation split, so
# the scores reported below (and any model picked by its position in a sweep)
# could not be reproduced after a kernel restart.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_data, compensation, random_state=42
)

Index(['Age', 'EdLevel', 'YearsCode', 'YearsCodePro', 'DevType', 'OrgSize',
       'Country', 'Industry', 'JavaScript', 'SQL', 'HTML/CSS', 'Python',
       'TypeScript', 'PostgreSQL', 'MySQL', 'Microsoft SQL Server', 'SQLite',
       'Redis', 'Docker', 'npm', 'Homebrew', 'Pip', 'Webpack', 'React',
       'Node.js', 'jQuery', 'ASP.NET CORE', 'Angular',
       'Amazon Web Services (AWS)', 'Microsoft Azure', 'Google Cloud',
       'Cloudflare', 'Digital Ocean', '.NET (5+) ',
       '.NET Framework (1.0 - 4.8)', 'Pandas', 'NumPy', 'Apache Kafka'],
      dtype='object')
Index(['Age', 'EdLevel', 'YearsCode', 'YearsCodePro', 'Country', 'Industry',
       'JavaScript', 'SQL', 'HTML/CSS', 'Python', 'TypeScript', 'PostgreSQL',
       'MySQL', 'Microsoft SQL Server', 'SQLite', 'Redis', 'Docker', 'npm',
       'Homebrew', 'Pip', 'Webpack', 'React', 'Node.js', 'jQuery',
       'ASP.NET CORE', 'Angular', 'Amazon Web Services (AWS)',
       'Microsoft Azure', 'Google Cloud', 'Cloudflare', 'Digital Oc

In [4]:
 # Feature scaling for models that need scaled data
# The scaler is fitted on the training split only; the validation split is then
# transformed with those same training statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

In [5]:
def custom_accuracy_score(y_true, y_pred, tolerance=15000):
    """Return the share of predictions that land within ``tolerance`` of the truth.

    A prediction counts as accurate when ``abs(true - pred) <= tolerance``
    (the bound is inclusive). Values are paired by position, not by index label.

    Args:
        y_true: Array-like of true target values.
        y_pred: Array-like of predicted values, one per entry of ``y_true``.
        tolerance: Largest absolute error still counted as accurate. Defaults
            to 15000, the threshold that used to be hard-coded.

    Returns:
        float: number of accurate predictions divided by the number of samples.

    Raises:
        ValueError: If the inputs are empty or do not have the same number of
            elements (previously an empty input raised ZeroDivisionError and a
            length mismatch was silently truncated, skewing the score).
    """
    # np.asarray discards any pandas index so the comparison is positional;
    # ravel lets column-shaped predictions, e.g. (n, 1), line up with a flat target.
    true_values = np.asarray(y_true).ravel()
    pred_values = np.asarray(y_pred).ravel()

    if true_values.size != pred_values.size:
        raise ValueError(
            "y_true and y_pred must have the same number of elements, got "
            f"{true_values.size} and {pred_values.size}"
        )
    if true_values.size == 0:
        raise ValueError("custom_accuracy_score requires at least one sample")

    within_tolerance = np.abs(true_values - pred_values) <= tolerance
    return float(np.count_nonzero(within_tolerance) / within_tolerance.size)

In [None]:


# Hyper-parameter grid explored for the random forest
rf_param_grid = {
    'n_estimators': [150, 200],
    'max_depth': [75, 100, 125, 150],
    'min_samples_leaf': [10, 20, 30]
}
# results[i] holds the scores of rf_models[i]; every fitted forest is kept so the
# preferred one can be reused later without refitting
results = []
rf_models = []

# Manual sweep over every parameter combination (GridSearchCV is not used here):
# each forest is fitted on the training split and scored on both splits
for params in ParameterGrid(rf_param_grid):
    # Fixed seed: an unseeded forest differs on every run, which would make the
    # scores, and any model later picked by its list position, irreproducible
    rf_model = RandomForestRegressor(random_state=42, **params)
    rf_model.fit(X_train, y_train)
    rf_models.append(rf_model)

    rf_custom_score_train = custom_accuracy_score(y_train, rf_model.predict(X_train))
    rf_custom_score_valid = custom_accuracy_score(y_valid, rf_model.predict(X_valid))

    # Record the scores next to the parameters that produced them
    results.append({
        'params': params,
        'train_score': rf_custom_score_train,
        'valid_score': rf_custom_score_valid
    })

# Plot training vs. validation score, one x position per parameter combination
train_scores = [result['train_score'] for result in results]
valid_scores = [result['valid_score'] for result in results]

plt.figure(figsize=(10, 6))
plt.plot(range(len(results)), train_scores, label='Training Score', marker='o')
plt.plot(range(len(results)), valid_scores, label='Validation Score', marker='o')
plt.xticks(range(len(results)), [str(result['params']) for result in results], rotation=45)
plt.xlabel('Model Parameters')
plt.ylabel('Custom Score')
plt.legend()
plt.tight_layout()
plt.savefig(f"{cs.ML_MODELS_FOLDER}/{cs.RANDOM_FOREST_MODELS}.png")

In [None]:
# Persist the forest stored at position 7 of rf_models (the 8th one fitted).
# NOTE(review): the index is hard-coded, presumably read off the score plot;
# confirm it still points at the intended model whenever the grid changes.
best_rf_model_filename = 'best_rf_model.pkl'
best_rf_model = rf_models[7]
joblib.dump(value=best_rf_model, filename=best_rf_model_filename)

In [None]:
# Sweep of support-vector regressors, trained on the standardised features

results = []
svr_models = []

svr_param_grid = {
    'C': [1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

for params in ParameterGrid(svr_param_grid):
    # Fit one SVR per parameter combination and keep it for later reuse
    svr_model = SVR(**params).fit(X_train_scaled, y_train)
    svr_models.append(svr_model)

    # Score both splits with the custom metric
    train_predictions = svr_model.predict(X_train_scaled)
    valid_predictions = svr_model.predict(X_valid_scaled)
    svr_custom_score_train = custom_accuracy_score(y_train, train_predictions)
    svr_custom_score_valid = custom_accuracy_score(y_valid, valid_predictions)

    # results[i] describes svr_models[i]
    results.append({
        'params': params,
        'train_score': svr_custom_score_train,
        'valid_score': svr_custom_score_valid
    })

# Training vs. validation score, one x position per parameter combination
train_scores = [entry['train_score'] for entry in results]
valid_scores = [entry['valid_score'] for entry in results]
positions = list(range(len(results)))
tick_labels = [str(entry['params']) for entry in results]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(positions, train_scores, label='Training Score', marker='o')
ax.plot(positions, valid_scores, label='Validation Score', marker='o')
ax.set_xticks(positions)
ax.set_xticklabels(tick_labels, rotation=45)
ax.set_xlabel('Model Parameters')
ax.set_ylabel('Custom Score')
ax.legend()
fig.tight_layout()
fig.savefig(f"./ml_models/SVR_models.png")

In [None]:
# Save the SVR stored at position 16 of svr_models.
# NOTE(review): hard-coded index, presumably chosen from the score plot; re-check
# it if the parameter grid or its ordering changes.
best_svr_model_filename = 'best_svr_model.pkl'
best_svr_model = svr_models[16]
joblib.dump(value=best_svr_model, filename=best_svr_model_filename)

In [None]:
# Sweep of gradient-boosting regressors on the unscaled features
results = []
gb_models = []
gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 10]
}
for params in ParameterGrid(gb_param_grid):
    # Fit one booster per parameter combination and keep it for later reuse
    gb_model = GradientBoostingRegressor(**params).fit(X_train, y_train)
    gb_models.append(gb_model)

    # Score both splits with the custom metric
    train_predictions = gb_model.predict(X_train)
    valid_predictions = gb_model.predict(X_valid)
    gb_custom_score_train = custom_accuracy_score(y_train, train_predictions)
    gb_custom_score_valid = custom_accuracy_score(y_valid, valid_predictions)

    # results[i] describes gb_models[i]
    results.append({
        'params': params,
        'train_score': gb_custom_score_train,
        'valid_score': gb_custom_score_valid
    })

# Training vs. validation score, one x position per parameter combination
train_scores = [entry['train_score'] for entry in results]
valid_scores = [entry['valid_score'] for entry in results]
positions = list(range(len(results)))
tick_labels = [str(entry['params']) for entry in results]

fig, ax = plt.subplots(figsize=(10, 6))
for series_label, series in (('Training Score', train_scores),
                             ('Validation Score', valid_scores)):
    ax.plot(positions, series, label=series_label, marker='o')
ax.set_xticks(positions)
ax.set_xticklabels(tick_labels, rotation=45)
ax.set_xlabel('Model Parameters')
ax.set_ylabel('Custom Score')
ax.legend()
fig.tight_layout()
fig.savefig(f"./ml_models/Gradient_models.png")

In [None]:
# Persist the gradient-boosting model stored at position 12 of gb_models.
# This used to index svr_models (copy-paste slip), which saved an SVR, a model
# trained on scaled features, under the gradient-boosting filename.
# NOTE(review): the index itself is hard-coded, presumably read off the score
# plot; confirm it still points at the intended model if the grid changes.
best_gb_model = gb_models[12]
best_gb_model_filename = 'best_gb_model.pkl'
joblib.dump(best_gb_model, best_gb_model_filename)

In [None]:
# Ordinary least-squares model fitted on the unscaled training features
linear_model = LinearRegression().fit(X_train, y_train)

# Score both splits with the custom metric
linear_train_predictions = linear_model.predict(X_train)
linear_valid_predictions = linear_model.predict(X_valid)
linear_custom_score_train = custom_accuracy_score(y_train, linear_train_predictions)
linear_custom_score_valid = custom_accuracy_score(y_valid, linear_valid_predictions)

print("Linear Regression - Custom Score on training data:", linear_custom_score_train)
print("Linear Regression - Custom Score on validation data:", linear_custom_score_valid)