In [46]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [65]:
data = pd.read_csv('salaries_2.csv')

In [48]:
missing_values = data.isnull().sum()

missing_percentage = (missing_values / len(data)) * 100

missing_data_df = pd.DataFrame({'Feature': data.columns, 'Missing Values (%)': missing_percentage})

missing_data_df = missing_data_df[missing_data_df['Missing Values (%)'] > 0]

missing_data_df = missing_data_df.sort_values(by='Missing Values (%)', ascending=False)


In [49]:
missing_data_df

Unnamed: 0,Feature,Missing Values (%)


In [50]:
X = data.drop('experience_level', axis=1) 
y = data['experience_level']

In [51]:
X = pd.get_dummies(X, drop_first=True)

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
# initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

In [54]:
rf_model.fit(X_train, y_train)

In [55]:
y_pred = rf_model.predict(X_test)

In [56]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.71


In [57]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          EN       0.48      0.40      0.43       255
          EX       0.62      0.43      0.51       104
          MI       0.51      0.48      0.49       780
          SE       0.80      0.85      0.82      2168

    accuracy                           0.71      3307
   macro avg       0.60      0.54      0.56      3307
weighted avg       0.70      0.71      0.70      3307



In [58]:
# Feature importance (which features the model finds most important)
importances = rf_model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False))


                                       Feature  Importance
2                                salary_in_usd    0.311994
1                                       salary    0.301883
0                                    work_year    0.061735
3                                 remote_ratio    0.027981
52                      job_title_Data Analyst    0.016701
..                                         ...         ...
284                        company_location_CN    0.000000
160  job_title_Staff Machine Learning Engineer    0.000000
43                  job_title_CRM Data Analyst    0.000000
28                  job_title_BI Data Engineer    0.000000
228                      employee_residence_JE    0.000000

[348 rows x 2 columns]


In [59]:
t2_df = pd.read_csv('DataEngineer.csv')
t2New_df = pd.DataFrame()

In [60]:
def extract_salary(salary_str):
    match = re.search(r'\$(\d+)K-\$(\d+)K', salary_str)
    if match:
        lower = int(match.group(1)) * 1000
        upper = int(match.group(2)) * 1000
        return int((lower + upper) / 2)
    return None

In [61]:
def extract_work_year_ratio(df):
    year_counts = df['work_year'].value_counts(normalize=True)
    return year_counts

def generate_work_years(n, year_ratio):
    years = np.random.choice(year_ratio.index, size=n, p=year_ratio.values)
    return years

In [62]:
year_ratio = extract_work_year_ratio(data)

def map_company_size(size_str):
    if '1 to 50 employees' in size_str or '51 to 200 employees' in size_str:
        return 'S'
    elif '201 to 500 employees' in size_str or '501 to 1000 employees' in size_str:
        return 'M'
    elif '1001 to 5000 employees' in size_str or '5001 to 10000 employees' in size_str or '10000+ employees' in size_str:
        return 'L'
    return 'NA'

In [63]:
t2New_df_dict = {
    'work_year': generate_work_years(len(t2_df), year_ratio),
    'employment_type': 'FT',  # placeholder. Full Time
    'job_title': t2_df['Job Title'],
    'salary': t2_df['Salary Estimate'].apply(extract_salary),
    'salary_currency': 'USD',
    'salary_in_usd': t2_df['Salary Estimate'].apply(extract_salary),
    'employee_residence': t2_df['Location'],
    'remote_ratio': 0,
    'company_location': t2_df['Location'],
    'company_size': t2_df['Size'].apply(map_company_size)
}
t2New_df = pd.DataFrame(t2New_df_dict)


In [69]:
t2New_df_processed = pd.get_dummies(t2New_df, drop_first=True)

# Ensure the columns match between the data and t2New_df
missing_cols = set(X.columns) - set(t2New_df_processed.columns)
for col in missing_cols:
    t2New_df_processed[col] = 0
t2New_df_processed = t2New_df_processed[X.columns]


  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df_processed[col] = 0
  t2New_df

In [73]:
t2New_df['experience_level'] = rf_model.predict(t2New_df_processed)

In [74]:
# Retrain the model on the combined dataset
rf_model.fit(t2New_df_processed, t2New_df['experience_level'])

# Evaluate the model again
y_pred_retrained = rf_model.predict(X_test)
retrained_accuracy = accuracy_score(y_test, y_pred_retrained)
print(f'Retrained accuracy: {retrained_accuracy:.2f}')
print(classification_report(y_test, y_pred_retrained))

Retrained accuracy: 0.64
              precision    recall  f1-score   support

          EN       0.33      0.19      0.24       255
          EX       0.00      0.00      0.00       104
          MI       0.38      0.39      0.38       780
          SE       0.74      0.81      0.78      2168

    accuracy                           0.64      3307
   macro avg       0.36      0.35      0.35      3307
weighted avg       0.60      0.64      0.62      3307



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
