In [1]:
import pandas as pd
import numpy as np
import time
import pickle
!pip install xgboost
!pip install lightgbm
!pip install catboost



from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



In [2]:
#READ THE DATA FILE
df1 = pd.read_csv("C:/Users/anmol/Downloads/SalarySense/survey_results_public.csv")

In [3]:
#ADD COLUMNS HERE

#Employment Column Additions
column_name = "Employment"
st = set()
for i in range (0,len(df1)):
    value = str(df1[column_name].iloc[i])
    if(value == "nan"):continue
    l = value.split(";")
    for ele in l:
        st.add(ele)
for ele in st:
    df1[ele] = 0
    
for i in range (0,len(df1)):
    value = str(df1[column_name].iloc[i])
    if(value == "nan"):continue
    l = value.split(";")
    for ele in l:
        df1.loc[i,ele] = 1


In [4]:
#Countries filtered out if not India
df1 = df1[df1["Industry"] == "Information Services, IT, Software Development, or other Technology"]
#Industries filtered out if not Information Services, IT, Software Development, or other Technology
df1 = df1[df1["Country"] == "India"]


In [6]:
#Number of Languages known
column_name = "LanguageHaveWorkedWith"
def languagecount(row):
    value = str(row[column_name]).split(";")
    if(value[0] == "nan"):return 0
    return len(value)
df1["NumberOfLanguagesKnown"] = df1.apply(languagecount,axis = 1)

#Number of Languages known
column_name = "PlatformHaveWorkedWith"
def platformcount(row):
    value = str(row[column_name]).split(";")
    if(value[0] == "nan"):return 0
    return len(value)
df1["NumberOfPlatformsKnown"] = df1.apply(platformcount,axis = 1)

#Number of Languages known
column_name = "WebframeHaveWorkedWith"
def webframecount(row):
    value = str(row[column_name]).split(";")
    if(value[0] == "nan"):return 0
    return len(value)
df1["NumberOfWebframesKnown"] = df1.apply(webframecount,axis = 1)

#Number of Languages known
column_name = "MiscTechHaveWorkedWith"
def misctechcount(row):
    value = str(row[column_name]).split(";")
    if(value[0] == "nan"):return 0
    return len(value)
df1["NumberOfMiscTechsKnown"] = df1.apply(misctechcount,axis = 1)

In [7]:
columns_selected =  [
   'Age',
   'OrgSize',
   'DevType',
   'YearsCode',
   'WorkExp', 
   "RemoteWork",
   'Currency',
   "EdLevel",
   "ConvertedCompYearly",
   "NumberOfLanguagesKnown",
   # "NumberOfPlatformsKnown",
   # "NumberOfWebframesKnown",
   # "NumberOfMiscTechsKnown",
]

train_columns = [
   'Age',
   'OrgSize',
   'DevType',
   'Currency',
   "EdLevel",
   "ExperienceCategory",
   "YearsCodeCategory",
   "NumberOfLanguagesKnown",
   # "NumberOfPlatformsKnown",
   # "NumberOfWebframesKnown",
   # "NumberOfMiscTechsKnown",
]

df1 = df1[columns_selected]

df1 = df1.dropna()

In [8]:
#CATEGORISE COLUMNS  INTO MAJORITY VALUES AND 'OTHER'
def shorten_categories(categories, cutoff):
    categorical_map = {}
    for i in range(len(categories)):
        if categories.values[i] >= cutoff:
            categorical_map[categories.index[i]] = categories.index[i]
        else:
            categorical_map[categories.index[i]] = 'Other'
    return categorical_map


currency_map = shorten_categories(df1.Currency.value_counts(), 400)
df1['Currency'] = df1['Currency'].map(currency_map)

In [9]:
#CATEGORISE THE WORK EXPERIENCE INTO BINS
bins = [0, 2, 5, 10, 20, 30, 40, 50, float('inf')]  # Define custom bin edges
labels = [0, 1, 2, 3, 4, 5, 6, 7]  # Define labels

# Create a new column with the categories
df1['ExperienceCategory'] = pd.cut(df1['WorkExp'], bins=bins, labels=labels)

In [10]:
#CATEGORISE LESS THAN 1 YEAR AS 0 AND MORE THAN 50 AS 51 FOR YEARS OF CODE
df1['YearsCode'] = df1['YearsCode'].replace("Less than 1 year", 0)
df1['YearsCode'] = df1['YearsCode'].replace("More than 50 years", 51)


In [11]:
#CATEGORISE YEARS OF CODE INTO BINS
bins = [0, 2, 5, 10, 20, 30, 40, 50, float('inf')]  # Define custom bin edges
labels = [0, 1, 2, 3, 4, 5, 6, 7]  # Define labels

# Create a new column with the categories
df1["YearsCode"] = df1["YearsCode"].astype(int)
df1['YearsCodeCategory'] = pd.cut(df1['YearsCode'], bins=bins, labels=labels)

In [12]:
#LABEL ENCODE THE COLUMNS
label_encoders = {}
df_LE = df1.copy()
df_LE = df_LE.dropna()

for i in train_columns:
    if i == "ConvertedCompYearly":
        continue
        
    label_encoders[i] = LabelEncoder()
    df_LE[i] = label_encoders[i].fit_transform(df_LE[i])
    
    
X = df_LE[train_columns]

Y = df_LE["ConvertedCompYearly"]

In [13]:
#TRAIN-TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.1, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
classifiers = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=1.0),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
}

In [15]:
#CHECK IMPORTANCE
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X, Y)
feature_importances = model.feature_importances_

map_ = {}

for i, a in enumerate(X):
    map_[a] = feature_importances[i]
    
sorted(map_.items(), key = lambda x: x[1])

[('Currency', 0.033092424297798836),
 ('Age', 0.05185821483402885),
 ('EdLevel', 0.0565644502032184),
 ('YearsCodeCategory', 0.08012277099372725),
 ('OrgSize', 0.1344069684650619),
 ('DevType', 0.18515024499010269),
 ('ExperienceCategory', 0.19743719759776382),
 ('NumberOfLanguagesKnown', 0.26136772861829827)]

In [16]:
# Create an empty DataFrame to store the results
columns = ['Model', 'Run Time (minutes)', 'MAE', 'MSE', 'RMSE', 'R2']
df_models = pd.DataFrame(columns=columns)

# Loop through your regression models
for key, clf in classifiers.items():
    # STARTING TIME
    start_time = time.time()
    # TRAIN CLASSIFIER ON TRAINING DATA
    clf.fit(X_train_scaled, y_train)
    
    #SAVE THE TRAINED MODEL
    classifiers[key] = clf
    
    # MAKE PREDICTIONS USING CURRENT CLASSIFIER
    predictions = clf.predict(X_test_scaled)
    
    # CALCULATE REGRESSION METRICS
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = mean_squared_error(y_test, predictions, squared=False)  # Calculate RMSE
    r2 = r2_score(y_test, predictions)

    row = {'Model': key,
           'Run Time (minutes)': round((time.time() - start_time) / 60, 2),
           'MAE': mae,
           'MSE': mse,
           'RMSE': rmse,
           'R2': r2
           }

    df_models = pd.concat([df_models, pd.DataFrame([row])], ignore_index=True)

# Sort the DataFrame by R-squared (R2) in descending order
df_models = df_models.sort_values(by='R2', ascending=False)

# PRINT THE MODELS WITH REGRESSION METRICS [SORTED]
print(df_models)

  df_models = pd.concat([df_models, pd.DataFrame([row])], ignore_index=True)


               Model  Run Time (minutes)           MAE           MSE  \
0  Linear Regression                0.00  19454.983855  2.282155e+09   
2              Lasso                0.00  19454.819851  2.282183e+09   
1              Ridge                0.00  19454.417565  2.282269e+09   
4      Random Forest                0.01  22092.239390  2.773640e+09   
3      Decision Tree                0.00  26598.600379  3.667098e+09   

           RMSE        R2  
0  47771.905325  0.155833  
2  47772.194164  0.155823  
1  47773.097039  0.155791  
4  52665.354266 -0.025967  
3  60556.567347 -0.356456  


