In [1]:
import gzip
from collections import defaultdict
import sklearn
from sklearn import linear_model
import string
import re
import datetime, time
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as md
import math

In [2]:
#####
# Setup
#####

In [2]:
def readCSV(path):
    f = open(path, 'rt')
    f.readline()

    for l in f:
        
        if l.startswith("timestamp"): 
            continue

        try: 
            timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,otherdetails,cityid,dmaid,rowNumber,Masters_Degree,Bachelors_Degree,Doctorate_Degree,Highschool,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education = re.split(r',(?![ ])', l)

        except:
            continue

        yield timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,otherdetails,cityid,dmaid,rowNumber,Masters_Degree,Bachelors_Degree,Doctorate_Degree,Highschool,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education

In [3]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [5]:
#####
# Parse data from dataset
#####

In [4]:
# id , unixtime, company, level, title, total_comp, city, state, experience, tenure, tag, 
# salary, stock, bonus, city_id, dma_id, ms_deg, bs_deg, phd_deg, hs, college
def parseSalaryRaw(salaryRaw):
    salaryAllData = []

    for entry in salaryRaw:

        date, clocktime = entry[0].split(' ')
        date = date.split('/')
        clocktime = clocktime.split(':')
        timestamp = datetime.datetime(int(date[2]), int(date[0]), int(date[1]), int(clocktime[0]), int(clocktime[1]), int(clocktime[2]))
        timestamp = int(time.mktime(timestamp.timetuple()))


        company = entry[1]
        level = entry[2]
        title = entry[3]

        total_comp = int(float(entry[4]))

        location = entry[5]
        if location.count(',') == 2:
            city, state, country = location.strip('"').split(', ')
        elif location.count(',') == 1:
            city, state = location.strip('"').split(', ')
            country = 'USA'

        experience = entry[6]
        tenure = entry[7]

        tag = entry[8]

        salary = int(float(entry[9]))
        if salary == 0:
            continue

        stock = int(float(entry[10]))
        bonus = int(float(entry[11]))

        g = entry[12]
        gender = -1
        if g == 'Female':
            gender = 0
        elif g == 'Male':
            gender = 1
        elif g == 'Other':
            gender = 2

        city_id = int(entry[14])

        try: 
            dma_id = int(entry[15])
        except:
            dma_id = -1

        id = int(entry[16])

        ms_deg = int(entry[17])
        bs_deg = int(entry[18])
        phd_deg = int(entry[18])
        hs = int(entry[19])
        college = int(entry[20])

        r = entry[27]
        race = -1
        if r == 'Asian':
            race = 0
        elif r == 'Black':
            race = 1
        elif r == 'Hispanic':
            race = 2
        elif r == 'Two Or More':
            race = 3
        elif r == 'White':
            race = 4


        salaryAllData.append({
            'id': id,
            'timestamp': timestamp,
            'company': company,
            'level': level,
            'title': title,
            'total_comp': total_comp,
            'city': city,
            'state': state,
            'country': country,
            'experience': experience,
            'tenure': tenure,
            'tag': tag,
            'salary': salary,
            'stock': stock,
            'bonus': bonus,
            'gender': gender,
            'city_id': city_id,
            'dma_id': dma_id,
            'ms_deg': ms_deg,
            'bs_deg': bs_deg,
            'phd_deg': phd_deg,
            'hs': hs,
            'college': college,
            'race': race,
        })
    return salaryAllData


In [5]:
# Reparse data from CSV so we don't mess up above data
salaryRaw = []
for l in readCSV("Levels_Fyi_Salary_Data.csv"):
    salaryRaw.append(l)

salaryAllData = parseSalaryRaw(salaryRaw)
salaryAllData[0]['id']

1

In [6]:
# Parse raw data
salaryAllData = parseSalaryRaw(salaryRaw)

In [7]:
#####
# Training, Validation, and Test sets
#####

# shuffle data
shuffledAllData = sklearn.utils.shuffle(salaryAllData)
shuffledAllData[0]['id']

# create 80 / 10 / 10 : Train / Validation / Test sets
X = shuffledAllData
y = [d['salary'] for d in shuffledAllData]

X_train, X_temp, y_train, y_temp = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
X_valid, X_test, y_valid, y_test = sklearn.model_selection.train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

# Validate that sets were created with propper sizes
print(str(len(X_train)) + ", " + str(len(y_train)))
print(str(len(X_valid)) + ", " + str(len(y_valid)))
print(str(len(X_test)) + ", " + str(len(y_test)))

# Validate all ids are different
print(str(X_train[0]['id']) + ", " + str(X_valid[0]['id']) + ", " + str(X_test[0]['id']))

47996, 47996
6000, 6000
6000, 6000
76391, 83046, 33750


In [10]:
#####
# Baselines
#   1. Predict average salary in dataset
#   2. Predict average salary for given Years of Experience
#####

In [37]:
# Create some useful collections for these baseline

# create YOE to Average Map based on training data
# We create two dictionaries to track the total salary seen at each YOE
salByExpMap = defaultdict(int)

# And the number of salaries seen at that YOE
entriesByExpMap = defaultdict(int)

# For each entry increment the total sal and num entries at that YOE
# (This could be done with a running avg technique but that is too much work to code)
for x in X_train:
    exp, sal = float(x['experience']), float(x['salary'])
    exp = math.ceil(exp)
    
    total = 0
    numEntries = 0
    
    if salByExpMap[exp]:
        total = salByExpMap[exp]
    if entriesByExpMap[exp]:
        numEntries = entriesByExpMap[exp]
    
    total += sal
    numEntries += 1
    
    salByExpMap[exp] = total
    entriesByExpMap[exp] = numEntries
    

    
# Calculate averages by YOE as well as total avg
totalSal = 0
totalCount = 0

averageSalaryByYoeMap = defaultdict(int)
for exp in entriesByExpMap:
    avg = salByExpMap[exp] / entriesByExpMap[exp]
    averageSalaryByYoeMap[exp] = avg
    
    totalSal += salByExpMap[exp]
    totalCount += entriesByExpMap[exp]

avgSal = totalSal / totalCount

print("Average salary: " + str(avgSal))


def pred_baseline1(x):    
    return avgSal
    
def pred_baseline2(x):    
    exp = math.ceil(float(x['experience']))

    if averageSalaryByYoeMap[exp]:
        return averageSalaryByYoeMap[exp]
    else:
        return avgSal

Average salary: 141817.4728727394


In [334]:
def baseline1():
    # Returns average salary in dataset
    y_b1 = []
    y_b1_pred = []
    
    avg_err = 0
    avg_percent_err = 0
    
    for d in X_valid:
        pred = pred_baseline1(d)
        actual = d['salary']
        
        y_b1_pred.append(pred)
        y_b1.append(actual)
        
        avg_err += abs(actual - pred)
        avg_percent_err += abs(100 * ((pred - actual) / actual))
        
    mse = MSE(y_b1_pred, y_b1)
    
    avg_err = avg_err / len(X_valid)
    avg_percent_err = avg_percent_err / len(X_valid)
    
    return mse, avg_err, avg_percent_err

In [335]:
mse_b1, avg_err_b1, avg_percent_err_b1 = baseline1()
print(mse_b1)
print(avg_err_b1)
print(avg_percent_err_b1)

3226865779.940545
39228.80793670302
45.890112992112925


In [203]:
def baseline2():
    # Takes rounded YOE for each datapoint and predicts average salary for that YOE
    # If there are no entries for that YOE, predict overall average
    y_b1 = []
    y_b1_pred = []
    
    avg_err = 0
    avg_percent_err = 0
    
    for d in X_valid:
        pred = pred_baseline2(d)
        actual = d['salary']
        
        y_b1_pred.append(pred)
        y_b1.append(actual)
        
        avg_err += abs(actual - pred)
        avg_percent_err += abs(100 * ((pred - actual) / actual))
        
    mse = MSE(y_b1_pred, y_b1)
    
    avg_err = avg_err / len(X_valid)
    avg_percent_err = avg_percent_err / len(X_valid)
    
    return mse, avg_err, avg_percent_err

In [204]:
mse_b2, avg_err_b2, avg_percent_err_b2 = baseline2()
print(mse_b2)
print(avg_err_b2)
print(avg_percent_err_b2)

2653897798.2106194
34915.2403546613
40.5771376588154


In [16]:
#####
# Model
#####

In [17]:
#####
# Provide a description of your model:
#      1. What model are you using
#      2. What information are you trying to incorporate into your model
#      3. How are you building your feature vector?
#      4. What interesting pieces of information did you learn? (i.e. most influential words, popular cities, etc)
#####

# Write description in comments here

In [18]:
# Model code starts here

In [None]:
# Simple sliding window

In [205]:
combined = []
for x,y in zip(X_train, y_train):
    combined.append((x['timestamp'], y))

combined.sort()
print(combined[:10])
d = combined[len(combined)-1][0]
print(datetime.datetime.fromtimestamp(d))

[(1496860407, 107000), (1497218037, 155000), (1497500545, 169000), (1497635041, 120000), (1497684194, 157000), (1498009799, 110000), (1498160271, 180000), (1498164926, 135000), (1498192788, 165000), (1498537545, 157000)]
2021-08-17 08:28:57


In [206]:
X_train_time = [d[0] for d in combined]
y_train_time = [d[1] for d in combined]

In [207]:
wSize = 10
ySum = sum(y_train_time[:wSize])
sliding = []

print(ySum)

sliding.append(avgSal)

for i in range(1, wSize):

    prev = y_train_time[i-1]
    sliding.append(avgSal)

for i in range(wSize,len(y_train_time)):
    ySum += y_train_time[i] - y_train_time[i-wSize]
    sliding.append(ySum*1.0/wSize)


print(len(sliding))
print(len(y_train_time))
print(y_train_time[2])
print(sliding[3])

1455000
47996
47996
169000
141817.4728727394


In [208]:
print(sliding[0])
print(y_train_time[0])

141817.4728727394
107000


In [209]:
mse = MSE(sliding, y_train_time)

avg_err = 0
avg_percent_err = 0

mae = sklearn.metrics.mean_absolute_error(y_train_time, sliding)
mape = 100*sklearn.metrics.mean_absolute_percentage_error(y_train_time, sliding)


In [210]:
print(mse)
print(mae)
print(mape)

2729628482.5204024
37655.46004780095
43.35101220648099


In [None]:
# linear regression

In [336]:
mod = linear_model.LinearRegression()

X_train_time = [[t] for t in sliding]

mod.fit(X_train_time, y_train_time)

pred = mod.predict(X_train_time)

mse = MSE(pred, y_train_time)
mae = sklearn.metrics.mean_absolute_error(y_train_time, pred)
mape = 100*sklearn.metrics.mean_absolute_percentage_error(y_train_time, pred)

In [337]:
print(mse)
print(mae)
print(mape)

3132403061.9176316
39665.50381308346
48.06534719228379


In [None]:
# One-hot encodings

In [338]:
minYear = 2017
maxYear = 2021

def feature_time(d):
    
    unix_time = d['timestamp']
    date = datetime.datetime.fromtimestamp(unix_time)

    year = date.year
    month = date.month
    day = date.day
    hour = date.hour
    minute = date.minute

    one_hot_year = [0]*(maxYear-minYear)
    y_index = year - minYear

    if y_index != 0:
        one_hot_year[y_index-1] = 1

    one_hot_month = [0]*11
    m_index = month - 1

    if m_index != 0:
        one_hot_month[m_index-1] = 1

    one_hot_day = [0]*31
    d_index = day - 1

    if d_index != 0:
        one_hot_day[d_index-1] = 1

    one_hot_hour = [0]*23
    h_index = hour - 1

    if h_index != 0:
        one_hot_hour[h_index-1] = 1

    return \
        one_hot_year \
        + one_hot_month \
        + one_hot_day \
        + one_hot_hour

In [339]:
mod = linear_model.LinearRegression()

# mod = linear_model.Ridge(0.5)

X_train_time = [feature_time(d) for d in X_train]
y_train_time = y_train[:]

mod.fit(X_train_time, y_train_time)

# pred = mod.predict(X_train_time)

In [323]:
# mse = sklearn.metrics.mean_squared_error(y_train_time, pred)
# mae = sklearn.metrics.mean_absolute_error(y_train_time, pred)
# mape = 100*sklearn.metrics.mean_absolute_percentage_error(y_train_time, pred)

# print(mse)
# print(mae)
# print(mape)

In [340]:
X_valid_time = [feature_time(d) for d in X_valid]
y_valid_time = y_valid[:]

pred = mod.predict(X_valid_time)

In [341]:
mse = sklearn.metrics.mean_squared_error(y_valid_time, pred)
mae = sklearn.metrics.mean_absolute_error(y_valid_time, pred)
mape = 100*sklearn.metrics.mean_absolute_percentage_error(y_valid_time, pred)

print(mse)
print(mae)
print(mape)

3146043983.2483754
39045.645395399115
44.17728667869956


In [330]:
X_test_time = [feature_time(d) for d in X_test]
y_test_time = y_test[:]

pred = mod.predict(X_test_time)

In [331]:
mse = sklearn.metrics.mean_squared_error(y_test_time, pred)
mae = sklearn.metrics.mean_absolute_error(y_test_time, pred)
mape = 100*sklearn.metrics.mean_absolute_percentage_error(y_test_time, pred)

print(mse)
print(mae)
print(mape)

3344115848.3050284
39317.06383225119
46.452416555328405
