In [1]:
import gzip
from collections import defaultdict
import sklearn
from sklearn import linear_model
import string
import re
import datetime, time
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as md
import math
from gensim.models import Word2Vec


In [2]:
#####
# Setup
#####

In [3]:
def readCSV(path):
    f = open(path, 'rt')
    f.readline()

    for l in f:
        
        if l.startswith("timestamp"): 
            continue

        try: 
            timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,otherdetails,cityid,dmaid,rowNumber,Masters_Degree,Bachelors_Degree,Doctorate_Degree,Highschool,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education = re.split(r',(?![ ])', l)

        except:
            continue

        yield timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,otherdetails,cityid,dmaid,rowNumber,Masters_Degree,Bachelors_Degree,Doctorate_Degree,Highschool,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education

In [4]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [5]:
def Cosine(x1,x2):
    numer = 0
    norm1 = 0
    norm2 = 0
    for a1,a2 in zip(x1,x2):
        numer += a1*a2
        norm1 += a1**2
        norm2 += a2**2
    if norm1*norm2:
        return numer / math.sqrt(norm1*norm2)
    return 0

In [6]:
#####
# Parse data from dataset
#####

In [7]:
# id , unixtime, company, level, title, total_comp, city, state, experience, tenure, tag, 
# salary, stock, bonus, city_id, dma_id, ms_deg, bs_deg, phd_deg, hs, college
def parseSalaryRaw(salaryRaw):
    salaryAllData = []

    for entry in salaryRaw:

        date, clocktime = entry[0].split(' ')
        date = date.split('/')
        clocktime = clocktime.split(':')
        timestamp = datetime.datetime(int(date[2]), int(date[0]), int(date[1]), int(clocktime[0]), int(clocktime[1]), int(clocktime[2]))
        timestamp = int(time.mktime(timestamp.timetuple()))


        company = entry[1]
        level = entry[2]
        title = entry[3]

        total_comp = int(float(entry[4]))

        location = entry[5]
        if location.count(',') == 2:
            city, state, country = location.strip('"').split(', ')
        elif location.count(',') == 1:
            city, state = location.strip('"').split(', ')
            country = 'USA'

        experience = entry[6]
        tenure = entry[7]

        tag = entry[8]

        salary = int(float(entry[9]))
        if salary == 0:
            continue

        stock = int(float(entry[10]))
        bonus = int(float(entry[11]))

        g = entry[12]
        gender = -1
        if g == 'Female':
            gender = 0
        elif g == 'Male':
            gender = 1
        elif g == 'Other':
            gender = 2

        city_id = int(entry[14])

        try: 
            dma_id = int(entry[15])
        except:
            dma_id = -1

        id = int(entry[16])

        ms_deg = int(entry[17])
        bs_deg = int(entry[18])
        phd_deg = int(entry[18])
        hs = int(entry[19])
        college = int(entry[20])

        r = entry[27]
        race = -1
        if r == 'Asian':
            race = 0
        elif r == 'Black':
            race = 1
        elif r == 'Hispanic':
            race = 2
        elif r == 'Two Or More':
            race = 3
        elif r == 'White':
            race = 4


        salaryAllData.append({
            'id': id,
            'timestamp': timestamp,
            'company': company,
            'level': level,
            'title': title,
            'total_comp': total_comp,
            'city': city,
            'state': state,
            'country': country,
            'experience': experience,
            'tenure': tenure,
            'tag': tag,
            'salary': salary,
            'stock': stock,
            'bonus': bonus,
            'gender': gender,
            'city_id': city_id,
            'dma_id': dma_id,
            'ms_deg': ms_deg,
            'bs_deg': bs_deg,
            'phd_deg': phd_deg,
            'hs': hs,
            'college': college,
            'race': race,
        })
    return salaryAllData


In [8]:
# Reparse data from CSV so we don't mess up above data
salaryRaw = []
for l in readCSV("Levels_Fyi_Salary_Data.csv"):
    salaryRaw.append(l)

salaryAllData = parseSalaryRaw(salaryRaw)
salaryAllData[0]['id']

1

In [9]:
# Parse raw data
salaryAllData = parseSalaryRaw(salaryRaw)

In [10]:
#####
# Training, Validation, and Test sets
#####

# shuffle data
shuffledAllData = sklearn.utils.shuffle(salaryAllData)
shuffledAllData[0]['id']

# create 80 / 10 / 10 : Train / Validation / Test sets
X = shuffledAllData
y = [d['salary'] for d in shuffledAllData]

X_train, X_temp, y_train, y_temp = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
X_valid, X_test, y_valid, y_test = sklearn.model_selection.train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

# Validate that sets were created with propper sizes
print(str(len(X_train)) + ", " + str(len(y_train)))
print(str(len(X_valid)) + ", " + str(len(y_valid)))
print(str(len(X_test)) + ", " + str(len(y_test)))

# Validate all ids are different
print(str(X_train[0]['id']) + ", " + str(X_valid[0]['id']) + ", " + str(X_test[0]['id']))

47996, 47996
6000, 6000
6000, 6000
58955, 61390, 58283


In [11]:
#####
# Baselines
#   1. Predict average salary in dataset
#   2. Predict average salary for given Years of Experience
#####

In [12]:
# Create some useful collections for these baseline

# create YOE to Average Map based on training data
# We create two dictionaries to track the total salary seen at each YOE
salByExpMap = defaultdict(int)

# And the number of salaries seen at that YOE
entriesByExpMap = defaultdict(int)

# For each entry increment the total sal and num entries at that YOE
# (This could be done with a running avg technique but that is too much work to code)
for x in X_train:
    exp, sal = float(x['experience']), float(x['salary'])
    exp = math.ceil(exp)
    
    total = 0
    numEntries = 0
    
    if salByExpMap[exp]:
        total = salByExpMap[exp]
    if entriesByExpMap[exp]:
        numEntries = entriesByExpMap[exp]
    
    total += sal
    numEntries += 1
    
    salByExpMap[exp] = total
    entriesByExpMap[exp] = numEntries
    

    
# Calculate averages by YOE as well as total avg
totalSal = 0
totalCount = 0

averageSalaryByYoeMap = defaultdict(int)
for exp in entriesByExpMap:
    avg = salByExpMap[exp] / entriesByExpMap[exp]
    averageSalaryByYoeMap[exp] = avg
    
    totalSal += salByExpMap[exp]
    totalCount += entriesByExpMap[exp]

avgSal = totalSal / totalCount

print("Average salary: " + str(avgSal))


def pred_baseline1(x):    
    return avgSal
    
def pred_baseline2(x):    
    exp = math.ceil(float(x['experience']))

    if averageSalaryByYoeMap[exp]:
        return averageSalaryByYoeMap[exp]
    else:
        return avgSal

Average salary: 142030.40745062087


In [13]:
def baseline1():
    # Returns average salary in dataset
    y_b1 = []
    y_b1_pred = []
    
    avg_err = 0
    avg_percent_err = 0
    
    for d in X_valid:
        pred = pred_baseline1(d)
        actual = d['salary']
        
        y_b1_pred.append(pred)
        y_b1.append(actual)
        
        avg_err += abs(actual - pred)
        avg_percent_err += abs(100 * ((actual - pred) / actual))
        
    mse = MSE(y_b1_pred, y_b1)
    
    avg_err = avg_err / len(X_valid)
    avg_percent_err = avg_percent_err / len(X_valid)
    
    return mse, avg_err, avg_percent_err

In [14]:
mse_b1, avg_err_b1, avg_percent_err_b1 = baseline1()
print(mse_b1)
print(avg_err_b1)
print(avg_percent_err_b1)

2841626734.431603
39006.63061253708
47.67888089470349


In [15]:
def baseline2():
    # Takes rounded YOE for each datapoint and predicts average salary for that YOE
    # If there are no entries for that YOE, predict overall average
    y_b1 = []
    y_b1_pred = []
    
    avg_err = 0
    avg_percent_err = 0
    
    for d in X_valid:
        pred = pred_baseline2(d)
        actual = d['salary']
        
        y_b1_pred.append(pred)
        y_b1.append(actual)
        
        avg_err += abs(actual - pred)
        avg_percent_err += abs(100 * ((actual - pred) / actual))
        
    mse = MSE(y_b1_pred, y_b1)
    
    avg_err = avg_err / len(X_valid)
    avg_percent_err = avg_percent_err / len(X_valid)
    
    return mse, avg_err, avg_percent_err

In [16]:
mse_b2, avg_err_b2, avg_percent_err_b2 = baseline2()
print(mse_b2)
print(avg_err_b2)
print(avg_percent_err_b2)

2233284281.095253
33758.676792830134
41.283423512399544


In [17]:
#####
# Model
#####

In [18]:
#####
# Provide a description of your model:
#      1. What model are you using
#      2. What information are you trying to incorporate into your model
#      3. How are you building your feature vector?
#      4. What interesting pieces of information did you learn? (i.e. most influential words, popular cities, etc)
#####

# Write description in comments here

In [19]:
# Model code starts here

In [28]:
def feature(d, words, wordId):
    feat = [0]*len(words)
    r = d['company'] + ' ' + d['level'] + ' ' + d['title'] + ' ' + d['tag']
    r = r.lower()
#     r = ''.join([c for c in r if not c in punctuation])
    
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) # offset
    feat.append(float(d['experience'])) # offset
    return feat

In [30]:
# Inspired by and midified from workbook chapter 8

dSize = 500
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in X_train:
    r = d['company'] + ' ' + d['level'] + ' ' + d['title'] + ' ' + d['tag']
    r = r.lower()
#     r = ''.join([c for c in r if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

        
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:dSize]]

wordId = dict(zip(words, range(len(words))))
wordSet = set(words)
# print(words)
X_working = [feature(d, words, wordId) for d in X_train]
y_working = [d['salary'] for d in X_train]

In [22]:
# Tried a LinearRegression model, but it didn't work as well as a ridge regression model

model = sklearn.linear_model.LinearRegression(fit_intercept=False)
model.fit(X_working, y_working)

predictTrain = model.predict(X_working)
MSEtrain = sum((y_working - predictTrain)**2)/len(y_working)

Xvalid_working = [feature(d, words, wordId) for d in X_valid]
predictValid = model.predict(Xvalid_working)
MSEvalid = sum((y_valid - predictValid)**2)/len(y_valid)

print("linear model validation MSE = " + str(MSEvalid))


X_test_working = [feature(d, words, wordId) for d in X_test]
predictTest = bestModel.predict(X_test_working)
MSEtest = sum((y_test - predictTest)**2)/len(y_test)
MSEtest

linear model validation MSE = 1432151375.0863342


NameError: name 'bestModel' is not defined

In [31]:
ls = [0.5, 1]
# 0.5 produces best MSE
# 1 produces lowest average percent error
errorTrain = []
errorValid = []
bestModel = None
bestVal = None
bestLamb = None

    
for l in ls:
    model = sklearn.linear_model.Ridge(l, fit_intercept=False)
    model.fit(X_working, y_working)

    predictTrain = model.predict(X_working)
    MSEtrain = sum((y_working - predictTrain)**2)/len(y_working)
    errorTrain.append(MSEtrain)
    
    Xvalid_working = [feature(d, words, wordId) for d in X_valid]
    predictValid = model.predict(Xvalid_working)
    MSEvalid = sum((y_valid - predictValid)**2)/len(y_valid)
    errorValid.append(MSEvalid)
    print("l = " + str(l) + ", validation MSE = " + str(MSEvalid))
    
    
    
    avg_err = 0
    avg_percent_err = 0
    for (x, pred) in zip(X_valid, predictValid):
        actual = x['salary']
        
        avg_err += abs(actual - pred)
        avg_percent_err += abs(100 * ((actual - pred) / actual))
    avg_err = avg_err / len(X_valid)
    avg_percent_err = avg_percent_err / len(X_valid)
    print("Average error: " + str(avg_err) + ", Average percent error: " + str(avg_percent_err))



    if bestVal == None or MSEvalid < bestVal:
        bestVal = MSEvalid
        bestModel = model
        bestLamb = l

X_test_working = [feature(d, words, wordId) for d in X_test]
predictTest = bestModel.predict(X_test_working)
MSEtest = sum((y_test - predictTest)**2)/len(y_test)
MSEtest

theta = bestModel.coef_
wordSort = list(zip(theta[:-1], words))
wordSort.sort(reverse=True)
print()
print("Best:")
print("best val=" + str(bestVal) + " with l=" + str(bestLamb))
print("Most influential words:")
print(wordSort[:50])


l = 0.5, validation MSE = 1313085520.784055
Average error: 25324.33171547305, Average percent error: 31.607352479484902
l = 1, validation MSE = 1312912989.9455817
Average error: 25320.907178514626, Average percent error: 31.616863970133533

Best:
best val=1312912989.9455817 with l=1
Most influential words:
[(306161.44883896323, 'netflix'), (102046.86308084863, 'roku'), (86741.98403097567, 'karma'), (85886.28961749122, 'roblox'), (75623.16965161484, 'robinhood'), (66901.96556118508, 'squarespace'), (66483.28002743432, 'capital'), (65243.74205454805, 'director'), (63461.12964862867, 'vp'), (62869.55913322759, 'citadel'), (62511.82104873587, 'stripe'), (61868.663904953995, 'dropbox'), (61069.65154606778, 'cruise'), (58189.27554658497, 'e7'), (55834.85920703262, 'box'), (53987.14787999883, 'zoox'), (53207.14587355262, 'bloomberg'), (50833.10803056809, 'wish'), (50746.067337241504, 'airbnb'), (50348.76136262441, 'ic6'), (50221.24562077217, 'm5'), (50171.682840248155, 'waymo'), (49831.103175

In [78]:
# Attempting word2vec
reviewTokens = []
for d in X_train:
    r = d['company'] + ' ' + d['level'] + ' ' + d['title'] + ' ' + d['tag']
    r = ''.join([c for c in r.lower()])
    
    tokens = []
    for w in r.split():
        tokens.append(w)
    reviewTokens.append(tokens)

In [88]:
model = Word2Vec(reviewTokens,
                 min_count=1, # Words/items with fewer instances are discarded
                 vector_size=10, # Model dimensionality
                 window=3, # Window size
                 sg=1) # Skip-gram model

In [89]:
model.wv.similar_by_word("software")


[('engineer', 0.9774618148803711),
 ('systems', 0.970441460609436),
 ('distributed', 0.968224823474884),
 ('(back-end)', 0.9662066698074341),
 ('na', 0.9565079808235168),
 ('ios', 0.9301960468292236),
 ('stack', 0.9286606311798096),
 ('full', 0.9263443946838379),
 ('android', 0.9213243126869202),
 ('operating', 0.890781581401825)]

In [112]:
# Attempt TF-IDF
# Conclusion: TFIDF is probably good, but takes FOREVER with the whole dataset. 
# When comparing 2000 entries to each in the validation set, it was taking ~18 seconds per 100.
# When comparing all 50000 entries in the training set, it was taking ~4s per (adding up to around 6 hrs)
# When comparing 15000, we get mse of 3573836187.9486804

df = defaultdict(int)
for d in X_train:
    text = d['company'] + ' ' + d['level'] + ' ' + d['title'] + ' ' + d['tag']
    r = ''.join([c for c in text.lower()])
    for w in set(r.split()):
        df[w] += 1

d = X_train[2]
text = d['company'] + ' ' + d['level'] + ' ' + d['title'] + ' ' + d['tag']
tf = defaultdict(int)
r = ''.join([c for c in text.lower() if not c in punctuation])
for w in r.split():
    # Note = rather than +=, different versions of tf could be used instead
    tf[w] += 1
    
tfidf = dict(zip(words,[tf[w] * math.log2(len(X_train) / df[w]) for w in words]))
tfidfQuery = [tf[w] * math.log2(len(X_train) / df[w]) for w in words]


# Find the other reviews in the corpus with the highest cosine similarity between tf-idf vectors

similarities = []
ind = 0
for rev2 in X_train:
    tf = defaultdict(int)
    text = rev2['company'] + ' ' + rev2['level'] + ' ' + rev2['title'] + ' ' + rev2['tag']
    r = ''.join([c for c in text.lower() if not c in punctuation])
    for w in r.split():
        # Note = rather than +=
        tf[w] = 1
    tfidf2 = [tf[w] * math.log2(len(X_train) / df[w]) for w in words]
    similarities.append((Cosine(tfidfQuery, tfidf2), text, ind))
    ind += 1
    
similarities.sort(reverse=True)
similarities[:10]

sums = 0
c = 0
for s in similarities:
    if s[0] > 0.75:
        sal = X_train[s[2]]['salary']
        sums += sal
        c += 1
        
print(sums / c)

217311.47540983607


In [113]:
X_train[2] 

{'id': 63508,
 'timestamp': 1617148779,
 'company': 'MobileIron',
 'level': 'M1',
 'title': 'Software Engineering Manager',
 'total_comp': 240000,
 'city': 'San Francisco',
 'state': 'CA',
 'country': 'USA',
 'experience': '12',
 'tenure': '4',
 'tag': 'Full Stack',
 'salary': 200000,
 'stock': 10000,
 'bonus': 30000,
 'gender': -1,
 'city_id': 7419,
 'dma_id': 807,
 'ms_deg': 0,
 'bs_deg': 0,
 'phd_deg': 0,
 'hs': 0,
 'college': 0,
 'race': -1}

In [172]:
def buildTFIDFVec(datum):
    text = d['company'] + ' ' + d['level'] + ' ' + d['title'] + ' ' + d['tag']
    tf = defaultdict(int)
    r = ''.join([c for c in text.lower() if not c in punctuation])
    for w in r.split():
        # Note = rather than +=, different versions of tf could be used instead
        tf[w] += 1

    tfidf = dict(zip(words,[tf[w] * math.log2(len(X_train) / df[w]) for w in words]))
    tfidfQuery = [tf[w] * math.log2(len(X_train) / df[w]) for w in words]

    return tfidfQuery

def predictTfifdSalary(datum, tfidfMap):
    tfidfQuery = tfidfMap[datum['id']]

    similarities = []
    ind = 0
    for rev2 in X_train[:15000]:
        tfidf2 = tfidfMap[rev2['id']]
        
        cos = (Cosine(tfidfQuery, tfidf2), text, ind)
        if cos[0] > 0.95:
            similarities.append(cos)
            ind += 1
    
    sums = 0
    c = 0
    for s in similarities:
        sal = X_train[s[2]]['salary']
        sums += sal
        c += 1
            
    if sums == 0:
        return avgSal
    else:
        return sums / c
    print(sums / c)

In [170]:
df = defaultdict(int)
for d in X_train:
    text = d['company'] + ' ' + d['level'] + ' ' + d['title'] + ' ' + d['tag']
    r = ''.join([c for c in text.lower()])
    for w in set(r.split()):
        df[w] += 1
        
tfidfMap = defaultdict(int)
for d in X_train + X_valid:
    tfidfMap[d['id']] = buildTFIDFVec(d)

In [173]:
y_pred_tfidf = []
y_actual = [x['salary'] for x in X_valid]
i = 0
for x in X_valid:
    pred_sal = predictTfifdSalary(x, tfidfMap)
    y_pred_tfidf.append(pred_sal)
    
    if i % 10 == 0:
        print(str(i) + " / " + str(len(X_valid)))
    
        workingMSE = MSE(y_actual[:(i+1)], y_pred_tfidf)
        print("Working MSE: " + str(workingMSE))
        
    i += 1

    

0 / 6000
Working MSE: 400000000.0
10 / 6000
Working MSE: 1017927522.8150231
20 / 6000
Working MSE: 1042732468.8624872
30 / 6000
Working MSE: 1164618684.6428666
40 / 6000
Working MSE: 1309916100.784965
50 / 6000
Working MSE: 1738763427.062622
60 / 6000
Working MSE: 1749901218.6467779
70 / 6000
Working MSE: 1675389025.5833533
80 / 6000
Working MSE: 1574867231.4293556
90 / 6000
Working MSE: 1483073722.086586
100 / 6000
Working MSE: 1783372087.5695639
110 / 6000
Working MSE: 1700908287.5804355
120 / 6000
Working MSE: 1769105811.6247447
130 / 6000
Working MSE: 1748963032.2738
140 / 6000
Working MSE: 1691772282.21718
150 / 6000
Working MSE: 1629102903.8358355
160 / 6000
Working MSE: 1659685232.6863365
170 / 6000
Working MSE: 1710228510.8699613
180 / 6000
Working MSE: 1717082164.3146453
190 / 6000
Working MSE: 1773315527.7024567
200 / 6000
Working MSE: 1907473124.3497589
210 / 6000
Working MSE: 1967920985.7890782
220 / 6000
Working MSE: 2031678332.4832916
230 / 6000
Working MSE: 2026828531.12

1920 / 6000
Working MSE: 2781200379.360616
1930 / 6000
Working MSE: 2780127951.203573
1940 / 6000
Working MSE: 2788871327.586332
1950 / 6000
Working MSE: 2851655679.8418236
1960 / 6000
Working MSE: 2846848079.436855
1970 / 6000
Working MSE: 2854709918.4477944
1980 / 6000
Working MSE: 2879493755.510685
1990 / 6000
Working MSE: 2885702503.672029
2000 / 6000
Working MSE: 2885245490.588435
2010 / 6000
Working MSE: 2879722579.526359
2020 / 6000
Working MSE: 2873506397.012874
2030 / 6000
Working MSE: 2868792137.054128
2040 / 6000
Working MSE: 2861249479.950831
2050 / 6000
Working MSE: 2854978809.2398562
2060 / 6000
Working MSE: 2855227774.4738317
2070 / 6000
Working MSE: 2852324287.8300395
2080 / 6000
Working MSE: 2845933721.313992
2090 / 6000
Working MSE: 2844571581.465104
2100 / 6000
Working MSE: 2837346342.7599154
2110 / 6000
Working MSE: 2839406767.5097227
2120 / 6000
Working MSE: 2834487396.103266
2130 / 6000
Working MSE: 2825294874.506495
2140 / 6000
Working MSE: 2827376233.3072634
215

3810 / 6000
Working MSE: 3184293170.27933
3820 / 6000
Working MSE: 3184240961.8349795
3830 / 6000
Working MSE: 3186243039.1464868
3840 / 6000
Working MSE: 3182896508.336588
3850 / 6000
Working MSE: 3179492169.2259173
3860 / 6000
Working MSE: 3179353706.230975
3870 / 6000
Working MSE: 3183357979.0315166
3880 / 6000
Working MSE: 3181094231.0233445
3890 / 6000
Working MSE: 3178876059.9183493
3900 / 6000
Working MSE: 3172453790.4836535
3910 / 6000
Working MSE: 3169791294.3299866
3920 / 6000
Working MSE: 3181786082.866825
3930 / 6000
Working MSE: 3180205601.8352532
3940 / 6000
Working MSE: 3175709715.138531
3950 / 6000
Working MSE: 3169616062.1322556
3960 / 6000
Working MSE: 3163390320.6583247
3970 / 6000
Working MSE: 3157755846.4203486
3980 / 6000
Working MSE: 3153268397.5372057
3990 / 6000
Working MSE: 3148886875.8999987
4000 / 6000
Working MSE: 3146174631.353369
4010 / 6000
Working MSE: 3150036926.3674626
4020 / 6000
Working MSE: 3149608427.836706
4030 / 6000
Working MSE: 3147482895.8597

5700 / 6000
Working MSE: 3581105283.0977955
5710 / 6000
Working MSE: 3577665694.800943
5720 / 6000
Working MSE: 3574998160.0093126
5730 / 6000
Working MSE: 3574384511.2983527
5740 / 6000
Working MSE: 3569715593.366729
5750 / 6000
Working MSE: 3606158594.7861176
5760 / 6000
Working MSE: 3600690436.083003
5770 / 6000
Working MSE: 3596335155.974835
5780 / 6000
Working MSE: 3592594175.3140225
5790 / 6000
Working MSE: 3587655905.6874285
5800 / 6000
Working MSE: 3587171910.083379
5810 / 6000
Working MSE: 3602149278.1836123
5820 / 6000
Working MSE: 3600545368.877793
5830 / 6000
Working MSE: 3599318249.5218363
5840 / 6000
Working MSE: 3596796046.8621974
5850 / 6000
Working MSE: 3597597888.833772
5860 / 6000
Working MSE: 3594123069.983944
5870 / 6000
Working MSE: 3591578850.153561
5880 / 6000
Working MSE: 3587371142.112558
5890 / 6000
Working MSE: 3587633552.339878
5900 / 6000
Working MSE: 3583186109.9010596
5910 / 6000
Working MSE: 3582502096.3254147
5920 / 6000
Working MSE: 3584059259.8472075

In [174]:
    
mse_tfidf = MSE(y_actual, y_pred_tfidf)

print(mse_tfidf)

3573836187.9486804


<zip at 0x7fc808e78e00>

In [148]:
for (a, p) in zip(y_actual, y_pred_tfidf):
    print(str(a) + ", " + str(p))

130000, 141942.59186182183
144500, 160000.0
95000, 141942.59186182183
180000, 154500.0
137000, 158500.0
132000, 139500.0
111000, 141942.59186182183
145000, 141942.59186182183
140000, 141942.59186182183
200000, 150200.0
140000, 160000.0
125000, 120000.0
130000, 141942.59186182183
145000, 141942.59186182183
201000, 250000.0
126000, 128000.0
168000, 150000.0
140000, 145600.0
160000, 133800.0
189000, 122000.0
160000, 90000.0
152000, 140166.66666666666
175000, 155000.0
73000, 141942.59186182183
215000, 141942.59186182183
189000, 192812.5
120000, 180000.0
165000, 130863.63636363637
130000, 102500.0
210000, 152500.0
144000, 141942.59186182183
88000, 141942.59186182183
58000, 27000.0
90000, 103333.33333333333
170000, 142882.35294117648
165000, 141942.59186182183
173000, 235000.0
150000, 141942.59186182183
108000, 97000.0
82000, 136807.86026200873
104000, 119439.0243902439
90000, 141942.59186182183
100000, 102000.0
165000, 127500.0
120000, 75000.0
140000, 106166.66666666667
139000, 115500.0
135