In [1]:
def _passthrough(column):
    """Return an extractor that reads `column` from a row unchanged."""
    return lambda r: r[column]


def _is_level(column, level):
    """Return an extractor yielding 1 when row[column] == level, else 0."""
    return lambda r: int(r[column] == level)


# Regression features. Each entry is [label, extractor]; the extractor takes a
# row dict (column name -> value) and returns one number. The order of the
# entries fixes the column order of the design matrix, so the fitted
# coefficients line up with these labels by position.
predictors_to_use = [
    ['In Death Zone', lambda r: int(abs(r['Latitude']) < 36)],
    ['Sunlight', lambda r: math.cos((r['Longitude'] - 50) * math.pi / 180)],  # sunlight?
    # Three hinge terms on Pi: two for values above a threshold, one for below.
    ['Hi Pi 1', lambda r: max(0, r['Pi'] - math.pi)],  # pi too high
    ['Hi Pi 2', lambda r: max(0, r['Pi'] - 3.15)],  # pi too high
    ['Low Pi', lambda r: max(0, math.pi - r['Pi'])],  # pi too low
    # Indicator flags for the extreme levels of the two graded columns.
    ['Extreme Smell', _is_level('Smell_Level', 2)],  # EXTREME smell
    ['No Smell', _is_level('Smell_Level', 0)],  # no smell
    ['Excellent Shui', _is_level('Feng_Shui_Quality', 2)],  # good shui
    ['Bad Shui', _is_level('Feng_Shui_Quality', 0)],  # bad shui
    # Columns used as-is.
    ['Apple', _passthrough('Air_Apple')],
    ['Burning', _passthrough('Air_Burn')],
    ['Copper', _passthrough('Air_Copper')],
    ['Mint', _passthrough('Air_Mint')],
    ['Humming', _passthrough('Sound_Humming')],
    ['Squelching', _passthrough('Sound_Squelching')],
    ['Buzzing', _passthrough('Sound_Buzzing')],
    ['Skittering', _passthrough('Sound_Skittering')],
    # Several candidate shapes for the Murphy effect; the fit picks the mix.
    ['Murphy-Linear', _passthrough('Murphy')],
    ['Murphy-Square', lambda r: r['Murphy'] ** 2],
    ['Murphy-2^N', lambda r: 2 ** r['Murphy']],
    ['Murphy-3^N', lambda r: 3 ** r['Murphy']],
    ['Murphy-4^N', lambda r: 4 ** r['Murphy']],
]

In [2]:

import math

# NOTE(review): absolute, machine-specific path; consider a configurable data
# directory so the notebook runs on other machines.
f = 'C:\\dndsci_zppg_formatted.csv'

# Load the training CSV into `rows`, a list of {column name: float} dicts.
# The first line supplies the column names; every later line is one record.
# The `with` block closes the handle even if a value fails to parse.
with open(f) as file:
    col_name_line = file.readline()
    col_names = [x.replace('\n', '') for x in col_name_line.split(',')]
    print(col_names)

    rows = []
    for line in file:
        vals = [x.replace('\n', '') for x in line.split(',')]
        # Empty cells are dropped before pairing with column names, so an
        # empty cell in the middle of a line shifts later values one column
        # to the left. The data is assumed to have no such gaps.
        vals = [x for x in vals if len(x)]
        if not vals:
            # Blank line: skip it rather than append an empty record that
            # would fail later with a KeyError.
            continue
        row_struct = {}
        for i in range(len(vals)):
            row_struct[col_names[i]] = float(vals[i])
        rows.append(row_struct)

print('Loaded {} rows\n'.format(len(rows)))

def log_row(log_row, mode='a', path='dndsci_zppg_score.csv'):
    """Write one sequence of values to the score CSV as a comma-joined line.

    Args:
        log_row: Iterable of values; each is converted with ``str`` and the
            results are joined with commas. The parameter shares the
            function's name, so the function cannot call itself by name
            inside the body; the name is kept so existing callers still work.
        mode: File mode passed to ``open``. ``'a'`` appends; ``'w'``
            truncates the file first (used for the header row).
        path: Destination file. Defaults to the original hard-coded name.

    Values are not quoted or escaped, so a value containing a comma or a
    newline produces a malformed row.
    """
    log_string = ','.join(str(e) for e in log_row) + "\n"
    # The context manager flushes and closes the handle on every call instead
    # of leaving that to garbage collection.
    with open(path, mode) as out:
        out.write(log_string)

['Longitude', 'Latitude', 'Shortitude', 'Deltitude', 'Pi', 'Murphy', 'Smell_Level', 'Feng_Shui_Quality', 'Air_Apple', 'Air_Burn', 'Air_Copper', 'Air_Mint', 'Sound_Humming', 'Sound_Skittering', 'Sound_Squelching', 'Sound_Buzzing', 'Performance']
Loaded 10407 rows



In [3]:
import numpy as np

# Build the regression inputs from the loaded rows.
#   predictors: one row per record, one column per entry of predictors_to_use
#               (column order follows the list order).
#   scores:     single-column target, the negated base-10 log of Performance.
#               math.log10 is used because it is more accurate than
#               math.log(x, 10); Performance must be > 0 or it raises
#               ValueError.
# The arrays are created directly rather than via np.matrix, which NumPy no
# longer recommends. The explicit reshape keeps both 2-D even when `rows` is
# empty.
predictors = np.array(
    [[p[1](r) for p in predictors_to_use] for r in rows], dtype=float
).reshape(len(rows), len(predictors_to_use))
scores = np.array(
    [[-math.log10(r['Performance'])] for r in rows], dtype=float
).reshape(len(rows), 1)


In [4]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression of the penalty target on the engineered features.
design_matrix = np.asarray(predictors)
target = np.asarray(scores)
regressor = LinearRegression().fit(design_matrix, target)

# display coefficients
for fitted in (regressor.coef_, regressor.intercept_):
    print(fitted)

[[ 2.06735036e-01 -5.27305434e-02 -4.11474563e+00  8.96268940e+00
   5.75635613e+00 -9.55218834e-02 -1.20815532e-01 -2.20653645e-02
   7.50532882e-02 -1.43966236e-01 -2.98200793e-01 -2.44789564e-01
  -2.79766845e-01  6.33593097e-01  3.01202086e-01  1.25874636e-01
   1.73229850e-02 -2.09347596e-02 -2.44578121e-02  7.79440733e-02
  -1.04544565e-02  1.11593018e-03]]
[0.3575762]


In [5]:
# Pair each feature label with its fitted coefficient, one per line.
# coef_ is 2-D because the target was a single-column 2-D array, so row 0
# holds the weights.
for i, (label, _extractor) in enumerate(predictors_to_use):
    weight = regressor.coef_[0][i]
    print('{} : {}'.format(label, weight))

In Death Zone : 0.2067350364928715
Sunlight : -0.052730543411768305
Hi Pi 1 : -4.114745626642051
Hi Pi 2 : 8.962689400353522
Low Pi : 5.756356131255769
Extreme Smell : -0.09552188338412596
No Smell : -0.1208155320309348
Excellent Shui : -0.02206536448091116
Bad Shui : 0.0750532882477745
Apple : -0.14396623550258927
Burning : -0.2982007933940471
Copper : -0.24478956406742985
Mint : -0.2797668454177703
Humming : 0.6335930965019667
Squelching : 0.30120208578549734
Buzzing : 0.12587463622379025
Skittering : 0.0173229850179329
Murphy-Linear : -0.020934759603605683
Murphy-Square : -0.02445781210524455
Murphy-2^N : 0.07794407327664456
Murphy-3^N : -0.01045445650523344
Murphy-4^N : 0.0011159301815777192


In [6]:
# Re-score every training row with the fitted model, log the comparison to
# CSV, and collect the residuals in penalty (log) and score (linear) terms.
penalty_diffs = []
score_diffs = []
log_row(col_names + ['Actual_Penalty', 'Predicted_Penalty', 'Predicted Score', 'Penalty Diff', 'Score Diff'], mode='w')
for i, row in enumerate(rows):
    actual_penalty = scores[i][0]

    # Evaluate the linear model by hand: intercept plus the running sum of
    # feature * coefficient, accumulated in feature order.
    pred_penalty = regressor.intercept_[0]
    for feature_value, weight in zip(predictors[i], regressor.coef_[0]):
        pred_penalty = pred_penalty + (feature_value * weight)

    # The penalty is -log10(score), so invert it to get the predicted score.
    pred_score = 10**(-1*pred_penalty)

    penalty_diff = actual_penalty - pred_penalty
    score_diff = row['Performance'] - pred_score
    penalty_diffs.append(penalty_diff)
    score_diffs.append(score_diff)

    # Original columns first, then the derived values in header order.
    row_to_log = [row[col] for col in col_names]
    row_to_log += [actual_penalty, pred_penalty, pred_score, penalty_diff, score_diff]
    log_row(row_to_log)

print('Unexplained score diffs range from {:.2f}% to {:.2f}%'.format(min(score_diffs)*100, max(score_diffs)*100))
        

Unexplained score diffs range from -4.74% to 3.04%


In [7]:
# Show the target value of the last training row. Indexing with -1 avoids
# depending on a loop variable left over from an earlier cell.
print(scores[-1])

[1.08618615]


In [8]:
f = 'dndsci_zppg__sites_formatted.csv'

# Load the candidate-sites CSV into `rows`, replacing the training rows.
# The first line supplies the column names; every later line is one record
# whose values are all parsed as floats.
# The `with` block closes the handle even if a value fails to parse.
with open(f) as file:
    col_name_line = file.readline()
    col_names = [x.replace('\n', '') for x in col_name_line.split(',')]
    print(col_names)

    rows = []
    for line in file:
        vals = [x.replace('\n', '') for x in line.split(',')]
        # Empty cells are dropped before pairing with column names, so an
        # empty cell in the middle of a line shifts later values one column
        # to the left. The data is assumed to have no such gaps.
        vals = [x for x in vals if len(x)]
        if not vals:
            # Blank line: skip it rather than append an empty record that
            # would fail later with a KeyError.
            continue
        row_struct = {}
        for i in range(len(vals)):
            row_struct[col_names[i]] = float(vals[i])
        rows.append(row_struct)

print('Loaded {} rows\n'.format(len(rows)))

def log_row(log_row, mode='a', path='dndsci_zppg_sites_score.csv'):
    """Write one sequence of values to the sites score CSV as one line.

    This definition replaces any earlier ``log_row`` in the notebook, so
    later calls write to the sites output file by default.

    Args:
        log_row: Iterable of values; each is converted with ``str`` and the
            results are joined with commas. The parameter shares the
            function's name; it is kept so existing callers still work.
        mode: File mode passed to ``open``. ``'a'`` appends; ``'w'``
            truncates the file first (used for the header row).
        path: Destination file. Defaults to the original hard-coded name.

    Values are not quoted or escaped, so a value containing a comma or a
    newline produces a malformed row.
    """
    log_string = ','.join(str(e) for e in log_row) + "\n"
    # The context manager flushes and closes the handle on every call instead
    # of leaving that to garbage collection.
    with open(path, mode) as out:
        out.write(log_string)

# Score every candidate site with the fitted model and write the result to
# CSV: the original columns followed by predicted penalty and score.
log_row(col_names + ['Predicted_Penalty', 'Predicted_Score'], mode='w')
for i, r in enumerate(rows):  # `i` stays bound after the loop, as before
    row_predictors = [p[1](r) for p in predictors_to_use]
    row_to_log = [r[col] for col in col_names]

    # Evaluate the linear model by hand: intercept plus the running sum of
    # feature * coefficient, accumulated in feature order.
    pred_penalty = regressor.intercept_[0]
    for feature_value, weight in zip(row_predictors, regressor.coef_[0]):
        pred_penalty = pred_penalty + (feature_value * weight)

    # Invert the log-scale penalty to get the predicted score.
    pred_score = 10**(-1*pred_penalty)

    row_to_log += [pred_penalty, pred_score]
    log_row(row_to_log)

['Site_ID', 'Longitude', 'Latitude', 'Shortitude', 'Deltitude', 'Pi', 'Murphy', 'Smell_Level', 'Feng_Shui_Quality', 'Air_Apple', 'Air_Burn', 'Air_Copper', 'Air_Mint', 'Sound_Humming', 'Sound_Skittering', 'Sound_Squelching', 'Sound_Buzzing']
Loaded 110809 rows



In [9]:
# Print a marker so it is visible that the notebook ran through to this cell.
print('Done!')

Done!
