In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn

import pandas as pd

from lmfit import minimize, Parameters

%matplotlib inline

In [2]:
data = pd.read_csv('resources/hst_color.csv', header=1, skiprows=[2], index_col=0,
                     skipinitialspace=True)

I need to clean up the data. I need to get the y-color (SDSS) and the x-color (HST) to not have `nan` and to get uncertainties. Also calculate uncertainties from the SNR and the magnitudes.

In [3]:
c = data['blue mag']/data['blueSNR'] + data['red mag']/data['redSNR']
data.insert(10, 'hst uncert', c)
cleanData = data.dropna()

In [4]:
def chi_sqr_linear(params, x, y, yUncert):
    parvals = params.valuesdict()
    m = parvals['slope']
    b = parvals['intercept']
    
    model = m*x + b
    return (model-y)/yUncert

In [5]:
params = Parameters()
params.add('slope', value=1.0)
params.add('intercept', value=0.0)

# x, y, yUncert = sdss, hst, hstUncert
x, y, yUncert = cleanData['sdss color'], cleanData['color'], cleanData['hst uncert']
out = minimize(chi_sqr_linear, params, args=(x, y, yUncert))

In [6]:
out.redchi
# out.chisqr
# out.var_names
# out.ndata
# out.params['slope']
# out.residual

0.036821947060988174

Sort data by smaller of two SNRs. Then calcualte the chi-squared (or something) starting with first 3 data points then adding one more data point at a time, calculating a new chi-squared (or something).

In [7]:
# c = data['blue mag']/data['blueSNR'] + data['red mag']/data['redSNR']
data[['blueSNR', 'redSNR']].min(axis=1)
data.insert(10, 'minSNR', data[['blueSNR', 'redSNR']].min(axis=1))
cleanData = data.dropna()

In [8]:
data.sort_values('minSNR', ascending=False, inplace=True)

In [9]:
data.iloc[:3]

Unnamed: 0_level_0,blueSNR,blueSource,blue mag,redSNR,redSource,red mag,color,g band,r band,sdss color,minSNR,hst uncert
snid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
14284,97.416735,28.512205,15.913608,148.564252,63.398361,14.889089,1.024519,21.026248,20.128068,0.89818,97.416735,0.263576
14437,75.993186,18.19297,16.401425,123.390088,44.655579,15.269595,1.131831,22.76029,21.8072,0.95309,75.993186,0.339578
14279,73.93456,20.627485,16.265069,105.644565,39.466873,15.403702,0.861367,20.891803,20.068498,0.823305,73.93456,0.3658


In [10]:
params = Parameters()
params.add('slope', value=1.0)
params.add('intercept', value=0.0)

x = data.dropna().iloc[:3]['sdss color'].values
y = data.dropna().iloc[:3]['color'].values
yUncert = data.dropna().iloc[:3]['hst uncert'].values
# y, yUncert = cleanData['sdss color'], cleanData['color'], cleanData['hst uncert']
out = minimize(chi_sqr_linear, params, args=(x, y, yUncert))

In [11]:
out.params, out.chisqr

(Parameters([('slope',
              <Parameter 'slope', value=2.0880635863787695 +/- 0.0753, bounds=[-inf:inf]>),
             ('intercept',
              <Parameter 'intercept', value=-0.85468981728953541 +/- 0.0675, bounds=[-inf:inf]>)]),
 0.00038436896394908066)

In [12]:
params = Parameters()
params.add('slope', value=1.0)
params.add('intercept', value=0.0)

x = data.dropna().iloc[:10]['sdss color'].values
y = data.dropna().iloc[:10]['color'].values
yUncert = data.dropna().iloc[:10]['hst uncert'].values
out = minimize(chi_sqr_linear, params, args=(x, y, yUncert))
out.params

Parameters([('slope',
             <Parameter 'slope', value=0.91397076536359967 +/- 0.16, bounds=[-inf:inf]>),
            ('intercept',
             <Parameter 'intercept', value=0.12754283652110204 +/- 0.151, bounds=[-inf:inf]>)])

In [13]:
params = Parameters()
params.add('slope', value=1.0)
params.add('intercept', value=0.0)

x = data.dropna().iloc[:25]['sdss color'].values
y = data.dropna().iloc[:25]['color'].values
yUncert = data.dropna().iloc[:25]['hst uncert'].values
out = minimize(chi_sqr_linear, params, args=(x, y, yUncert))
out.params

Parameters([('slope',
             <Parameter 'slope', value=0.96456633706968486 +/- 0.099, bounds=[-inf:inf]>),
            ('intercept',
             <Parameter 'intercept', value=0.08442936823788022 +/- 0.0916, bounds=[-inf:inf]>)])

In [14]:
params = Parameters()
params.add('slope', value=1.0)
params.add('intercept', value=0.0)

x = data.dropna()['sdss color'].values
y = data.dropna()['color'].values
yUncert = data.dropna()['hst uncert'].values
out = minimize(chi_sqr_linear, params, args=(x, y, yUncert))
out.params

Parameters([('slope',
             <Parameter 'slope', value=0.96456633706968486 +/- 0.099, bounds=[-inf:inf]>),
            ('intercept',
             <Parameter 'intercept', value=0.08442936823788022 +/- 0.0916, bounds=[-inf:inf]>)])

In [15]:
out.chisqr

0.77326088828075168

In [16]:
for i in range(3, len(data.dropna())):
    params = Parameters()
    params.add('slope', value=1.0)
    params.add('intercept', value=0.0)

    x = data.dropna().iloc[:i]['sdss color'].values
    y = data.dropna().iloc[:i]['color'].values
    yUncert = data.dropna().iloc[:i]['hst uncert'].values
    out = minimize(chi_sqr_linear, params, args=(x, y, yUncert))
#     print(out.params['slope'], out.chisqr, out.redchi)
    print(out.chisqr, out.redchi)

0.000384368963949 0.000384368963949
0.186907161587 0.0934535807935
0.242760520256 0.0809201734188
0.242787636851 0.0606969092127
0.303027348547 0.0606054697094
0.337300136641 0.0562166894402
0.451631692764 0.064518813252
0.549566346661 0.0686957933326
0.595268270088 0.0661409188987
0.598471902015 0.0598471902015
0.617737860814 0.0561579873467
0.670268141802 0.0558556784835
0.672112501281 0.051700961637
0.675677260147 0.048262661439
0.678836938574 0.0452557959049
0.690090673314 0.0431306670821
0.692559897481 0.0407388174989
0.705139084064 0.0391743935591
0.766024364983 0.0403170718412
0.766025366933 0.0383012683467


something is wrong with the error or something. I should not have a chi-squared of 0.7?

In [17]:
range(3, len(data.dropna()))

range(3, 23)

In [18]:
for i in range(3, len(data.dropna())):
    params = Parameters()
    params.add('slope', value=1.0)
    params.add('intercept', value=0.0)

    x = data.dropna().iloc[:i]['sdss color'].values
    y = data.dropna().iloc[:i]['color'].values
    yUncert = data.dropna().iloc[:i]['hst uncert'].values
    out = minimize(chi_sqr_linear, params, args=(x, y, yUncert))
#     print(out.params['slope'], out.chisqr, out.redchi)
    print(out.chisqr, out.redchi)

0.000384368963949 0.000384368963949
0.186907161587 0.0934535807935
0.242760520256 0.0809201734188
0.242787636851 0.0606969092127
0.303027348547 0.0606054697094
0.337300136641 0.0562166894402
0.451631692764 0.064518813252
0.549566346661 0.0686957933326
0.595268270088 0.0661409188987
0.598471902015 0.0598471902015
0.617737860814 0.0561579873467
0.670268141802 0.0558556784835
0.672112501281 0.051700961637
0.675677260147 0.048262661439
0.678836938574 0.0452557959049
0.690090673314 0.0431306670821
0.692559897481 0.0407388174989
0.705139084064 0.0391743935591
0.766024364983 0.0403170718412
0.766025366933 0.0383012683467


lets figure out why the errors are so out of wack.

In [19]:
(data.dropna()['color']-data.dropna()['sdss color']).mean(), (data.dropna()['color']-data.dropna()['sdss color']).median()

(0.015928197060869589, 0.012451526800000001)

In [20]:
data.dropna()['hst uncert'].mean(), data.dropna()['hst uncert'].median()

(0.87252211244908617, 0.8870954424901014)

What should the errors be? If they are about the percentage from the min SNR.

In [21]:
(data['color']/data['minSNR']).dropna().mean()

0.021462626988277785

The numerator of the Chi-squared is better then what I calcualted as the error form the magnitudes.

In [22]:
(data['color']/data['minSNR']).dropna().iloc[:3]

snid
14284    0.010517
14437    0.014894
14279    0.011650
dtype: float64

In [28]:
for i in range(3, len(data.dropna())):
    params = Parameters()
    params.add('slope', value=1.0)
    params.add('intercept', value=0.0)

    x = data.dropna().iloc[:i]['sdss color'].values
    y = data.dropna().iloc[:i]['color'].values
#     yUncert = data.dropna().iloc[:i]['hst uncert'].values
    yUncert = (data['color']/data['minSNR']).dropna().iloc[:i].values
    out = minimize(chi_sqr_linear, params, args=(x, y, yUncert))
    print(out.params['slope'].value, out.chisqr, out.redchi)
#     print(out.chisqr, out.redchi)

2.10247492257 0.24271136641 0.24271136641
0.640113232648 127.497224478 63.7486122391
0.891814695184 157.963572712 52.6545242374
0.869374113682 159.839522864 39.959880716
0.939576633572 222.976607626 44.5953215252
0.988440958958 251.245727709 41.8742879516
1.1432399807 556.71925935 79.5313227642
1.09261684935 669.98350966 83.7479387075
1.10803984961 680.201066711 75.5778963012
1.10536407003 681.166538547 68.1166538547
1.11437246437 696.168175491 63.2880159537
1.12062302738 741.251991047 61.7709992539
1.09346493167 782.049870371 60.1576823362
1.09149005227 783.079790086 55.9342707205
1.01051826672 876.818670836 58.4545780557
1.01208896777 895.477289466 55.9673305916
1.01291546978 895.525364079 52.6779625929
1.0544647314 944.676463411 52.4820257451
1.03674815592 1087.60045837 57.2421293881
1.03705461095 1089.41806214 54.4709031072


ok, that looks like just using the min SNR as a percentage understimates the error. Or I have a bug.

In [24]:
(data['color']/data['minSNR']).dropna(), (data['color']-data['sdss color']).dropna()
errors = pd.concat([(data['color']/data['minSNR']).dropna(), (data['color']-data['sdss color']).dropna()], axis=1)
errors.columns = ['snr error', 'residual']

In [29]:
errors.insert(2, 'dif', errors['residual']-errors['snr error'])

In [26]:
errors['dif'].describe()

count    23.000000
mean     -0.005534
std       0.131104
min      -0.318351
25%      -0.096762
50%      -0.007392
75%       0.104640
max       0.222043
Name: dif, dtype: float64

In [None]:
errors.insert(3, 'chi-square', errors['residual']/errors['snr error'])

In [30]:
errors['chi-square'].describe()

count    23.000000
mean      0.509213
std       7.069068
min     -15.643945
25%      -3.242996
50%       0.628744
75%       5.217930
max      12.012958
Name: chi-square, dtype: float64

In [35]:
errors.sort_values(by='chi-square')

Unnamed: 0_level_0,long form errors,snr error,residual,dif,chi-square
snid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
15229,0.73704,0.009997,-0.156396,-0.166393,-15.643945
13411,1.409242,0.025934,-0.292418,-0.318351,-11.275526
18835,0.753576,0.021796,-0.191744,-0.21354,-8.797336
19323,1.462386,0.012837,-0.085096,-0.097933,-6.629014
19174,0.39517,0.016615,-0.081325,-0.09794,-4.894812
19048,0.999663,0.018337,-0.077254,-0.095591,-4.212943
12860,1.533059,0.031082,-0.07065,-0.101732,-2.273049
6057,0.593439,0.012991,-0.018924,-0.031914,-1.456669
1415,0.488429,0.026833,-0.028205,-0.055037,-1.051139
18241,0.64746,0.012747,-0.004554,-0.017302,-0.35729


In [34]:
errors.insert(0, 'long form errors', data.dropna()['hst uncert'])

In [36]:
errors.sort_values(by='residual')

Unnamed: 0_level_0,long form errors,snr error,residual,dif,chi-square
snid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13411,1.409242,0.025934,-0.292418,-0.318351,-11.275526
18835,0.753576,0.021796,-0.191744,-0.21354,-8.797336
15229,0.73704,0.009997,-0.156396,-0.166393,-15.643945
19323,1.462386,0.012837,-0.085096,-0.097933,-6.629014
19174,0.39517,0.016615,-0.081325,-0.09794,-4.894812
19048,0.999663,0.018337,-0.077254,-0.095591,-4.212943
12860,1.533059,0.031082,-0.07065,-0.101732,-2.273049
1415,0.488429,0.026833,-0.028205,-0.055037,-1.051139
6057,0.593439,0.012991,-0.018924,-0.031914,-1.456669
18241,0.64746,0.012747,-0.004554,-0.017302,-0.35729


In [37]:
errors.columns

Index(['long form errors', 'snr error', 'residual', 'dif', 'chi-square'], dtype='object')

In [46]:
errors.columns = ['errors - SNR propogation', 'errors - min SNR', 'model residual', 'dif: min SNR & Res', 'chi-square: min SNR & Res']

In [43]:
errors.sort_values(by='errors - SNR propogation').index

Int64Index([14284, 14437, 14279, 19174,  1415, 14871,  6057, 18241,  3488,
            15229, 18835, 14113,  6614, 17886, 19048, 20874,  2992, 17745,
             2635, 13411,  2561, 19323, 12860],
           dtype='int64', name='snid')

In [48]:
errors.sort_values(by='errors - min SNR').index

Int64Index([15229, 14284, 14871, 14279, 18241, 19323,  6057, 14437, 17745,
            19174, 19048,  2635,  6614, 20874, 18835, 13411,  1415, 12860,
            17886, 14113,  2992,  3488,  2561],
           dtype='int64', name='snid')

These are not the same order at all: 15229, 12860, and 19323 as examples. All three have adnomally hight min SNR.

In [58]:
# errors.sort_index()
errors.loc[[15229, 12860, 19323]]

Unnamed: 0_level_0,errors - SNR propogation,errors - min SNR,model residual,dif: min SNR & Res,chi-square: min SNR & Res
snid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
15229,0.73704,0.009997,-0.156396,-0.166393,-15.643945
12860,1.533059,0.031082,-0.07065,-0.101732,-2.273049
19323,1.462386,0.012837,-0.085096,-0.097933,-6.629014


In [61]:
data.loc[[15229, 12860, 19323]][['blueSNR', 'redSNR']]

Unnamed: 0_level_0,blueSNR,redSNR
snid,Unnamed: 1_level_1,Unnamed: 2_level_1
15229,43.44267,50.667145
12860,20.733757,28.064364
19323,23.962759,25.614195


They all have fairly high red SNR

In [63]:
data.loc[[15229, 12860, 19323]][['color', 'sdss color']]

Unnamed: 0_level_0,color,sdss color
snid,Unnamed: 1_level_1,Unnamed: 2_level_1
15229,0.434305,0.590701
12860,0.64444,0.71509
19323,0.307608,0.392704


Their colors are not too far off. 15 is off by more then 0.15, but 12 is off by 0.07.

Lets see what happens with out errors, just fit `data-model`.

In [65]:
def residual(params, x, y):
    parvals = params.valuesdict()
    m = parvals['slope']
    b = parvals['intercept']
    
    model = m*x + b
    return y-model

for i in range(3, len(data.dropna())):
    params = Parameters()
    params.add('slope', value=1.0)
    params.add('intercept', value=0.0)

    x = data.dropna().iloc[:i]['sdss color'].values
    y = data.dropna().iloc[:i]['color'].values
#     yUncert = data.dropna().iloc[:i]['hst uncert'].values
    out = minimize(residual, params, args=(x, y), method = 'newton')
#     print(out.params['slope'], out.chisqr, out.redchi)
    print(out.chisqr, out.redchi)

3.35099439153e-05 3.35099439153e-05
0.0231408990113 0.0115704495057
0.0317681034773 0.0105893678258
0.0317782142092 0.00794455355229
0.0435796786774 0.00871593573547
0.0489822709616 0.00816371182693
0.0856673166768 0.0122381880967
0.122833907222 0.0153542384028
0.152275183681 0.0169194648534
0.152417453902 0.0152417453902
0.157974771566 0.0143613428696
0.218023869369 0.0181686557807
0.228173868259 0.0175518360199
0.229255520985 0.0163753943561
0.24218825123 0.0161458834153
0.261118872785 0.016319929549
0.262652011165 0.0154501183038
0.278489025983 0.0154716125546
0.379254691885 0.0199607732571
0.380947501632 0.0190473750816
