In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.utils import resample

  from pkg_resources import resource_stream


**Load and split datasets to train and target parts**

In [2]:
ds2, ds1 = pd.read_csv('ds-boot-1.csv', sep='\t'), pd.read_csv('ds-boot-2.csv', sep='\t')
train, target = ds1.iloc[:, 1:29], ds1.iloc[:, 29:]

### Part 1.1

In [3]:
# part 1.1: estimate stats using bootstrap
def est_stats(sample, n_resamples=100):
    # bootstrap sample size
    b_sample_size = sample.shape[0]
    
    # boostrap resamples
    b_resamples = np.ndarray((n_resamples, b_sample_size))
    
    for i in range(0, n_resamples):
        b_resamples[i] = resample(sample, n_samples=b_sample_size)
    
    b_resamples_means = [rs.mean() for rs in b_resamples]
    
    return pd.DataFrame(b_resamples_means)
    

In [4]:
# part 1.1: estimate stats using bootstrap for each y_i
for y_i in target:
    sample = target[y_i]
    
    print(y_i, est_stats(sample).describe())

y1                 0
count  100.000000
mean     4.079250
std      0.483970
min      2.800000
25%      3.825000
50%      4.068750
75%      4.390625
max      5.162500
y2                 0
count  100.000000
mean     5.553250
std      0.607600
min      3.850000
25%      5.223437
50%      5.559375
75%      5.854687
max      7.093750
y3                 0
count  100.000000
mean     7.386190
std      0.418886
min      6.115500
25%      7.095875
50%      7.445125
75%      7.666562
max      8.327250
y4                 0
count  100.000000
mean     1.544705
std      0.123725
min      1.107000
25%      1.464687
50%      1.552375
75%      1.631875
max      1.764000
y5                 0
count  100.000000
mean     5.593275
std      0.433189
min      4.107750
25%      5.331250
50%      5.629125
75%      5.901000
max      6.922250


### Part 1.2

In [18]:
from sklearn.metrics import r2_score

# part 1.2: est coefficients for LR
def est_lr_coeffs(train, y, n_resamples=10):
    sample_size = y.shape[0]
    
    lr = LinearRegression()    
    
    resamples = np.ndarray((n_resamples, sample_size))
    bootstrap_coeffs = np.ndarray((n_resamples, train.shape[1] + 1))
    det_coeffs = np.ndarray((n_resamples, ))
    
    # find coefficients for original y sample
    lr.fit(train, y)
    origin_coeffs = np.concatenate((lr.coef_, [lr.intercept_]))
    
    # resampling and finding coefficients of LR for each resample
    for j in range(n_resamples):
        resamples[j] = resample(y, n_samples=sample_size)
        
        lr.fit(train, resamples[j])
        
        y_predicted = lr.predict(train)
        
        det_coeffs[j] = r2_score(y, y_predicted)
        bootstrap_coeffs[j] = np.concatenate((lr.coef_, [lr.intercept_]))
    
    # find 95-% confidence interval 
    deltas = origin_coeffs - bootstrap_coeffs
    deltas_l = np.percentile(deltas, 2.5, axis=0)
    deltas_r = np.percentile(deltas, 97.5, axis=0)
    l_bound = origin_coeffs - deltas_r
    upp_bound = origin_coeffs - deltas_l
    
    return pd.DataFrame([{'est': origin_coeffs[i],
                          'low_bound': l_bound[i],
                          'upp_bound': upp_bound[i]
                         } 
                         for i in range(train.shape[1])])
    
    # sort bootstrap samples according to coefficient of determination of LR
#     sorted_samples = np.ndarray((n_resamples, sample_size))
#     for i, t in enumerate(sorted(zip(det_coeffs, resamples), key=lambda e: e[0], reverse=True)):
#         det_coeff, sample = t
#         sorted_samples[i] = sample
    
#     return sorted_samples

In [19]:
# part 1.2: estimate coefficients for LR for each y_i
for y_i in target:
    target_i = target[y_i]
    
    print('\n', y_i, '\n', est_lr_coeffs(train, target_i, n_resamples=1000))


 y1 
          est  low_bound  upp_bound
0   0.352388  -0.881380   0.779563
1  -0.087259  -0.730935   0.691113
2  -0.472046  -1.398908   1.422553
3   1.121283  -1.119077   1.228151
4  -0.325610  -1.025036   0.977775
5   0.916609  -1.573219   1.439337
6  -1.070163  -1.560334   1.457515
7  -0.210380  -1.053291   1.053450
8  -0.184591  -1.153781   1.159332
9  -0.342690  -1.310621   1.291684
10  0.764015  -1.195226   1.248917
11  0.040134  -1.383003   1.538060
12  0.219712  -0.981638   1.066591
13 -0.132371  -0.751232   0.763624
14 -0.801004  -0.884501   0.817310
15  0.255596  -0.440100   0.419264
16  0.577988  -0.798772   0.887633
17 -0.121128  -0.590805   0.631967
18 -0.418202  -0.773553   0.711005
19 -0.259606  -0.785557   0.726010
20  0.793505  -0.852895   0.810650
21 -0.113371  -0.696747   0.729555
22  0.410392  -0.566868   0.602382
23 -0.382840  -0.767325   0.780405
24  0.603136  -1.093178   1.167162
25  0.136971  -0.836014   0.794658
26 -0.397624  -0.929355   0.897236
27 -0.103562 