In [10]:
import pandas as pd
import numpy as np

In [11]:
# Both files are tab-separated.
# NOTE(review): the variable numbers are crossed relative to the file names
# (ds2 <- 'ds-boot-1.csv', ds1 <- 'ds-boot-2.csv'); kept as-is because the
# cells below work with `ds1` -- confirm this mapping is intentional.
ds2 = pd.read_csv('ds-boot-1.csv', sep='\t')
ds1 = pd.read_csv('ds-boot-2.csv', sep='\t')

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.utils import resample

In [13]:
print(ds1.head(3))

# Column 0 is `id`; columns 1..28 are the features p1..p28 and the
# remaining columns (29 onwards) are the targets y1..y5.
train = ds1.iloc[:, 1:29]
target = ds1.iloc[:, 29:]

for part in (train, target):
    print(part.shape)

   id       p1   p2   p3       p4   p5       p6       p7       p8       p9  \
0  11  6.48148  3.0  5.0  7.75000  0.0  7.16667  8.16667  9.66667  6.16667   
1  12  5.74074  4.0  8.0  7.33333  8.0  8.83333  9.75000  9.66667  9.00000   
2  25  7.59259  7.0  8.0  7.66667  8.0  9.66667  9.50000  6.16667  9.66667   

   ...       p24      p25  p26      p27  p28   y1     y2    y3    y4    y5  
0  ...   1.66667  3.16667  0.0  0.00000  0.0  1.0   5.00  7.44  1.18  4.38  
1  ...   2.50000  5.50000  5.0  8.66667  8.0  4.5   4.25  8.93  2.00  6.03  
2  ...   3.50000  3.50000  9.0  6.50000  7.0  7.5  11.00  8.97  2.00  9.12  

[3 rows x 34 columns]
(40, 28)
(40, 5)


In [41]:
# part 1.1: estimate stats using bootstrap
def est_stats(sample, n_resamples=100, random_state=None):
    """Bootstrap the distribution of the mean of a 1-D sample.

    Draws ``n_resamples`` bootstrap resamples from ``sample`` (each the same
    size as ``sample``) and computes the mean of every resample.

    Parameters
    ----------
    sample : pandas.Series or 1-D array-like
        Observations to resample. Only ``sample.shape[0]`` and the values
        are used.
    n_resamples : int, default 100
        Number of bootstrap resamples to draw.
    random_state : int, numpy.random.RandomState or None, default None
        Seed or generator for reproducible resampling. ``None`` keeps the
        previous behaviour (``resample`` falls back to its default source
        of randomness).

    Returns
    -------
    pandas.DataFrame
        A single column (labelled ``0``) with one bootstrap mean per row;
        call ``.describe()`` on it for summary statistics.
    """
    # bootstrap sample size: every resample is as large as the original
    b_sample_size = sample.shape[0]

    # An int seed is turned into ONE generator shared by all iterations;
    # passing the same int to every `resample` call would reproduce the
    # identical resample n_resamples times.
    rng = random_state
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)

    # bootstrap resamples, one per row (np.empty is the supported way to
    # allocate an uninitialised array; every row is filled below)
    b_resamples = np.empty((n_resamples, b_sample_size))

    for i in range(n_resamples):
        b_resamples[i] = resample(sample, n_samples=b_sample_size, random_state=rng)

    # mean of each bootstrap resample
    b_resamples_means = b_resamples.mean(axis=1)

    return pd.DataFrame(b_resamples_means)
    

In [42]:
# part 1.1: estimate stats using bootstrap for each y_i
# Summarise the bootstrap distribution of the mean for every target column.
for y_i, sample in target.items():
    print(y_i, est_stats(sample).describe())

y1                 0
count  100.000000
mean     3.973250
std      0.428996
min      3.050000
25%      3.687500
50%      3.937500
75%      4.265625
max      5.400000
y2                 0
count  100.000000
mean     5.519125
std      0.529692
min      4.106250
25%      5.265625
50%      5.571875
75%      5.853125
max      6.600000
y3                 0
count  100.000000
mean     7.261320
std      0.486111
min      6.148500
25%      6.905250
50%      7.254375
75%      7.607687
max      8.286500
y4                 0
count  100.000000
mean     1.527662
std      0.108961
min      1.234000
25%      1.445000
50%      1.528625
75%      1.598938
max      1.766250
y5                 0
count  100.000000
mean     5.609863
std      0.435735
min      4.666250
25%      5.302875
50%      5.623625
75%      5.869875
max      6.735750


In [69]:
from sklearn.metrics import r2_score

# part 1.2: est coefficients for LR
def est_lr_coeffs(train, y, n_resamples=10, random_state=None, return_coeffs=False):
    """Fit a linear regression on bootstrap resamples and rank them by R^2.

    For each of ``n_resamples`` iterations the rows of ``train`` and ``y``
    are resampled *together*, so every target value stays attached to its
    feature row. A ``LinearRegression`` is fit on the resample and scored
    with the coefficient of determination on the full original data
    (``train`` -> ``y``).

    Parameters
    ----------
    train : pandas.DataFrame or 2-D array-like
        Feature matrix, one row per observation.
    y : pandas.Series or 1-D array-like
        Target values, aligned row-by-row with ``train``.
    n_resamples : int, default 10
        Number of bootstrap resamples.
    random_state : int, numpy.random.RandomState or None, default None
        Seed or generator for reproducible resampling. ``None`` leaves
        ``resample`` on its default source of randomness.
    return_coeffs : bool, default False
        When True, also return the fitted coefficients and the R^2 scores.

    Returns
    -------
    sorted_samples : numpy.ndarray, shape (n_resamples, len(y))
        Resampled target values, one resample per row, ordered by
        decreasing R^2 of the model fit on that resample.
    sorted_coeffs : numpy.ndarray, shape (n_resamples, n_features + 1)
        Only if ``return_coeffs``: ``[coef_..., intercept_]`` per resample,
        in the same order as ``sorted_samples``.
    sorted_det_coeffs : numpy.ndarray, shape (n_resamples,)
        Only if ``return_coeffs``: the R^2 scores, in decreasing order.
    """
    sample_size = y.shape[0]

    # One generator shared by all iterations; handing the same int seed to
    # every `resample` call would yield the identical resample each time.
    rng = random_state
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)

    lr = LinearRegression()

    resamples = np.empty((n_resamples, sample_size))
    coeffs = np.empty((n_resamples, train.shape[1] + 1))
    det_coeffs = np.empty(n_resamples)

    # resampling and finding coefficients of LR for each resample
    for j in range(n_resamples):
        # Resample features and target with the same row indices. The
        # original code resampled an undefined global `samples` and fit it
        # against the un-resampled `train`, which breaks the X/y pairing.
        train_rs, y_rs = resample(train, y, n_samples=sample_size, random_state=rng)
        resamples[j] = y_rs

        lr.fit(train_rs, y_rs)

        # score the resample's model on the full original data
        det_coeffs[j] = r2_score(y, lr.predict(train))
        coeffs[j] = np.concatenate((lr.coef_, [lr.intercept_]))

    # Sort bootstrap samples by decreasing coefficient of determination;
    # the stable sort keeps ties in draw order.
    order = np.argsort(-det_coeffs, kind="stable")
    sorted_samples = resamples[order]

    if return_coeffs:
        return sorted_samples, coeffs[order], det_coeffs[order]
    return sorted_samples

In [68]:
# part 1.2: estimate coefficients for LR for each y_i
# Run the bootstrap linear-regression estimate for every target column.
# NOTE(review): the return value is discarded, so this cell shows nothing.
for y_i, target_i in target.items():
    est_lr_coeffs(train, target_i, n_resamples=100)