# HW 2: Apply Linear Regression to Construct a Prostate Cancer Model & Digits Data Model

Blanca Miller
<br>
STAT 760
<br>
02/06/2018

Prostrate Data Info: http://web.stanford.edu/~hastie/ElemStatLearn/datasets/prostate.info.txt 
<br>
Data Set: http://web.stanford.edu/~hastie/ElemStatLearn/datasets/prostate.data

Given a training set of prostate cancer linear data, estimate the model parameters using subset selection. 

## STEPS
1. import data
2. break data into two groups: train and test
3. break the training set into two matrices
    - X : design matrix (add column of 1s at beginning)
    - y : vector of responses
4. convert the X and y data frames into numpy arrays
5. standardize the predictors to have unit variance
6. estimate weights
7. calculate RSS for training set for all possible models
8. Graph k by RSS


## FUNCTIONS
- Fxn evaluates RSS for given X, y, beta
- Fxn trains models given list of columns
- Fxn generates all possible list of columns
- Fxn plot dictionary

## Imports

In [1]:
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import datasets
from sklearn.model_selection import train_test_split

## Load Data

In [2]:
data = pd.read_csv('prostate.data', delimiter='\t')

In [3]:
print(data[0:10])

   Unnamed: 0    lcavol   lweight  age      lbph  svi       lcp  gleason  \
0           1 -0.579818  2.769459   50 -1.386294    0 -1.386294        6   
1           2 -0.994252  3.319626   58 -1.386294    0 -1.386294        6   
2           3 -0.510826  2.691243   74 -1.386294    0 -1.386294        7   
3           4 -1.203973  3.282789   58 -1.386294    0 -1.386294        6   
4           5  0.751416  3.432373   62 -1.386294    0 -1.386294        6   
5           6 -1.049822  3.228826   50 -1.386294    0 -1.386294        6   
6           7  0.737164  3.473518   64  0.615186    0 -1.386294        6   
7           8  0.693147  3.539509   58  1.536867    0 -1.386294        6   
8           9 -0.776529  3.539509   47 -1.386294    0 -1.386294        6   
9          10  0.223144  3.244544   63 -1.386294    0 -1.386294        6   

   pgg45      lpsa train  
0      0 -0.430783     T  
1      0 -0.162519     T  
2     20 -0.162519     T  
3      0 -0.162519     T  
4      0  0.371564     T  
5

## Functions

In [4]:
#evaluates RSS for given X, y, beta
def evaluateRSS(df, predictor_subset, beta):
    
    # generate predictor df
    predictor_df = df[predictor_subset]
    
    # generate response df
    response_df = df["lpsa"]
    
    # convert predictor df to ndarray
    predictor_matrix = predictor_df.as_matrix()
    
    # normalize predictors
    predictor_matrix = (predictor_matrix - np.mean(predictor_matrix, axis=0))/np.std(predictor_matrix, axis=0)

    # adds constant term 1s to predictor matrix
    predictor_matrix = np.c_[np.ones(len(predictor_matrix)), predictor_matrix]
    
    # convert response df to ndarray
    response_matrix = response_df.as_matrix()
    
    # compute error
    e = response_matrix - np.dot(predictor_matrix, beta)
    
    # return error
    return np.dot(e,e)

In [5]:
# train model given a list of columns
def train(df, predictors_subset):
    
    # generate predictor df
    predictor_df = df[predictors_subset]
    
    # generate response df
    response_df = df["lpsa"]
    
    # convert predictor df to ndarray
    predictor_matrix = predictor_df.as_matrix()
    
    # normalize predictors
    predictor_matrix = (predictor_matrix - np.mean(predictor_matrix, axis=0))/np.std(predictor_matrix, axis=0)

    # adds constant term 1s to predictor matrix
    predictor_matrix = np.c_[np.ones(len(predictor_matrix)), predictor_matrix]
    
    # convert response df to ndarray
    response_matrix = response_df.as_matrix()
    
    # fit model by pinv:
    # calculate psuedo-inverse
    predictor_inverse = np.linalg.pinv(predictor_matrix)
    
    # multiply pseudo-inverse by response matrix
    beta = np.dot(predictor_inverse, response_matrix)
    
    # return weights
    return beta

In [6]:
# generates all possible list of columns 
def generate_columns(predictors):
    
    # return all possible subset of predictors - n choose k
    n = len(predictors)
    
    subsets = []
    
    for k in range(1, n+1):
        x = list(itertools.combinations(predictors, k))
        x = [list(y) for y in x]
        subsets += x 
        
    return subsets

In [7]:
def plot(rss_dict):
    
    # plots subset by rss
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    x = rss_dict[:,0]
    y = rss_dict[:,1]
    ax.scatter(x,y)
    plt.show()

## Part A: Prostate Cancer Model

## Train Prostate Cancer Models

In [8]:
predictors = list(data.columns.values[1:9])

In [9]:
train_data = data[data.train == 'T']
test_data = data[data.train == 'F']

In [10]:
predictor_subsets = generate_columns(predictors)
rss_values = {k:[] for k in range(1,9)}

In [11]:
for p in predictor_subsets:
    beta = train(train_data, p)
    rss = evaluateRSS(train_data, p, beta)
    rss_values[len(p)].append(rss)

In [12]:
rss_values

{1: [44.52858265645385,
  73.613540185775349,
  91.292039020755482,
  89.624912082459502,
  66.42240272124414,
  73.239391316218757,
  84.991790459378038,
  76.953236677263277],
 2: [37.091845632561338,
  44.495642164067419,
  39.992304351212333,
  42.312584301379616,
  44.467567998531834,
  44.4240781492555,
  43.423103787681718,
  73.028097216638173,
  73.305459530268365,
  51.714246792470377,
  56.768034339535895,
  63.059181884423303,
  57.175054366940252,
  87.197409584606007,
  64.044028773554814,
  71.208857866843857,
  83.827432099917743,
  75.824639480492976,
  55.044014020601949,
  64.137962336059871,
  78.888337266541228,
  69.582295055713956,
  64.088802847943214,
  63.294934863258113,
  62.362702781956301,
  71.749951063174393,
  70.603509520730938,
  76.950883206566914],
 3: [36.817229416000032,
  36.015167105705927,
  34.907748856567864,
  37.089787447798514,
  36.658492427702328,
  35.434033239986199,
  39.802309202295788,
  42.244606971046714,
  44.437773535643153,
  4

## Plot RSS for Subsets

In [13]:
# set x values according to the key and corresponding length of its dictionary
x = [j for j in range(1,9) for k in range(len(rss_values[j]))]

# set y values to k subsets(1-8)
y = []
for k in rss_values.keys():
    y += rss_values[k]

# graph scatter plot
plt.scatter(x,y)
plt.title("All Possible Subset Models for Prostate Cancer Data")
plt.xlabel("Subset Size k")
plt.ylabel("Residual Sum-Of-Squares")

Text(0,0.5,u'Residual Sum-Of-Squares')

## Part B: Digits Data Model

In [14]:
# Load the data set
digits = datasets.load_digits()

# Training data set
X_digits = digits.data

# Target data set 
y_digits = digits.target

# Allocate 2/3 of the data set as training & 1/3 as testing
X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, test_size=0.33)

# Print size of data subsets to verify splitting
print("Digits data set: {}".format(digits.data.shape))
print("X_train data set: {}".format(X_train.shape))
print("y_train target array: {}".format(y_train.shape))
print("X_test data set: {}".format(X_test.shape))
print("y_test target array: {}".format(y_test.shape))

Digits data set: (1797, 64)
X_train data set: (1203, 64)
y_train target array: (1203,)
X_test data set: (594, 64)
y_test target array: (594,)


In [15]:
# Fit a linear model to the training data set
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
print(regr.coef_)

[  8.98872033e-16   1.46030777e-03   5.67370039e-02  -3.44354433e-02
   6.93768173e-02  -1.22055438e-02  -4.92812263e-02  -1.59493441e-03
   1.26942504e+00  -4.53570596e-03   8.12224233e-02   6.17047291e-02
  -7.15957298e-02  -7.36968418e-02   9.38501100e-02   2.40263150e-01
  -1.29807199e-01  -4.32143356e-04   8.82414754e-02  -2.88116819e-02
  -6.40551403e-02   4.44380084e-02  -6.45103484e-02  -1.99037051e-01
  -1.67643677e-14  -1.43689159e-01   3.37046746e-02   8.62190092e-02
   6.99903529e-02   1.12719074e-01  -2.22544850e-02  -3.38887435e+00
   5.49560397e-15  -1.77191761e-01  -2.42842528e-02   1.51869405e-01
  -5.07588570e-02   4.21591107e-02  -1.10609495e-02   1.62092562e-14
   1.41682320e-01   1.62341918e-01  -3.23553137e-02  -1.34145281e-02
   1.35632981e-01   6.07393357e-02   3.12205713e-03   1.61412544e-01
   7.56059713e-01  -7.17522217e-02   3.01909620e-02  -6.68465764e-02
  -2.31248362e-01  -3.47702027e-02   8.63351025e-02  -1.40660253e-01
   0.00000000e+00  -5.12906035e-02

In [16]:
# Residual sum of squares
np.sum((regr.predict(X_test) - y_test)**2)

2266.5274397486091

In [25]:
# Explained variance score: 1 is a perfect prediction & 0 is no linear relationship between X and y
reg_score = regr.score(X_test, y_test)
reg_score

0.5468458685817148

In [18]:
predictors = list(X_digits[0])
print(predictors)

[0.0, 0.0, 5.0, 13.0, 9.0, 1.0, 0.0, 0.0, 0.0, 0.0, 13.0, 15.0, 10.0, 15.0, 5.0, 0.0, 0.0, 3.0, 15.0, 2.0, 0.0, 11.0, 8.0, 0.0, 0.0, 4.0, 12.0, 0.0, 0.0, 8.0, 8.0, 0.0, 0.0, 5.0, 8.0, 0.0, 0.0, 9.0, 8.0, 0.0, 0.0, 4.0, 11.0, 0.0, 1.0, 12.0, 7.0, 0.0, 0.0, 2.0, 14.0, 5.0, 10.0, 12.0, 0.0, 0.0, 0.0, 0.0, 6.0, 13.0, 10.0, 0.0, 0.0, 0.0]


## Plot RSS for Subsets

In [19]:
rss_values = {k for k in range(1,64)}
print(rss_values)

set([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63])


In [20]:
for i in range(X_train.shape[0]):
    beta = train(X_train, p)
    rss = evaluateRSS(X_train, p, beta)
    rss_values[len(p)].append(rss)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [29]:
for i in range(64):
    plt.figure()
    plt.plot(X_test, reg_score)

ValueError: x and y must have same first dimension, but have shapes (594, 64) and (1,)