**Imports**

In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# Import a bunch of libraries.
import time
import numpy as np
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

# Feature Selection
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# Regression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import LassoLars
from sklearn.neural_network import MLPRegressor

import matplotlib.mlab as mlab

# Set the randomizer seed so results are the same each time.
np.random.seed(0)

In [2]:
# Each record in the training data loads as a tuple.
# This function take an array of tuples and turns into a 2D numpy array
def SplitTuples(data):
    new_data = []
    for i in range(data.shape[0]):
        new_data.append(list(data[i]))
    data = np.array(new_data)
    return data

In [3]:
# Binarize the depth feature
def binarizeColumn(data,col,string):
    for i in range(data.shape[0]):
        data[i,col] = np.where(data[i,col] == string,1.0,0.0)
    return data

In [4]:
class AfricanDataSplit:
    # Initialize an instance of the class.
    # This class enables us to do PCA on only the wavelength portion of the 
    # Arican soil sample data, while leaving the rest alone
    
    def __init__(self, maxcol = 3577, pca_components=20):
        
        self.maxcol = maxcol
        self.pca_components = pca_components
        self.pca = PCA(n_components=pca_components)
        
    def fit(self, data, junk):
        
        maxcol = self.maxcol
        pca_components = self.pca_components
        
        # Split data into two sections       
        left_data = data[:,0:maxcol]
        right_data = data[:,maxcol:]
        
        # Fit PCA to the left data
        self.pca.fit(left_data)
        
        # Output variance fractions
        #print '\n-------------------------------------------'
        #print 'Fraction of the total variance in the training explained by first k components: \n'
        #for k in range(1,pca_components):
        #    s = sum(self.pca.explained_variance_ratio_[0:k]) 
        #    #SANITY CHECK: print np.cumsum(pca_all.explained_variance_ratio_)[k-1]
        #    print("%d \t %s" % (k, '{0:.2f}%'.format(s * 100)))
        

    # Make predictions for each test example and return results.
    def transform(self, data):

        # Get the split point
        maxcol = self.maxcol
        
        # Split the data into two sections       
        left_data = data[:,0:maxcol]
        right_data = data[:,maxcol:]
        
        # Transform the left data
        new_left_data = self.pca.transform(left_data)

        
        # Concatenate into new dataset
        new_data = np.concatenate((new_left_data, right_data), axis = 1)
        
        return new_data
    

**Loading Data**

In [5]:
# Load training data
X = np.genfromtxt('training.csv', 
                  delimiter=',', 
                  dtype=None,
                  skip_header = 1,
                  usecols=range(1, 3595)) # Load columns 1 to 3594 inclusive

X = SplitTuples(X)
X = binarizeColumn(X,3593,'Topsoil')
X = X.astype('float64')

n = np.genfromtxt('training.csv', 
                  delimiter=',', 
                  max_rows = 1,
                  names = True,
                  usecols=range(1, 3595)) # Load columns 1 to 3594 inclusive

# Extract feature names
feature_names = np.asarray(n.dtype.names)

PIDN = np.genfromtxt('training.csv',
                    delimiter=',',
                    dtype=None,
                    skip_header = 1,
                    usecols=0) # Load the PIDN for reference

Ca = np.genfromtxt('training.csv', 
                   delimiter=',', 
                   dtype=None,
                   skip_header = 1,
                   usecols=3595) # Load Mehlich-3 extractable Calcium data

P = np.genfromtxt('training.csv', 
                   delimiter=',', 
                   dtype=None,
                   skip_header = 1,
                   usecols=3596) # Load Mehlich-3 extractable Phosphorus data

pH = np.genfromtxt('training.csv', 
                   delimiter=',', 
                   dtype=None,
                   skip_header = 1,
                   usecols=3597) # Load pH data

SOC = np.genfromtxt('training.csv', 
                    delimiter=',', 
                    dtype=None,
                    skip_header = 1,
                    usecols=3598) # Load Soil Organic Carbon data

Sand = np.genfromtxt('training.csv', 
                     delimiter=',', 
                     dtype=None,
                     skip_header = 1,
                     usecols=3599) # Load Sand Content data

In [6]:
# Shuffle the input: create a random permutation of the integers between 0 and the number of data points and apply this
# permutation to X and Y.
# NOTE: Each time you run this cell, you'll re-shuffle the data, resulting in a different ordering.
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, Ca, P, pH, SOC, Sand = X[shuffle], Ca[shuffle], P[shuffle], pH[shuffle], SOC[shuffle], Sand[shuffle] 

# Define the size of the evaluation data set
evalSetSize = 100

# Define the size of the dev data set
devSetSize = 100

# Total size to hold out
holdOutSize = evalSetSize + devSetSize

eval_data = X[0:evalSetSize]
eval_Ca_labels = Ca[0:evalSetSize]
eval_P_labels = P[0:evalSetSize]
eval_pH_labels = pH[0:evalSetSize]
eval_SOC_labels = SOC[0:evalSetSize]
eval_Sand_labels = Sand[0:evalSetSize]

dev_data = X[evalSetSize:holdOutSize]
dev_Ca_labels = Ca[evalSetSize:holdOutSize]
dev_P_labels = P[evalSetSize:holdOutSize]
dev_pH_labels = pH[evalSetSize:holdOutSize]
dev_SOC_labels = SOC[evalSetSize:holdOutSize]
dev_Sand_labels = Sand[evalSetSize:holdOutSize]

dev_labels = [dev_Ca_labels, dev_P_labels, dev_pH_labels, dev_SOC_labels, dev_Sand_labels]
eval_labels = [eval_Ca_labels, eval_P_labels, eval_pH_labels, eval_SOC_labels, eval_Sand_labels]

outcome_vars = ['Ca', 'P', 'pH', 'Soc', 'Sand']

train_data = X[holdOutSize:]
train_Ca_labels = Ca[holdOutSize:]
train_P_labels = P[holdOutSize:]
train_pH_labels = pH[holdOutSize:]
train_SOC_labels = SOC[holdOutSize:]
train_Sand_labels = Sand[holdOutSize:]
train_labels = [train_Ca_labels, train_P_labels, train_pH_labels, train_SOC_labels, train_Sand_labels]

print(eval_Ca_labels.shape)
print(dev_Ca_labels.shape)
print(train_Ca_labels.shape)


print('Number of features:', dev_data.shape[1])
print('Number of training examples:', train_data.shape[0])
print('Number of dev examples:', dev_data.shape[0])
print('Number of eval examples:', eval_data.shape[0])


# Load test data
test_x = np.genfromtxt('sorted_test.csv', 
                                delimiter=',', 
                                dtype=None,
                                skip_header = 1,
                                usecols=range(1, 3595)) # Load columns 0 to 3594 inclusive

test_ids = np.genfromtxt('sorted_test.csv', 
                                delimiter=',', 
                                dtype=None,
                                skip_header = 1,
                                usecols=0) # Load columns 0 to 3594 inclusive

test_x = SplitTuples(test_x)
test_x = binarizeColumn(test_x,3593,'Topsoil')
test_x = test_x.astype('float64')

print('Number of test examples:', test_x.shape[0])

scaler = Normalizer().fit(train_data)   
transformedTrainData = scaler.transform(train_data)
transformedDevData = scaler.transform(dev_data)


#scaler = Normalizer().fit(train_data)
transformedEvalData = scaler.transform(eval_data)
#transformedEvalData = eval_data

#scaler = Normalizer().fit(train_data)
normalizedTestData = scaler.transform(test_x)
#normalizedTestData = test_x



(100L,)
(100L,)
(957L,)
('Number of features:', 3594L)
('Number of training examples:', 957L)
('Number of dev examples:', 100L)
('Number of eval examples:', 100L)
('Number of test examples:', 727L)


Experiment on PCA: Analyze the explaned variances for PCA over our features. We observe that the first 20 components explain increasing portions of the variance, however after 20 components, the subsequent ones don't really help.

**Feature selectors**

In [7]:
def getFeatureSelectors():
     return [
        #['lasso', SelectFromModel(LassoCV()) ], # Doesn't work
        #['linearc0.01', SelectFromModel(LinearSVC(C=0.01, penalty="l1")) ],
        #['linearc0.1', SelectFromModel(LinearSVC(C=0.1, penalty="l1")) ],
        #['linearc11', SelectFromModel(LinearSVC(C=1, penalty="l1")) ],
        #['kbest100', SelectKBest(k=100)],
        #['kbest250', SelectKBest(k=250)],
        #['pca5', PCA(n_components=5)],
        #['pca10', PCA(n_components=10)],
        ['ads', AfricanDataSplit(pca_components=20)],
        ['pca20', PCA(n_components=20)],
        ['pca30', PCA(n_components=30)],
        ['pca30rbf', KernelPCA(n_components=30,kernel='rbf')],
        ['pca20kbest5', FeatureUnion([("pca5", PCA(n_components=20)), ("kbest5", SelectKBest(k=5))])],
        ['pca20kbest50', FeatureUnion([("pca5", PCA(n_components=20)), ("kbest50", SelectKBest(k=50))])],
        ['pca20kbest250', FeatureUnion([("pca5", PCA(n_components=20)), ("kbest250", SelectKBest(k=250))])],
     ]


**Classifiers**

In [8]:
# Predict the mean value of an array
def PredictMean(labels):
    mean = np.mean(labels)
    return mean

In [9]:
# Get the means of the dev data for our reference

Ca_mean = PredictMean(dev_Ca_labels)
print('Calcium Mean: ', Ca_mean)

P_mean = PredictMean(dev_P_labels)
print('Phosphorus Mean: ', P_mean)

pH_mean = PredictMean(dev_pH_labels)
print('pH Mean: ', pH_mean)

SOC_mean = PredictMean(dev_SOC_labels)
print('SOC Mean: ', SOC_mean)

Sand_mean = PredictMean(dev_Sand_labels)
print('Sand Mean: ', Sand_mean)

('Calcium Mean: ', 0.060906193376832396)
('Phosphorus Mean: ', 0.042220159248226287)
('pH Mean: ', 0.027768020593356191)
('SOC Mean: ', 0.021567274983507304)
('Sand Mean: ', 0.058996238160944625)


In [10]:
def getClassifiers():
     return [
        ['KNN', KNeighborsRegressor(), {'n_neighbors':[1, 2, 3, 5, 8]}],
        ['SVRdict', SVR(cache_size=200), {'C':[0.1,1.0,100.0,1000.0],'kernel':['linear','rbf']}],
        #['SVR', SVR(cache_size=200, kernel='linear', C=1.0, epsilon=0.05, shrinking=False),{}]
        ['Lasso', Lasso(), {'alpha':[0.01, 0.05, 0.25, 0.9]}],
        #['LassoLars', LassoLars(), {'alpha':[0.01, 0.1, 0.5, 1.0]}],            
        ['RandomForest', RandomForestRegressor(), {'n_estimators':[1, 2, 3, 5, 8]}],
        ['nn', MLPRegressor(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(2, 2)), {}]
        #['RandomForest', RandomForestRegressor(), {'n_estimators':[1, 2, 3, 5, 8]}]
    ]


**Test combinations of selectors and classifiers**

In [11]:


# For each outcome variable, for each classifier and for each selector, it will obtain and print 
# the best hyperparameters and inmediately print the mean squared error.
# Finally, after finishing the calculations, it will print the methods and score ordered by score (MSE)
# To see what would be the final score we'd get, you can get the best MSE score for each outcome variable,
# and calculate the average of them.
def run():
    
    allResults = []
    
    #scaler = Normalizer().fit(train_data)   
    #transformedTrainData = scaler.transform(train_data)
    #transformedDevData = scaler.transform(dev_data)

    
    for outcomeVarIndex in range(0, 5):
        print('*************************************************************')
        print('Outcome Variable:', outcome_vars[outcomeVarIndex])
        print('*************************************************************')

        results = []
            
        scaler = Normalizer().fit(train_data)
        
        # Get the mean value of the outcome variable
        DevMean = PredictMean(dev_labels[outcomeVarIndex])

        for selector in getFeatureSelectors():
            
            #selectedTrainData = selector[1].fit(transformedTrainData, train_labels[outcomeVarIndex]).transform(transformedTrainData)
            selector[1].fit(transformedTrainData, train_labels[outcomeVarIndex])
            selectedTrainData = selector[1].transform(transformedTrainData)
            selectedDevData = selector[1].transform(transformedDevData)
                
            for classifier in getClassifiers():

                print('-------------------------------------------------------')
                print(selector[0] + ' ' + classifier[0])

                grid_search = GridSearchCV(classifier[1], param_grid=classifier[2],cv=5)

                grid_search.fit(selectedTrainData, train_labels[outcomeVarIndex])
                print(grid_search.best_estimator_)

                # Mean Squared Error:  (y_true - y_pred)**2.sum()
                meanSquaredError = 0.0
                for i in range(len(selectedDevData)):
                    diff = grid_search.predict(selectedDevData[i].reshape(1, -1)) - dev_labels[outcomeVarIndex][i]
                    squaredDiff = diff ** 2
                    meanSquaredError = meanSquaredError + squaredDiff
                    
                #meanSquaredError = meanSquaredError / float(len(selectedDevData))
                
                # Residual Sum of squares:  (y_true - y_mean)**2.sum()
                residualSquaredError = 0.0
                for i in range(len(selectedDevData)):
                    diff = dev_labels[outcomeVarIndex][i] - DevMean
                    squaredDiff = diff ** 2
                    residualSquaredError = residualSquaredError + squaredDiff     
                    
                myScore = 1 - meanSquaredError/residualSquaredError
                
                print('Mean Squared Error: ', str(meanSquaredError / float(len(selectedDevData))))
                print('Residual Squared Error: ', str(residualSquaredError))
                print('Calculated Score: ', str(1 - meanSquaredError/residualSquaredError))
                print('Score: ' + str(grid_search.score(selectedDevData, dev_labels[outcomeVarIndex])))
                
                # Store in an array, for each combination, the following:
                # [selector name, classifier name, mean squared error, selector instance, classifier instance]
                results.append([selector[0], classifier[0], myScore, selector[1], grid_search])

                                                            
        sortedResults = sorted(results, key=lambda result: result[2], reverse=True)
        for result in sortedResults:
            print('Selector: ' + str(result[0]) + ', Classifier: ' + str(result[1])  + ', Score: ' + str(result[2]))
        
        copyResults = sortedResults[:]
        allResults.append(copyResults)
        
    #TODO Calculate columnwise mean of the mean squared error
    #Each item has the best result on item 0
    squaredErrorSum = 0.0
    
    bestModels = []
    print('-------------------------------------------------------')
    print('Best Results')
    print('-------------------------------------------------------')
    for i in range(len(allResults)):

        columnResults = allResults[i]
        bestColumnResult = columnResults[0]
        squaredError = bestColumnResult[2]
        print('Outcome Variable: ' + outcome_vars[i] + ', Selector: ' + str(bestColumnResult[0]) + 
              ', Classifier: ' + str(bestColumnResult[1])  + ', Score: ' + str(bestColumnResult[2]))
        squaredErrorSum = squaredErrorSum + squaredError
        bestModels.append(bestColumnResult)
        
        
    print('Best result obtained: ' + str(squaredErrorSum / 5.0))
    
    return bestModels

models = run()


*************************************************************
('Outcome Variable:', 'Ca')
*************************************************************
-------------------------------------------------------
ads KNN
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')
('Mean Squared Error: ', '[ 0.20256988]')
('Residual Squared Error: ', '126.489297838')
('Calculated Score: ', '[ 0.83985216]')
Score: 0.839852161391
-------------------------------------------------------
ads SVRdict
SVR(C=1000.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
('Mean Squared Error: ', '[ 0.23236578]')
('Residual Squared Error: ', '126.489297838')
('Calculated Score: ', '[ 0.8162961]')
Score: 0.816296096016
-------------------------------------------------------
ads Lasso
Lasso(alpha=0.01, copy_X=True, fit_int

**Predictions based on our dev data**


Test on the evaluation data

In [12]:
myScore = []

#scaler = Normalizer().fit(train_data)
#transformedEvalData = scaler.transform(eval_data)
#transformedEvalData = eval_data

# Use the appropriate model to estimate the 5 outcome variables
for outcomeVarIndex in range(0, 5):
        
    # Grab selector and classifier
    selector = models[outcomeVarIndex][3]
    classifier = models[outcomeVarIndex][4]
        
    # Transform the input variables
    selectedSample = selector.transform(transformedEvalData)
        
    # Predict
    myScore.append(classifier.score(selectedSample,eval_labels[outcomeVarIndex]))
        
print(myScore)    

[0.62122203585477465, 0.35467539089637223, 0.70822872540118986, 0.94364298626380738, 0.85257083079361018]


In [13]:
scaler = Normalizer().fit(train_data)
normalizedTestData = scaler.transform(test_x)
#normalizedTestData = test_x

allPredictions = []

# Iterate through test samples
for sampleIndex in range(len(test_x)):
    
    sampleId = test_ids[sampleIndex]
    sample = normalizedTestData[sampleIndex]
    
    currentSamplePredictions = []
    
    # Use the appropriate model to estimate the 5 outcome variables
    for outcomeVarIndex in range(0, 5):
        
        # Grab selector and classifier
        selector = models[outcomeVarIndex][3]
        classifier = models[outcomeVarIndex][4]
        
        # Transform the input variables
        selectedSample = selector.transform(sample.reshape(1, -1))
        
        # Predict
        predicted = classifier.predict(selectedSample.reshape(1, -1))
        
        # Store
        currentSamplePredictions.append(predicted[0])
    
    allPredictions.append(currentSamplePredictions)
    

print(allPredictions)
       

[[-0.37661254480705231, -0.30763825587811122, -0.44294710961295936, -0.52851457155854265, 0.57477975792323166], [0.5092224628291071, -0.31317180139216938, 0.59545642469538462, -0.12506326637089127, -0.72000738362878969], [-0.30011569198230503, -0.26129481219787287, -0.032017197777520767, 0.04913013226720353, -0.90269340007351995], [0.04464046146229439, -0.31040502863514041, -0.35981053392782814, 0.80672765286021719, -0.36605322676712648], [-0.37661254480705231, -0.3041797899318246, -1.4214430310899222, -0.011121905454265502, -0.37366514411899021], [-0.37661254480705231, -0.37265741566829658, -0.53690502792164851, -0.87464705847220015, 0.28476570681722263], [-0.37661254480705231, -0.25576126668381438, -0.68133553204160124, 0.61393317248660018, -1.3160205122797199], [-0.37661254480705231, -0.25921973263010079, 0.73427756097558428, -0.11496707506914511, 0.3479446208376924], [-0.37661254480705231, 0.73889353946816994, 0.47518190361304802, -0.024353167641585238, -0.68879852248614892], [-0.1

**Generate CSV file for kaggle submission**

In [14]:
# Print header
header = 'PIDN,Ca,P,pH,SOC,Sand'
#np.savetxt('test.out', header, delimiter=',')  

filename = 'jds1.txt'
# Clean file
open(filename, 'w').close()
with open(filename, 'w') as f:
    f.write('PIDN,Ca,P,pH,SOC,Sand\n')  # python will convert \n to os.linesep

    # Iterate through test samples
    for i in range(len(allPredictions)):

        pred = allPredictions[i]
        testId = test_ids[i]
        text = testId + ',' + str(pred[0]) + ',' + str(pred[1]) + ',' + str(pred[2]) + ',' + str(pred[3]) + ',' + str(pred[4]) + '\n'
        f.write(text) 
    
f.close()

Investigation: Sand variable