#Random Forest for Star-Galaxy Classification using .fits
This is nearly identical to stargalaxy, but is specific to the .fits data

The following code points Python at a local Spark installation so that PySpark can be used from the IPython Notebook

In [1]:
import os
import sys

# Point SPARK_HOME at the local Spark build (built with sbt/sbt assembly).
# NOTE: these paths are specific to the original author's machine and must
# be edited before the notebook will run anywhere else.
os.environ['SPARK_HOME']="/Users/blorangest/Desktop/spark-1.3.1-bin-hadoop2.6"
# Extend the module search path so that pyspark and its bundled py4j
# can be imported below.
sys.path.append("/Users/blorangest/Desktop/spark-1.3.1-bin-hadoop2.6/python")
sys.path.append(os.path.join(os.environ['SPARK_HOME'], 'python/lib/py4j-0.8.2.1-src.zip'))
sys.path.append("/Library/Python/2.7/site-packages") # location of pyfits and other Python modules on the local machine


# The Spark modules can only be imported after the path setup above.
# If any of them is missing, report the error and exit with status 1.
try:
    from pyspark.mllib.tree import RandomForest
    from pyspark.mllib.tree import DecisionTreeModel
    from pyspark.mllib.util import MLUtils
    from pyspark.mllib.regression import LabeledPoint
    from pyspark import SparkContext

except ImportError as e:
    # NOTE: with the Python 2 print statement this prints a 2-tuple
    # (message, exception) rather than a formatted line.
    print ("Error importing Spark Modules", e)
    sys.exit(1)
import numpy as np
import pyfits
import shutil

Now we set some variables that determine the properties of the random forest. `dataFile` is the path to the .fits training set. `test_size` is the fraction (between 0 and 1, not a percentage) of the data that will be held out to test the model. `num_trees` is the number of trees in the forest. `max_depth` is the maximum depth of each tree; it must be no more than 30. `k` is the number of folds used for k-fold cross-validation.

In [2]:
# Path of the FITS file that is opened with pyfits further down.
dataFile = "./round4_training_set.fits"
# Fraction of the data held out as the test set by testOnce
# (passed to randomSplit as [1 - test_size, test_size]).
test_size = 0.2
# Number of trees passed to RandomForest.trainClassifier.
num_trees = 50
# Maximum tree depth passed to RandomForest.trainClassifier
# (the accompanying notebook text says it must be no more than 30).
max_depth = 8
# Number of folds used by kfolds.
k = 5

This function saves a given RDD as a text file, first deleting any existing output at the same path

In [3]:
def save(rdd, filename):
    """Write ``rdd`` to ``filename`` as text, replacing any previous output.

    Any existing directory tree at ``filename`` is removed first so that a
    re-run does not collide with the output of an earlier run.

    Args:
        rdd: Object exposing ``saveAsTextFile(path)`` (a Spark RDD here).
        filename: Output path; deleted recursively if it already exists.

    Raises:
        OSError: If ``filename`` exists but cannot be removed.
    """
    import errno  # local import: only this function needs it

    try:
        shutil.rmtree(filename)
    except OSError as err:
        # A missing path is the normal first-run case and is ignored.
        # Anything else (permissions, path is a regular file, ...) is a
        # real failure and is surfaced instead of being silently swallowed.
        if err.errno != errno.ENOENT:
            raise
    rdd.saveAsTextFile(filename)

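This function is a slow way to get, for each object, the classification probability (the mean of the individual tree predictions) and the summed per-tree score (the number of trees that classify the object as a star, when every tree predicts 0 or 1)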

In [4]:
def get_probs(model, data):
    """Score ``data`` with every tree of ``model`` individually (slow).

    Each tree of the forest predicts the whole dataset on its own and the
    per-tree predictions are added up point by point.

    Args:
        model: Trained forest exposing ``numTrees()`` and ``_java_model``.
        data: RDD whose elements have a ``features`` attribute.

    Returns:
        Tuple ``(probs, scores)`` of RDDs: ``scores`` is the per-point sum
        of the tree predictions and ``probs`` is that sum divided by the
        number of trees.
    """
    # The individual decision trees, as held by the underlying Java model
    java_trees = model._java_model.trees()
    tree_count = model.numTrees()
    features = data.map(lambda point: point.features)

    # Start from the first tree, then fold every remaining tree's
    # predictions into the running per-point total.
    totals = DecisionTreeModel(java_trees[0]).predict(features)
    index = 1
    while index < tree_count:
        tree_pred = DecisionTreeModel(java_trees[index]).predict(features)
        totals = totals.zip(tree_pred).map(lambda pair: pair[0] + pair[1])
        index += 1

    # Averaging the totals over the number of trees gives the probabilities
    averaged = totals.map(lambda total: total/tree_count)
    return averaged, totals

Compute test error by thresholding probabilistic predictions

In [5]:
def probTest(testData, model):
    """Compute accuracy by thresholding the averaged per-tree predictions.

    Args:
        testData: RDD whose elements expose ``label`` and ``features``
            (LabeledPoint in this notebook).
        model: Trained random forest, passed through to ``get_probs``.

    Returns:
        Tuple ``(accuracy, scores)``: ``accuracy`` is the fraction of points
        whose thresholded prediction equals the label, and ``scores`` is the
        second RDD returned by ``get_probs``.
    """
    threshold = 0.5
    probs, scores = get_probs(model, testData)
    pred = probs.map(lambda p: 0 if p < threshold else 1)
    lab_pred = testData.map(lambda lp: lp.label).zip(pred)
    # Index into the (label, prediction) pair instead of unpacking it in the
    # lambda signature: tuple parameters are a syntax error on Python 3.
    n_wrong = lab_pred.filter(lambda vp: vp[0] != vp[1]).count()
    err = n_wrong / float(testData.count())
    return (1 - err), scores

Tests the random forest classifier once

In [7]:
def testOnce(data):
    """Train and evaluate the random forest on one random train/test split.

    The split uses the module-level ``test_size``; the forest is trained
    with ``num_trees`` and ``max_depth``. Purity and completeness for label
    0 (galaxy) and label 1 (star), the plain accuracy and the thresholded
    accuracy from ``probTest`` are printed. Every
    ((label, prediction), probability) record is saved to 'answers'.

    Args:
        data: RDD of LabeledPoint.
    """
    # split the data into training and testing sets
    (trainingData, testData) = data.randomSplit([1-test_size, test_size])
    # train the random forest
    model = RandomForest.trainClassifier(trainingData, numClasses=3, categoricalFeaturesInfo={},
                                         numTrees=num_trees, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth = max_depth, maxBins=32)
    # test the random forest
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

    # The (label, prediction) pairs are indexed rather than unpacked in the
    # lambda signature: tuple parameters are a syntax error on Python 3.
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    # Confusion counts: M* = misclassified, N* = correctly classified;
    # g = true galaxy (label 0), s = true star (label 1).
    Mg = float(labelsAndPredictions.filter(lambda vp: vp[0] == 0 and vp[1] == 1).count())
    Ng = float(labelsAndPredictions.filter(lambda vp: vp[0] == 0 and vp[1] == 0).count())
    Ms = float(labelsAndPredictions.filter(lambda vp: vp[0] == 1 and vp[1] == 0).count())
    Ns = float(labelsAndPredictions.filter(lambda vp: vp[0] == 1 and vp[1] == 1).count())

    probsAndScores = probTest(testData, model)
    threshold_accuracy = probsAndScores[0]
    # probTest returns the summed per-tree scores; divide by the number of
    # trees to get a per-point probability.
    probs = probsAndScores[1].map(lambda x: x/num_trees)
    labelsAndPredictions = labelsAndPredictions.zip(probs)
    save(labelsAndPredictions, 'answers')

    # Purity divides by everything predicted as the class; completeness
    # divides by everything that truly belongs to the class.
    print ('Galaxy Purity = ' + str(Ng / (Ng+Ms)))
    print ('Galaxy Completeness = ' + str(Ng / (Ng+Mg)))
    print ('Star Purity = ' + str(Ns / (Ns+Mg)))
    print ('Star Completeness = ' + str(Ns/(Ns+Ms)))
    print ('Accuracy = ' + str(1 - testErr))
    print ('Threshold method accuracy = ' + str(threshold_accuracy))

Performs k-fold cross-validation

In [8]:
def kfolds(data):
    """Run k-fold cross-validation of the random forest and print averages.

    ``data`` is split at random into ``k`` (module-level) disjoint folds of
    approximately equal size. Each fold is used once as the test set while
    a forest is trained on the union of the remaining folds. The purity and
    completeness for label 0 (galaxy) and label 1 (star) and the accuracy
    are averaged over the folds and printed.

    Args:
        data: RDD of LabeledPoint.

    Raises:
        ValueError: If the module-level ``k`` is smaller than 2.
    """
    if k < 2:
        raise ValueError('k-fold cross-validation needs k >= 2, got ' + str(k))

    acc = 0
    spurity = 0
    scomp = 0
    gpurity = 0
    gcomp = 0

    # Equal weights give k disjoint folds that together cover the data.
    # (The original author noted that MLUtils.kFold would do this in Java.)
    # NOTE(review): the previous takeSample/subtract approach relied on
    # equality and hashing of LabeledPoint objects, which LabeledPoint does
    # not appear to define -- test points were then likely left in the
    # training set. It also pulled every fold through the driver. Confirm
    # against the PySpark version in use.
    folds = data.randomSplit([1.0] * k)
    for i, test in enumerate(folds):
        train = sc.union([fold for j, fold in enumerate(folds) if j != i])
        # train the random forest
        model = RandomForest.trainClassifier(train, numClasses=3, categoricalFeaturesInfo={},
                                     numTrees=num_trees, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth = max_depth, maxBins=32)

        predictions = model.predict(test.map(lambda x: x.features))
        labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
        # Pairs are indexed rather than unpacked in the lambda signature:
        # tuple parameters are a syntax error on Python 3.
        testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())
        # Confusion counts: M* = misclassified, N* = correctly classified;
        # g = true galaxy (label 0), s = true star (label 1).
        Mg = float(labelsAndPredictions.filter(lambda vp: vp[0] == 0 and vp[1] == 1).count())
        Ng = float(labelsAndPredictions.filter(lambda vp: vp[0] == 0 and vp[1] == 0).count())
        Ms = float(labelsAndPredictions.filter(lambda vp: vp[0] == 1 and vp[1] == 0).count())
        Ns = float(labelsAndPredictions.filter(lambda vp: vp[0] == 1 and vp[1] == 1).count())

        gpurity += (Ng / (Ng+Ms))
        gcomp += (Ng / (Ng+Mg))
        spurity += (Ns / (Ns+Mg))
        scomp += (Ns/(Ns+Ms))
        acc += (1 - testErr)

    print ('with '+ str(k) + ' folds:')
    print ('Average Galaxy Purity = ' + str(gpurity / k))
    print ('Average Galaxy Completeness = ' + str(gcomp / k))
    print ('Average Star Purity = ' + str(spurity / k))
    print ('Average Star Completeness = ' + str(scomp / k))
    print ('Average Accuracy = ' + str(acc / k))
            

Creates the SparkContext, then gets the HDUList of data from the file

In [9]:
# Create the Spark context; kfolds and the RDD construction further down
# reach it through the module-level name `sc`.
sc = SparkContext(appName="stargalaxy")
# Open the FITS file. The handle is never closed explicitly, and the name
# `x` is reused as a scratch variable by later cells.
x = pyfits.open(dataFile)
# Column names and table rows of HDU 1.
# NOTE(review): assumes the catalogue table is the first extension -- confirm.
# `headers` is not used anywhere else in this section.
headers = x[1].columns.names
data = x[1].data

Gets magnitudes from HDUlist and puts them in a matrix represented by nested lists

In [10]:
# Magnitude columns to read, one per band, in the order used for the colours.
magHeads = ['MAG_AUTO_G','MAG_AUTO_R','MAG_AUTO_I','MAG_AUTO_Z','MAG_AUTO_Y']
# Pull each column out of the table once, then transpose so that mags holds
# one list per catalogue row with one magnitude per band (magHeads order).
mag_columns = [data[band] for band in magHeads]
mags = [list(row) for row in zip(*mag_columns)]

Gets the rest of the wanted features and puts them in a matrix represented by nested lists

In [11]:
# Non-magnitude feature columns to read from the table.
want = ['WAVG_SPREAD_MODEL_G','WAVG_SPREAD_MODEL_R','WAVG_SPREAD_MODEL_I','WAVG_SPREAD_MODEL_Z','WAVG_SPREAD_MODEL_Y','A_IMAGE','B_IMAGE']
# Class label of every row, as a plain Python list.
labels = data['TRUE_CLASS'].tolist()
# Pull each column out of the table once, then transpose so that rawdata
# holds one mutable feature list per catalogue row (columns in `want` order).
feature_columns = [data[name] for name in want]
rawdata = [list(row) for row in zip(*feature_columns)]

Calculate the colors (the differences between the magnitudes of adjacent bands) and append them to each row of the nested list of features

In [12]:
# Append one colour per pair of adjacent bands (mag[j] - mag[j+1]) to the
# feature list of every row; rawdata and mags are row-aligned.
for features, row_mags in zip(rawdata, mags):
    features.extend(m0 - m1 for m0, m1 in zip(row_mags, row_mags[1:]))

Reformat the data as a list of LabeledPoints and then reformat that list into an RDD

In [13]:
# Pair every label with its feature row as a LabeledPoint and distribute the
# result as an RDD. `data` is rebound from the FITS table to that RDD,
# which is what testOnce and kfolds expect.
data = sc.parallelize(
    [LabeledPoint(label, features) for label, features in zip(labels, rawdata)])

Test the classifier

In [14]:
# Train on one random split of the data, evaluate on the held-out part and
# print the purity, completeness and accuracy figures.
testOnce(data)

Galaxy Purity = 0.964799776949
Galaxy Completeness = 0.996364814282
Star Purity = 0.958708094849
Star Completeness = 0.698956780924
Accuracy = 0.964290301863
Threshold method accuracy = 0.964290301863
