In [1]:
# findspark.init is given the local Spark installation path before anything
# from pyspark is imported.
import findspark
findspark.init("/home/wesmail/Panda/spark-2.2.1-bin-hadoop2.7")

from pyspark.sql import SparkSession

# Build the session (or reuse an existing one, via getOrCreate) with a
# "local[2]" master and the executor-memory setting below.
# NOTE(review): the app name mentions linear regression, while the model
# trained later in this notebook is a RandomForestClassifier -- confirm the
# intended name.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Linear Regrssion Model")
    .config("spark.executor.memory", "8gb")
    .getOrCreate()
)

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

# LOAD HERE
from root_numpy import root2array, tree2array, rec2array
import numpy as np
import pandas as pd

# Comma-separated names of the tree branches used as input columns. The line
# breaks inside the literal are layout only; the whitespace they leave around
# a name is stripped when the list is built.
_raw_branches = """momentum,energy,position,MvdDEDX,MvdHits,SttMeanDEDX,SttHits,GemHits,TofStopTime,
TofM2,TofTrackLength,TofQuality,TofBeta,DrcThetaC,DrcThetaCErr,DrcQuality,DrcNumberOfPhotons,
DiscThetaC,DiscQuality,DiscNumberOfPhotons,
RichThetaC,RichQuality,RichNumberOfPhotons,
EmcRawEnergy,EmcCalEnergy,EmcQuality,EmcNumberOfCrystals,EmcNumberOfBumps,EmcModule,
EmcZ20,EmcZ53,EmcLat,EmcE1,EmcE9,EmcE25,MuoProbability,MuoQuality,MuoIron,MuoMomentumIn,MuoNumberOfLayers,MuoModule,MuoHits,
DegreesOfFreedom,FitStatus,ChiSquared"""

# One clean name per comma-separated field, kept in the order written above.
branch_names = [field.strip() for field in _raw_branches.split(",")]

def _load_species(root_file):
    """Read tree "t1" of ``root_file``, restricted to ``branch_names``, and
    return the result of passing that record array through ``rec2array``."""
    return rec2array(root2array(root_file, "t1", branch_names))


# One array per particle species, each read from its own ROOT file.
electrons = _load_species("/home/wesmail/Downloads/treeElectrons.root")
pions = _load_species("/home/wesmail/Downloads/treePions.root")
muons = _load_species("/home/wesmail/Downloads/treeMuons.root")
kaons = _load_species("/home/wesmail/Downloads/treeKaons.root")
# NOTE(review): the variable is called anti_p but the file is
# treeProtons.root -- confirm which species this sample holds.
anti_p = _load_species("/home/wesmail/Downloads/treeProtons.root")

# Stack the samples row-wise. A row's class label is the position of its
# sample in this tuple: 0=electrons, 1=pions, 2=muons, 3=kaons, 4=anti_p.
_species_samples = (electrons, pions, muons, kaons, anti_p)
X = np.concatenate(_species_samples)
y = np.concatenate([
    class_id * np.ones(sample.shape[0])
    for class_id, sample in enumerate(_species_samples)
])

# Append the label vector to X as one extra trailing column, then name the
# columns after the branches followed by 'label'.
_features_and_label = np.hstack((X, y[:, np.newaxis]))
df = pd.DataFrame(_features_and_label, columns=branch_names + ['label'])

# Hand the pandas frame to Spark. Despite its name, sparkRDD holds whatever
# spark.createDataFrame returns.
sparkRDD = spark.createDataFrame(df)

Welcome to ROOTaaS 6.06/02


In [2]:
# Assemble the branch columns into a single "features" column.
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=branch_names,
    outputCol="features",
)

# `data` is the transformed frame, i.e. sparkRDD run through the assembler.
data = assembler.transform(sparkRDD)

In [3]:
from pyspark.ml.classification import RandomForestClassifier

# Weighted 70/30 random split into training and held-out test rows; the split
# is seeded with 0.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=0)

# Random-forest classifier over the 'features' column, predicting 'label'.
# NOTE(review): no seed is passed to the classifier itself -- confirm whether
# the trained model needs to be reproducible between runs.
dt = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    numTrees=20,
    maxDepth=15,
)

# Train on the training split.
model = dt.fit(trainingData)

# Score the held-out rows and keep only the two columns used for evaluation,
# in (prediction, label) order.
predictions = model.transform(testData)
predictionAndLabels = predictions.select("prediction", "label")


In [4]:
from pyspark.mllib.evaluation import MulticlassMetrics

# Take the RDD behind the (prediction, label) DataFrame and print its repr.
mypred = predictionAndLabels.rdd
print(mypred)

# Evaluation metrics computed from that RDD.
metrics = MulticlassMetrics(mypred)

# Confusion matrix as an array of raw counts, printed before any
# normalisation is applied.
cm = metrics.confusionMatrix().toArray()
print(cm)

# Normalize and plot the confusion matrix (Rows = True values, Coloumns = Predicted Values)
# normalized over coloumns (axis=1)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib tk
figcm, ax = plt.subplots()
cm = cm.astype('float') / cm.sum(axis=1)
sns.set(font_scale=2)
sns.heatmap(cm, square=True, annot=True, cbar=False)
classes=['e-','pi-', 'mu-', 'k-', 'p-']    # 0 1 2 3 4 
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=90)
ax.set_xticks(np.arange(len(classes))+0.5, minor=False)
plt.yticks(tick_marks, classes)
ax.set_yticks(np.arange(len(classes))+0.5, minor=False)
plt.xlabel('predication', horizontalalignment = 'center')
plt.ylabel('true value')

MapPartitionsRDD[72] at javaToPython at NativeMethodAccessorImpl.java:0
[[9987.  123.   65.   73.   59.]
 [ 205. 7501. 1138.  892.  376.]
 [ 124.  807. 9242.  210.   27.]
 [ 117. 1093.  408. 7179.  987.]
 [ 118.  275.  108. 1044. 7984.]]


Text(113.922,0.5,u'true value')