In [1]:
import utils
import numpy as np
import pandas as pd
from mathutils.geometry import intersect_point_line
from numpy.linalg import norm

print("Loading in data...")
# Loading data sets
PATH = r"./data/"
train, test = utils.load_small_data_csv(PATH,"train_smaller100.csv.gz","test_smaller100.csv.gz", utils.SIMPLE_FEATURE_COLUMNS)
print("Finished loading in data, starting feature generation for train dataset")
print("Starting kink")
#### TRAIN FEATURES ####

DataSet = train
Location_info = DataSet.loc[: , "MatchedHit_X[0]":"MatchedHit_Z[3]"]
countertrain = 0
countertest = 0

Loading in data...
Finished loading in data, starting feature generation for train dataset
Starting kink


In [30]:
PointResiduals = np.array([])
Angles = np.array([])
LineSlope1 = np.array([])
LineSlope2 = np.array([])
LineSlope3 = np.array([])
FourthPointResiduals = np.array([])
FirstPointResiduals = np.array([])
Shape = DataSet.shape[0]

train['PointResiduals'] = pd.Series(PointResiduals)
train['Angles'] = pd.Series(Angles)
train['LineSlope1'] = pd.Series(LineSlope1)
train['LineSlope2'] = pd.Series(LineSlope2)
train['LineSlope3'] = pd.Series(LineSlope3)
train['FirstPointResiduals'] = pd.Series(FirstPointResiduals)
train['FourthPointResiduals'] = pd.Series(FourthPointResiduals)

train.to_csv(r"./data" + r"/trainMoreFeatures" + ".csv")


In [31]:
for i in range(0,DataSet.shape[0]):
    ResidualsSize = 0

    if i % 10000 == 0:
        print(" Evaluated " + str(i) + " features out of a total of " + str(Shape)) 

    LineSlope = np.array([])
    # Extracting info on the i-th particle's coordinates
    Particle_Path_Points = Location_info.loc[i,:]
    X = Particle_Path_Points.loc['MatchedHit_X[0]':'MatchedHit_X[3]'].values
    Y = Particle_Path_Points.loc['MatchedHit_Y[0]':'MatchedHit_Y[3]'].values
    Z = Particle_Path_Points.loc['MatchedHit_Z[0]':'MatchedHit_Z[3]'].values

    data = np.concatenate((X[:, np.newaxis], 
                        Y[:, np.newaxis], 
                        Z[:, np.newaxis]), 
                        axis=1)

    datamean = data.mean(axis=0)

    # uu, dd and vv contain information on the fit. In fact, vv[0] contains the direction of the best fit (least squares)
    uu, dd, vv = np.linalg.svd(data - datamean)

    # Best fit line with length between -2500 and 2500 with 2 datapoints (it's a straight line so that's enough)
    linepts = vv[0] * np.mgrid[-2500:2500:2j][:, np.newaxis]
    LineSlope = vv[0]
    LineSlope1 = np.append(LineSlope1, LineSlope[0])
    LineSlope2 = np.append(LineSlope2, LineSlope[1])
    LineSlope3 = np.append(LineSlope3, LineSlope[2])


    # Shift by the mean to get the line in the right place (centered)
    linepts += datamean

    # Do a little loop where we add the residuals of all four of the points
    for j in range (0,4):
        intersect = intersect_point_line(data[j], linepts[0], linepts[1])
        ResidualsSize += abs(sum(data[j] - intersect[0]))

    PointResiduals = np.append(PointResiduals, ResidualsSize)

    # Singular value decomposition on first line to get its derivative (angle)
    DataFirstTwo = data[0:2,:]
    datamean = DataFirstTwo.mean(axis=0)
    uu, dd, vv = np.linalg.svd(DataFirstTwo - datamean)
    FirstLineAngle = vv[0]


    # Singular value decomposition on second line to get its derivative (angle)
    DataSecondTwo = data[2:4,:]    
    datamean = DataSecondTwo.mean(axis=0)
    uu, dd, vv = np.linalg.svd(DataSecondTwo - datamean)
    SecondLineAngle = vv[0]

    # Finding the angle between the vectors made up by the first and second line. Simply dot product as both lines are normalized (norm=1 for both)
    Angle = np.dot(FirstLineAngle,SecondLineAngle)
    Angles = np.append(Angles, Angle)

    # Singular value decomposition for first three points to get residuals with the fourth point
    DataFirstThree = data[0:3,:]
    datamean = DataFirstThree.mean(axis=0)
    uu, dd, vv = np.linalg.svd(DataFirstThree - datamean)

    linepts = vv[0] * np.mgrid[-2500:2500:2j][:, np.newaxis]
    linepts += datamean

    intersect = intersect_point_line(data[3], linepts[0], linepts[1])
    ResidualSize = abs(norm(data[3] - intersect[0]))

    FourthPointResiduals = np.append(FourthPointResiduals, ResidualSize)

    # Singular value decomposition for last three points to get residuals with the first point
    DataLastThree = data[1:4,:]
    datamean = DataLastThree.mean(axis=0)
    uu, dd, vv = np.linalg.svd(DataLastThree - datamean)

    linepts = vv[0] * np.mgrid[-2500:2500:2j][:, np.newaxis]
    linepts += datamean

    intersect = intersect_point_line(data[0], linepts[0], linepts[1])
    ResidualSize = abs(norm(data[0] - intersect[0]))

    FirstPointResiduals = np.append(FirstPointResiduals, ResidualSize)

 Evaluated 0 features out of a total of 54457
 Evaluated 10000 features out of a total of 54457
 Evaluated 20000 features out of a total of 54457
 Evaluated 30000 features out of a total of 54457
 Evaluated 40000 features out of a total of 54457
 Evaluated 50000 features out of a total of 54457


In [32]:
countertrain = 5
DataSet = train
i = DataSet.shape[0]-1

df = pd.DataFrame([],columns = ["PointResiduals", "Angles", "LineSlope1", "LineSlope2", "LineSlope3", "FirstPointResiduals", "FourthPointResiduals"])
df['PointResiduals'] = pd.Series(PointResiduals)
df['Angles'] = pd.Series(Angles)
df['LineSlope1'] = pd.Series(LineSlope1)
df['LineSlope2'] = pd.Series(LineSlope2)
df['LineSlope3'] = pd.Series(LineSlope3)
df['FirstPointResiduals'] = pd.Series(FirstPointResiduals)
df['FourthPointResiduals'] = pd.Series(FourthPointResiduals)

with open('./data/trainMoreFeatures.csv', 'a') as f:
    df.to_csv(f, header=False)

In [None]:

df = pd.DataFrame([])
PointResiduals = np.array([])
Angles = np.array([])
LineSlope1 = np.array([])
LineSlope2 = np.array([])
LineSlope3 = np.array([])
FourthPointResiduals = np.array([])
FirstPointResiduals = np.array([])

In [33]:
df

Unnamed: 0,PointResiduals,Angles,LineSlope1,LineSlope2,LineSlope3,FirstPointResiduals,FourthPointResiduals
0,246.806705,0.988891,-0.238813,0.087942,0.967075,104.887137,220.790997
1,181.185415,0.989089,0.002640,0.154513,0.987987,167.090617,135.569301
2,62.838605,0.999798,-0.137054,0.031446,0.990064,57.263219,51.634307
3,23.832940,0.999801,-0.098664,-0.012149,0.995047,19.497367,20.931289
4,33.412034,0.999820,-0.121419,0.061508,0.990694,9.669819,28.996228
5,34.771036,0.999820,-0.153995,0.020458,0.987860,9.563666,29.126648
6,26.440972,0.999997,-0.051151,0.138927,0.988981,11.923120,8.233750
7,53.100587,0.999467,0.007788,-0.020708,0.999755,48.867185,30.578751
8,39.209456,0.999295,0.153533,-0.069402,0.985703,32.003179,44.687656
9,12.498796,0.999997,0.041325,0.110299,0.993039,7.185044,2.864015


In [29]:
LineSlope1

array([], dtype=float64)