In [1]:
## data manipulation
import numpy as np
import pandas as pd
from string import ascii_letters

## visualization
import seaborn as sns
import scipy.stats as ss
import matplotlib.pyplot as plt

## preprocessing
from sklearn.preprocessing import OneHotEncoder

## modelling
from sklearn.metrics import accuracy_score

In [2]:
train_raw = pd.read_csv('input/CSE7302c_train-1539668060821.csv', na_values='?')
train_raw.head()

Unnamed: 0,class,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet1pt,jet1eta,jet1phi,jet1b.tag,...,jet4phi,jet4b.tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb,id
0,1,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343,1
1,1,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118,2
2,0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904,3
3,0,1.595839,-0.607811,0.007075,1.81845,-0.111906,0.84755,-0.566437,1.581239,2.173076,...,-1.274345,3.101961,0.823761,0.938191,0.971758,0.789176,0.430553,0.961357,0.957818,4
4,1,0.409391,-1.884684,-1.027292,1.672452,-1.604598,1.338015,0.055427,0.013466,2.173076,...,1.37713,3.101961,0.869418,1.222083,1.000627,0.545045,0.698653,0.977314,0.828786,5


In [3]:
test_raw = pd.read_csv('input/CSE7302c_test-1539668060821.csv', na_values='?')
test_raw.head()

Unnamed: 0,class,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet1pt,jet1eta,jet1phi,jet1b.tag,...,jet4phi,jet4b.tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb,id
0,1,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487,68637
1,1,1.176566,0.104161,1.397002,0.479721,0.265513,1.135563,1.534831,-0.253291,0.0,...,0.530334,0.0,0.833175,0.773968,0.98575,1.103696,0.84914,0.937104,0.812364,68638
2,1,1.384098,0.116822,-1.179879,0.762913,-0.079782,1.019863,0.877318,1.276887,2.173076,...,0.504809,3.101961,0.959325,0.807376,1.191814,1.22121,0.861141,0.929341,0.838302,68639
3,1,1.33011,0.201557,1.173393,0.135309,-1.083159,0.728461,1.109032,-0.539903,1.086538,...,0.054775,3.101961,0.782073,0.672103,0.990002,0.734279,0.379074,0.76476,0.642924,68640
4,0,1.289848,-1.423023,-0.687162,0.131103,-1.135935,0.82135,0.296053,0.167582,2.173076,...,0.727328,0.0,1.006643,0.867914,0.99909,1.109608,1.125094,0.882899,0.859396,68641


In [4]:
train_raw.nunique()

class                           2
lepton_pT                   18514
lepton_eta                   8919
lepton_phi                  11806
missing_energy_magnitude    66854
missing_energy_phi          66804
jet1pt                      25954
jet1eta                      9422
jet1phi                     11713
jet1b.tag                       5
jet2pt                      22270
jet2eta                      9658
jet2phi                     11827
jet2b.tag                       5
jet3pt                      17882
jet3eta                     10089
jet3phi                     11753
jet3b.tag                       5
jet4pt                      13935
jet4eta                     10516
jet4phi                     11801
jet4b.tag                       5
m_jj                        65748
m_jjj                       57372
m_lv                        42247
m_jlv                       60935
m_bb                        62851
m_wbb                       63301
m_wwbb                      64483
id            

In [5]:
test_raw.nunique()

class                           2
lepton_pT                   13786
lepton_eta                   7895
lepton_phi                  10476
missing_energy_magnitude    29132
missing_energy_phi          29138
jet1pt                      17678
jet1eta                      7838
jet1phi                     10383
jet1b.tag                       5
jet2pt                      15784
jet2eta                      8145
jet2phi                     10507
jet2b.tag                       5
jet3pt                      13492
jet3eta                      8498
jet3phi                     10487
jet3b.tag                       5
jet4pt                      10920
jet4eta                      8978
jet4phi                      6233
jet4b.tag                       3
m_jj                        28628
m_jjj                       26074
m_lv                        20085
m_jlv                       27166
m_bb                        27697
m_wbb                       27977
m_wwbb                      28235
id            

In [7]:
## drop loast row since that has all the NA values
test_raw.drop(test_raw.index[[test_raw.shape[0]-1]], inplace=True)
test_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29413 entries, 0 to 29412
Data columns (total 30 columns):
class                       29413 non-null int64
lepton_pT                   29413 non-null float64
lepton_eta                  29413 non-null float64
lepton_phi                  29413 non-null float64
missing_energy_magnitude    29413 non-null float64
missing_energy_phi          29413 non-null float64
jet1pt                      29413 non-null float64
jet1eta                     29413 non-null float64
jet1phi                     29413 non-null float64
jet1b.tag                   29413 non-null float64
jet2pt                      29413 non-null float64
jet2eta                     29413 non-null float64
jet2phi                     29413 non-null float64
jet2b.tag                   29413 non-null float64
jet3pt                      29413 non-null float64
jet3eta                     29413 non-null float64
jet3phi                     29413 non-null float64
jet3b.tag                 

In [6]:
test_raw.isna().sum()

class                       0
lepton_pT                   0
lepton_eta                  0
lepton_phi                  0
missing_energy_magnitude    0
missing_energy_phi          0
jet1pt                      0
jet1eta                     0
jet1phi                     0
jet1b.tag                   0
jet2pt                      0
jet2eta                     0
jet2phi                     0
jet2b.tag                   0
jet3pt                      0
jet3eta                     0
jet3phi                     0
jet3b.tag                   0
jet4pt                      0
jet4eta                     0
jet4phi                     1
jet4b.tag                   1
m_jj                        1
m_jjj                       1
m_lv                        1
m_jlv                       1
m_bb                        1
m_wbb                       1
m_wwbb                      1
id                          0
dtype: int64

In [8]:
## we know that 
# train size:  68636  observations  30 attributes
# test size:   29414 observations 30 attributes

# We have 68636 and 29414 unique ids in train and test respectively and this nominal data can be removed.
# Let's also remove target variable
train_data = train_raw.drop(['id', 'class'], axis=1)
test_data = test_raw.drop(['id', 'class'], axis=1)

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(train_data, train_raw['class'], test_size=0.3, random_state=42)
X_test, y_test = test_data, test_raw['class'].astype('category')

In [10]:
print(f'shape of X_train: {X_train.shape}')
print(f'shape of X_valid: {X_valid.shape}')
print(f'shape of X_test: {X_test.shape}')

shape of X_train: (48045, 28)
shape of X_valid: (20591, 28)
shape of X_test: (29413, 28)


In [11]:
cat_cols = X_train.columns[X_train.nunique() <= 5]
print(cat_cols)
num_cols = X_train.columns[X_train.nunique() > 5]
num_cols

Index(['jet1b.tag', 'jet2b.tag', 'jet3b.tag', 'jet4b.tag'], dtype='object')


Index(['lepton_pT', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude',
       'missing_energy_phi', 'jet1pt', 'jet1eta', 'jet1phi', 'jet2pt',
       'jet2eta', 'jet2phi', 'jet3pt', 'jet3eta', 'jet3phi', 'jet4pt',
       'jet4eta', 'jet4phi', 'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb',
       'm_wwbb'],
      dtype='object')

In [12]:
## type casting
# train
X_train[cat_cols] = X_train[cat_cols].astype('category')
# validation
X_valid[cat_cols] = X_valid[cat_cols].astype('category')
# test
X_test[cat_cols] = X_test[cat_cols].astype('category')

In [13]:
## LabelEncoder
onehotencoder = OneHotEncoder(handle_unknown='ignore')

## Fit method
onehotencoder = onehotencoder.fit(X_train[cat_cols])

## Get names for new columns
ohe_cat_col_names = onehotencoder.get_feature_names(cat_cols)

## Print the above columns names
ohe_cat_col_names

array(['jet1b.tag_0.0', 'jet1b.tag_1.086538',
       'jet1b.tag_1.0865380759999999', 'jet1b.tag_2.173076',
       'jet1b.tag_2.1730761530000002', 'jet2b.tag_0.0',
       'jet2b.tag_1.107436', 'jet2b.tag_1.107436061',
       'jet2b.tag_2.214872', 'jet2b.tag_2.214872122', 'jet3b.tag_0.0',
       'jet3b.tag_1.2741120000000001', 'jet3b.tag_1.274112225',
       'jet3b.tag_2.5482240000000003', 'jet3b.tag_2.548224449',
       'jet4b.tag_0.0', 'jet4b.tag_1.5509806869999998',
       'jet4b.tag_1.550981', 'jet4b.tag_3.101961',
       'jet4b.tag_3.1019613739999996'], dtype=object)

In [14]:
## Encode X_train category columns 
X_train_cat_onehotencoded = onehotencoder.transform(X_train[cat_cols]).toarray()
X_train_cat_onehotencoded = pd.DataFrame(X_train_cat_onehotencoded, columns=ohe_cat_col_names)
print(f'shape of X_train_cat_onehotencoded: {X_train_cat_onehotencoded.shape}')
X_train_cat_onehotencoded.head()

shape of X_train_cat_onehotencoded: (48045, 20)


Unnamed: 0,jet1b.tag_0.0,jet1b.tag_1.086538,jet1b.tag_1.0865380759999999,jet1b.tag_2.173076,jet1b.tag_2.1730761530000002,jet2b.tag_0.0,jet2b.tag_1.107436,jet2b.tag_1.107436061,jet2b.tag_2.214872,jet2b.tag_2.214872122,jet3b.tag_0.0,jet3b.tag_1.2741120000000001,jet3b.tag_1.274112225,jet3b.tag_2.5482240000000003,jet3b.tag_2.548224449,jet4b.tag_0.0,jet4b.tag_1.5509806869999998,jet4b.tag_1.550981,jet4b.tag_3.101961,jet4b.tag_3.1019613739999996
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [15]:
## Encode X_valid category columns 
X_valid_cat_onehotencoded = onehotencoder.transform(X_valid[cat_cols]).toarray()
X_valid_cat_onehotencoded = pd.DataFrame(X_valid_cat_onehotencoded, columns=ohe_cat_col_names)
print(f'shape of X_valid_cat_onehotencoded: {X_valid_cat_onehotencoded.shape}')
X_valid_cat_onehotencoded.head()

shape of X_valid_cat_onehotencoded: (20591, 20)


Unnamed: 0,jet1b.tag_0.0,jet1b.tag_1.086538,jet1b.tag_1.0865380759999999,jet1b.tag_2.173076,jet1b.tag_2.1730761530000002,jet2b.tag_0.0,jet2b.tag_1.107436,jet2b.tag_1.107436061,jet2b.tag_2.214872,jet2b.tag_2.214872122,jet3b.tag_0.0,jet3b.tag_1.2741120000000001,jet3b.tag_1.274112225,jet3b.tag_2.5482240000000003,jet3b.tag_2.548224449,jet4b.tag_0.0,jet4b.tag_1.5509806869999998,jet4b.tag_1.550981,jet4b.tag_3.101961,jet4b.tag_3.1019613739999996
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [16]:
## Encode X_test category columns 
X_test_cat_onehotencoded = onehotencoder.transform(X_test[cat_cols]).toarray()
X_test_cat_onehotencoded = pd.DataFrame(X_test_cat_onehotencoded, columns=ohe_cat_col_names)
print(f'shape of X_test_cat_onehotencoded: {X_test_cat_onehotencoded.shape}')
X_test_cat_onehotencoded.head()

shape of X_test_cat_onehotencoded: (29413, 20)


Unnamed: 0,jet1b.tag_0.0,jet1b.tag_1.086538,jet1b.tag_1.0865380759999999,jet1b.tag_2.173076,jet1b.tag_2.1730761530000002,jet2b.tag_0.0,jet2b.tag_1.107436,jet2b.tag_1.107436061,jet2b.tag_2.214872,jet2b.tag_2.214872122,jet3b.tag_0.0,jet3b.tag_1.2741120000000001,jet3b.tag_1.274112225,jet3b.tag_2.5482240000000003,jet3b.tag_2.548224449,jet4b.tag_0.0,jet4b.tag_1.5509806869999998,jet4b.tag_1.550981,jet4b.tag_3.101961,jet4b.tag_3.1019613739999996
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [18]:
X_train = pd.concat([X_train[num_cols], X_train_cat_onehotencoded]], axis=1)
X_train.shape

SyntaxError: invalid syntax (<ipython-input-18-164549d3d1df>, line 1)

In [18]:
X_valid = pd.concat([X_valid_cat_onehotencoded, X_valid[num_cols]], axis=1)
X_valid.shape

(35054, 44)

In [19]:
X_test = pd.concat([X_test_cat_onehotencoded, X_test[num_cols]], axis=1)
X_test.shape

(29413, 44)