In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw MUSK table. The file is read with header=None, so pandas
# assigns integer column labels instead of taking them from the first row.
MUSK = pd.read_csv("MUSK_original.csv", header=None)
MUSK.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,159,160,161,162,163,164,165,166,167,168
0,MUSK-211,211_1+1,46,-108,-60,-69,-117,49,38,-161,...,-308,52,-7,39,126,156,-50,-112,96,1.0
1,MUSK-211,211_1+10,41,-188,-145,22,-117,-6,57,-171,...,-59,-2,52,103,136,169,-61,-136,79,1.0
2,MUSK-211,211_1+11,46,-194,-145,28,-117,73,57,-168,...,-134,-154,57,143,142,165,-67,-145,39,1.0
3,MUSK-211,211_1+12,41,-188,-145,22,-117,-7,57,-170,...,-60,-4,52,104,136,168,-60,-135,80,1.0
4,MUSK-211,211_1+13,41,-188,-145,22,-117,-7,57,-170,...,-60,-4,52,104,137,168,-60,-135,80,1.0


In [3]:
# Discard the two leading columns (the string identifiers visible in the
# preview above, e.g. 'MUSK-211' / '211_1+1') and renumber the remaining
# columns from 0.
MUSK = MUSK.drop(columns=MUSK.columns[:2])
MUSK.columns = range(len(MUSK.columns))
print(MUSK.shape)
MUSK.head()

(6598, 167)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,46,-108,-60,-69,-117,49,38,-161,-8,5,...,-308,52,-7,39,126,156,-50,-112,96,1.0
1,41,-188,-145,22,-117,-6,57,-171,-39,-100,...,-59,-2,52,103,136,169,-61,-136,79,1.0
2,46,-194,-145,28,-117,73,57,-168,-39,-22,...,-134,-154,57,143,142,165,-67,-145,39,1.0
3,41,-188,-145,22,-117,-7,57,-170,-39,-99,...,-60,-4,52,104,136,168,-60,-135,80,1.0
4,41,-188,-145,22,-117,-7,57,-170,-39,-99,...,-60,-4,52,104,137,168,-60,-135,80,1.0


In [4]:
import random

# Reserve 70% of the dataset for training and 30% for testing.
# A dedicated generator with a fixed seed makes the split (and every file
# derived from it) reproducible across kernel restarts.
ratio = 0.7
SEED = 42
n_rows = MUSK.shape[0]
train_index = random.Random(SEED).sample(range(n_rows), k=int(n_rows * ratio))

# train_index holds row *positions*: select them with iloc, and translate the
# same positions to index labels before dropping, so the two splits stay
# disjoint even if MUSK's index is not the default 0..n-1.
MUSK_training = MUSK.iloc[train_index, :]
MUSK_testing = MUSK.drop(index=MUSK.index[train_index])

# Every row must land in exactly one of the two splits.
assert len(MUSK_training) + len(MUSK_testing) == n_rows

print(MUSK_training.shape)
print(MUSK_testing.shape)

(4618, 167)
(1980, 167)


In [5]:
# Standardize the feature columns (everything except the last column).
# The scaler is fitted on the training split only; the testing split is then
# transformed with that same fitted scaler.
MUSK_training_y = MUSK_training.iloc[:, -1]
MUSK_testing_y = MUSK_testing.iloc[:, -1]

scaler = StandardScaler()
scaler.fit(MUSK_training.iloc[:, :-1])
MUSK_training_X = scaler.transform(MUSK_training.iloc[:, :-1])
MUSK_testing_X = scaler.transform(MUSK_testing.iloc[:, :-1])

In [6]:
# Wrap the scaled feature arrays in DataFrames and append the labels as a
# 'label' column. `.values` assigns the labels by position: the label Series
# still carry the row index of the split they were taken from, while the new
# frames get a fresh default index, so index alignment must be bypassed.

# Training split
MUSK_training_X = pd.DataFrame(MUSK_training_X)
MUSK_training_X["label"] = MUSK_training_y.values
MUSK_training = MUSK_training_X

# Testing split
MUSK_testing_X = pd.DataFrame(MUSK_testing_X)
MUSK_testing_X["label"] = MUSK_testing_y.values
MUSK_testing = MUSK_testing_X

In [7]:
# Preview the training frame built above: scaled feature columns plus 'label'.
MUSK_training
# To inspect the testing split instead: MUSK_testing.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,label
0,-0.340611,1.965448,1.312313,-0.987189,-0.181523,0.850239,-1.332595,0.714894,-1.188694,0.843800,...,0.400690,0.718805,1.142855,0.110872,-0.709369,0.231177,-0.406265,-0.683565,-0.722930,0.0
1,3.371691,-0.882461,-0.554242,-0.774977,-0.212008,-0.082077,0.344835,-1.037973,0.604456,0.287687,...,1.003935,-0.443456,-1.548580,-0.803181,0.467471,3.103148,-3.970749,-1.414804,0.805539,0.0
2,-0.491364,-0.838647,-0.510150,1.771576,-0.181523,-1.026823,1.978808,0.115813,0.502255,-0.185009,...,1.254513,-0.788994,-1.491619,-0.008352,-0.861666,1.260438,-0.370077,-1.574347,-0.628709,0.0
3,-0.227546,-0.816740,-0.995160,0.535749,0.153811,1.024271,0.562118,0.027060,0.381473,-0.254523,...,1.245233,-0.836112,0.416595,0.885830,0.218258,-0.150646,-0.442452,-0.364479,-0.314640,0.0
4,-0.434832,-0.214298,-0.877582,1.247285,-0.212008,-0.243678,0.622958,-0.860468,0.511546,-2.006278,...,-1.186309,-1.008881,0.046345,-0.554797,-0.072491,-0.017838,-0.116764,-0.032098,0.805539,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4613,1.562650,-0.827693,-1.053949,-0.862358,-0.212008,0.440020,0.614266,-1.870030,0.427928,0.718675,...,-1.084221,1.040783,1.057413,1.432275,2.350414,-1.013897,-0.225327,0.419940,0.062242,0.0
4614,0.073961,-0.608624,1.473983,-0.987189,-0.044341,0.763223,-0.993633,0.792553,-1.374512,0.815994,...,-1.019257,0.930839,0.459316,-0.405766,-0.390930,-0.200448,0.172738,0.380055,1.046325,0.0
4615,-1.056690,-0.794833,-0.760004,0.373468,-0.212008,-1.698091,0.553427,0.060342,1.951640,1.747483,...,-0.230398,-1.016734,-1.477378,-0.604473,-1.443163,-1.013897,-0.080576,0.539598,0.323967,0.0
4616,1.751092,-0.247158,1.547470,-0.650146,6.037397,0.228695,-1.193534,0.670518,-1.235148,0.315493,...,-0.016942,1.087902,0.032104,1.084537,1.464323,-0.383059,-0.351983,-0.178346,-0.681054,0.0


In [8]:
# Persist both splits as CSV files; index=False leaves the row index out.
for split, csv_path in (
    (MUSK_training, "MUSK_training.csv"),
    (MUSK_testing, "MUSK_testing.csv"),
):
    split.to_csv(csv_path, index=False)