In [1]:
# Import the libraries

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

### Loading the datasets

In [2]:
# Read csv files

df_features = pd.read_csv('train_features.csv', index_col=0)
df_targets = pd.read_csv('train_targets_scored.csv', index_col=0)

# Print the shapes
df_features.shape, df_targets.shape

((23814, 875), (23814, 206))

In [3]:
# Take a peek at features
df_features.head()

Unnamed: 0_level_0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,-1.022,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,0.2341,...,-0.4265,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,0.1715,...,-0.725,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931
id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,-1.959,...,-2.099,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154
id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,-0.28,...,0.0042,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125


**Plans**
1. Remove controls in both features and targets to reduce the background noise. Since all the control has no labels, it is easy to predict. 
2. Store the index.
3. Drop the categorical variables. 
4. Scale the continuous varibles by StandardScaler. 
5. Convert back to dataframe.

### Prepare the features for dimentionality reduction

#### 1. Remove the controls

In [4]:
# Create a mask for the treated
mask_trt = (df_features.cp_type == 'trt_cp')

# Cout the number of the treated
mask_trt.sum()

21948

In [5]:
# Remove the controls in features
df_features_trt = df_features[mask_trt]

# Print the shape of the new dataframe
df_features_trt.shape

(21948, 875)

In [6]:
# Remove the controls in targets
df_targets_trt = df_targets[mask_trt]

# Print the shape of the new targets df
df_targets_trt.shape

(21948, 206)

In [7]:
# Check if the index in both dataframes match
(df_features_trt.index == df_targets_trt.index).sum()

21948

#### 2. Store the index

In [8]:
# Store the trt index as a list
sig_id = df_features_trt.index.to_list()

# Print the size fo the list
len(sig_id) # Match 21948

21948

#### 3. Drop the categoricals

In [9]:
# Count the unique values in df_features_trt
df_features_trt.cp_type.value_counts() # Double check

trt_cp    21948
Name: cp_type, dtype: int64

In [10]:
# Drop the categricals
df_features_trt.drop(columns=['cp_type', 'cp_time', 'cp_dose'], inplace=True)

# Print the shape
df_features_trt.shape

(21948, 872)

#### 4. Scale the continuous

In [11]:
# Create the scaler

scaler = StandardScaler()
scaler

StandardScaler()

In [12]:
# Scale the rest of the features
X_scaled = scaler.fit_transform(df_features_trt)

# Print the data type of X_scaled
print(type(X_scaled))

# The length of the array should match the number of observations
print(len(X_scaled)) # 21948, matched

# The length of every member in the array shoulb math the number of the features
print(len(X_scaled[0])) # 872, matched

<class 'numpy.ndarray'>
21948
872


#### 5. Convert to dataframe

In [13]:
# Restore the index
df_X_scaled = pd.DataFrame(X_scaled, index=sig_id)

# Print the shape 
df_X_scaled.shape

(21948, 872)

In [14]:
# Take a peek at the scaled features
df_X_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,862,863,864,865,866,867,868,869,870,871
id_000644bb2,0.549598,0.795008,-0.401914,-0.735073,-0.27302,-0.734098,-1.187307,0.151066,0.441443,-0.178297,...,0.391971,0.365039,0.660978,0.514686,0.171632,0.631496,0.055513,0.363935,0.484324,0.531958
id_000779bfc,-0.14251,0.609207,0.126195,-0.021846,0.912782,0.561238,0.254855,0.476366,-0.091567,0.677683,...,0.04671,0.600043,0.497959,0.271068,0.39041,0.531001,0.314322,0.312714,0.604833,0.753503
id_000a6266a,0.245482,0.824935,1.337754,-0.165078,-0.114606,1.168297,0.182982,0.369311,0.136858,1.022054,...,-0.097895,-0.055827,0.56548,0.270746,-0.337366,0.049518,-0.059054,0.117185,-0.462432,0.722993
id_0015fd391,-0.554608,-0.211058,-0.419002,0.468577,3.886571,-0.562958,-2.263102,0.337379,0.059864,-1.057065,...,-0.763516,-0.062651,-2.454968,-0.373763,-0.130316,-0.529359,-0.519624,-0.2591,0.077845,-0.323019
id_001626bd3,-0.422591,-0.40035,0.773924,0.639345,1.302707,-0.575551,-0.335397,0.047969,-0.354989,0.68885,...,0.255359,0.244859,0.592924,0.752505,0.505707,0.058046,0.294193,0.406486,0.082611,0.736445


In [15]:
# Check the mean of for features
df_X_scaled.mean(axis=0)[:5] # Should be zero for all the features

0   -2.879761e-17
1   -1.084526e-17
2   -3.841873e-18
3    7.466235e-18
4   -1.919166e-17
dtype: float64

In [16]:
# Checkt the variance of the scaled dataset
df_X_scaled.var(axis=0)[:5] # Checked: should be 1 from begining to the end

0    1.000046
1    1.000046
2    1.000046
3    1.000046
4    1.000046
dtype: float64