In [46]:
import pandas as pd

In [47]:
#loading in the original data

df_training_og = pd.read_csv('../data/archive/train.csv')
df_testing_og = pd.read_csv('../data/archive/test.csv')

# PCA - Principal Component Analysis

In [48]:
#Start my importing the necessary library
from sklearn.decomposition import PCA

In [49]:
pca = PCA(n_components = 3)

In [50]:
#Now we have to extract only the independent variables 
x_train_pca = df_training_og.iloc[:,:-2]
x_test_pca = df_testing_og.iloc[:,:-2]

In [51]:
#Now let's fit our features to our pca model on both training and test sets
train_components = pca.fit_transform(x_train_pca)
test_components = pca.fit_transform(x_test_pca)

In [52]:
#Let's look at our pca features for training set 
pca_train_df = pd.DataFrame(data = train_components, columns = ['principal component 1', 'principal component 2',
                                                    'principal component 3'] )
pca_train_df.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-5.52028,-0.290276,-1.529931
1,-5.53535,-0.08253,-1.924805
2,-5.474988,0.287387,-2.144642
3,-5.677232,0.897031,-2.018218
4,-5.748749,1.162951,-2.139531


In [53]:
#Now for the test set
pca_test_df = pd.DataFrame(data = test_components, columns = ['principal component 1', 'principal component 2',
                                                    'principal component 3'] )
pca_test_df.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-2.754984,-1.387992,0.129415
1,-4.399115,-1.256753,-0.480537
2,-5.066335,-0.616208,-1.428891
3,-5.186594,-0.900994,-1.470104
4,-5.080981,-1.593048,-1.103903


In [54]:
# Before using these features to train our model, we should turn our categorical 'activity' variable to numerical

# function to convert activity to numeric
def convert_activity(activity):
    act_dict = {
        "LAYING": 1,
        "STANDING": 2,
        "SITTING": 3,
        "WALKING": 4,
        "WALKING_UPSTAIRS": 5,
        "WALKING_DOWNSTAIRS": 6
    }
    return act_dict.get(activity)

df_training_og = df_training_og.assign(
    Activity = df_training_og.get("Activity").apply(convert_activity)
)

In [55]:
#Now let's do the same for testing
df_testing_og = df_testing_og.assign(
    Activity = df_testing_og.get("Activity").apply(convert_activity)
)

In [56]:
# add the activity labels to the pca dataframes

pca_train_df = pca_train_df.assign(
    Activity = df_training_og.get("Activity")
)
pca_test_df = pca_test_df.assign(
    Activity = df_testing_og.get("Activity")
)

In [57]:
pca_train_df.to_csv(r'archive/train_pca.csv', index = False)
pca_test_df.to_csv(r'archive/test_pca.csv', index = False)