In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import os
from sklearn.model_selection import KFold, cross_val_score

In [3]:
def fix_col_name(col_split):
    """Build a tidied feature name from the second field of ``col_split``.

    Parameters
    ----------
    col_split : sequence of str
        Whitespace-split fields of one feature line; only ``col_split[1]``
        (the raw feature name) is read.

    Returns
    -------
    str
        The name with parentheses removed, commas turned into dashes,
        'Body'/'Mag' stripped and 'mean'/'std' re-cased to 'Mean'/'STD'.
    """
    # Applied strictly in this order: 'BodyBody' collapses to 'Body' before
    # every remaining 'Body' is removed.
    substitutions = (
        ('(', ''),
        (')', ''),
        (',', '-'),
        ('BodyBody', 'Body'),
        ('Body', ''),
        ('Mag', ''),
        ('mean', 'Mean'),
        ('std', 'STD'),
    )
    cleaned = col_split[1]
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [4]:
def get_data(col_list, data_dir=None):
    """Load the test and train splits and stack them (test rows first).

    Reads ``test/X_test.txt``, ``train/X_train.txt``, ``test/y_test.txt`` and
    ``train/y_train.txt`` as whitespace-delimited, header-less tables.

    Parameters
    ----------
    col_list : list of str
        Column names assigned to the feature (X) tables.
    data_dir : str, optional
        Directory containing the ``test`` and ``train`` folders.  Defaults to
        the current working directory, which is what this function always
        used before the parameter existed.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(xdata, ydata)``.  ``ydata`` has a single ``'activity'`` column.
        Each split keeps its own 0-based row index, so index labels repeat
        across the stacked frame; use positional indexing (``.iloc``).
    """
    base_dir = os.getcwd() if data_dir is None else data_dir

    def _read_split(prefix, split, names):
        # One reader for all four files; they differ only in prefix/split/names.
        path = os.path.normpath(
            os.path.join(base_dir, split, '{}_{}.txt'.format(prefix, split))
        )
        # sep=r'\s+' replaces the deprecated delim_whitespace=True.
        return pd.read_table(path, header=None, sep=r'\s+', names=names)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    xdata = pd.concat([
        _read_split('X', 'test', col_list),
        _read_split('X', 'train', col_list),
    ])
    ydata = pd.concat([
        _read_split('y', 'test', ['activity']),
        _read_split('y', 'train', ['activity']),
    ])

    return (xdata, ydata)

In [5]:
def trans_columns(feature_path='features.txt'):
    """Build the cleaned column-name list from the feature description file.

    Each non-blank line is split on whitespace; the first field is prepended
    to the name produced by ``fix_col_name`` from the second field.

    Parameters
    ----------
    feature_path : str, optional
        Path of the feature file.  Defaults to ``'features.txt'`` relative to
        the current working directory.

    Returns
    -------
    list of str
        One column name per non-blank line, in file order.  The same list is
        used for both the train and the test tables.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than two whitespace-separated fields.
    """
    col_list = []
    # Read-only: 'r+' would demand write permission for a file that is only read.
    with open(feature_path, 'r') as feature_file:
        for line in feature_file:  # one line describes one column
            col_split = line.split()
            if not col_split:
                # Skip blank lines (e.g. a trailing newline) instead of crashing.
                continue
            col_list.append(col_split[0] + fix_col_name(col_split))
    return col_list


In [6]:
def drop_columns(df, drop_list=('angle', 'band')):
    """Drop, in place, every column whose name contains a ``drop_list`` item.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune.  It is modified in place, as before.
    drop_list : iterable of str, optional
        Substrings that mark a column for removal.  Defaults to the
        ``'angle'`` and ``'band'`` markers this function always used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call sites that reassign the result.
    """
    drop_list = tuple(drop_list)
    # Collect every match first and drop once: one frame rebuild instead of
    # one per matching column.  dict.fromkeys de-duplicates labels in order,
    # so repeated column names no longer raise KeyError on a second drop.
    to_drop = [
        col for col in dict.fromkeys(df.columns)
        if any(item in str(col) for item in drop_list)
    ]
    if to_drop:
        df.drop(columns=to_drop, inplace=True)
    return df


In [7]:
def get_accuracy(preds, targetvalues):
    """Return the fraction of predictions that equal their target.

    Parameters
    ----------
    preds : array-like
        Predicted labels.
    targetvalues : array-like
        True labels, one per prediction.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels,
        in the range 0.0 to 1.0.

    Raises
    ------
    ValueError
        If the inputs are empty or do not hold the same number of labels.
    """
    # Flatten both sides so an (n, 1) column and an (n,) vector are compared
    # position by position rather than broadcast into an n x n matrix.
    predicted = np.asarray(preds).ravel()
    expected = np.asarray(targetvalues).ravel()
    if predicted.shape != expected.shape:
        raise ValueError(
            'preds and targetvalues must hold the same number of labels '
            '({} != {})'.format(predicted.size, expected.size)
        )
    total = predicted.size
    if total == 0:
        raise ValueError('cannot compute accuracy of empty inputs')
    # Equality instead of subtraction also works for non-numeric labels.
    correct = int(np.count_nonzero(predicted == expected))
    return float(correct) / total

In [8]:
# Run the helper functions to build the modelling tables.

# Build the cleaned column names (one per line of the feature file).
col_list = trans_columns()

# Load the test and train splits and stack them into one X frame and one y frame.
xdata, ydata = get_data(col_list)
# Remove the feature columns whose names contain 'angle' or 'band'.
xdata = drop_columns(xdata)


In [9]:
# Train and test a random forest with shuffled 4-fold cross-validation.
# The same seed drives both the fold split and the forest so that a
# Restart & Run All reproduces the printed accuracies.
SEED = 88

kf = KFold(n_splits=4, random_state=SEED, shuffle=True)
for train_index, test_index in kf.split(xdata):

    # A fresh classifier per fold, so no state leaks between folds.
    model = RandomForestClassifier(n_jobs=2, random_state=SEED)
    # ydata is a one-column frame; flatten it to the 1-D label vector fit() expects.
    model.fit(xdata.iloc[train_index], ydata.iloc[train_index].values.ravel())
    preds = model.predict(xdata.iloc[test_index])
    targetvalues = ydata.iloc[test_index].values.ravel()

    accuracy = get_accuracy(preds, targetvalues)

    print('Accuracy percentage is: {}'.format(accuracy))
    


Accuracy percentage is: 0.9650485436893204
Accuracy percentage is: 0.9658252427184466
Accuracy percentage is: 0.9669902912621359
Accuracy percentage is: 0.9673659673659674
