# ------------- User's settings -------------
This is the only section you need to make inputs

In [1]:
# Output from Step 2 is needed as input here
train_data_CSV = 'Cells.csv'

#--- If you have other files to process similarly to train_data ---#
test_data_CSV = 'Cells.csv'

# ------------- Executable -------------
Execute the rest of the script

------------- Logistics -------------

In [2]:
import os, sys
import numpy
import pandas

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import scale, LabelEncoder
from sklearn.feature_selection import VarianceThreshold, SelectFromModel


#------------- Custom functions -------------#
def variance_threshold_select(df, thresh=0.0, na_replacement=-999):
    df1 = df.copy(deep=True) # Make a deep copy of the dataframe
    selector = VarianceThreshold(thresh)
    selector.fit(df1.fillna(na_replacement)) # Fill NA values as VarianceThreshold cannot deal with those
    df2 = df.loc[:,selector.get_support(indices=False)] # Get new dataframe with columns deleted that have NA values
    return df2

            
def save_metadata(file,labels,ImageNumber,x_cords,y_cords):
    with open(file, 'w') as f:
        f.write('Label\tImageNumber\tX_cordinate\tY_cordinate\n')
        for i in range(len(labels)):
            f.write('{}\t{}\t{}\t{}\n'.format( labels[i], ImageNumber[i], x_cords[i], y_cords[i]))    

------------- Data loading and preprocessing -------------

In [3]:
#----- Define unwanted parameters -----#
not_wanted = [ "Number", "Location", "Center", "Metadata" ]

objects = pandas.read_csv(train_data_CSV, header=0)
variables_object = [x for x in objects.columns.values if numpy.all([not z in x for z in not_wanted])]

objects_train = objects.loc[:,variables_object + ['Metadata_Label'] + ['ImageNumber'] + ['Location_Center_X'] + ['Location_Center_Y']]
print('Original data has shape (rows, columns)              : ', objects_train.shape)

#----- Remove any row that has NA -----#
objects_train = objects_train.dropna(axis = 'columns')
print('After removing NA rows, data has shape               : ', objects_train.shape)


#----- Create ground-truth label and book-keeping records for each object -----#
ground_truth = list(objects_train.Metadata_Label)
ImageNumber = list(objects_train.ImageNumber)
X_cords = list(objects_train.Location_Center_X)
Y_cords = list(objects_train.Location_Center_Y)

# Save labels, to be used as "metadata" on http://projector.tensorflow.org
save_metadata('ground_truth_labels.tsv', ground_truth, ImageNumber, X_cords, Y_cords)


#----- Remove all zero-variance features -----#
# i.e. features that have the same value in all samples.
objects_train = variance_threshold_select(objects_train.loc[:,variables_object])
print('After removing zero-variance features, data has shape: ', objects_train.shape)


#----- Logistic for training data -----#
le = LabelEncoder()
le.fit(ground_truth)
numeric_labels = le.fit_transform(ground_truth)

Original data has shape (rows, columns)              :  (33467, 638)
After removing NA rows, data has shape               :  (33467, 629)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


After removing zero-variance features, data has shape:  (33467, 625)


----- Tree-based feature selection -----

When the dataset has two (or more) correlated features, 
then any of these correlated features can be used as the predictor.
For the computer, there's no concrete preference of one over the others. 

It makes sense to remove features that are mostly duplicated by other features (redundancy)
Tree-based feature selection will help us to (randomly) 
keep only one of them, and remove others.

This is not an issue when we want to use feature selection to reduce overfitting.
But when interpreting the data, it can lead to the incorrect conclusion that 
one of the variables is a strong predictor while the others in the same group are unimportant.

Read [more](http://blog.datadive.net/selecting-good-features-part-iii-random-forests/)

In [4]:
#----- Tree-based feature selection -----#

clf = RandomForestRegressor(n_estimators=20, max_features=2)
clf = clf.fit(objects_train, numeric_labels)
clf.feature_importances_

df1 = objects_train.copy(deep=True) # Make a deep copy of the dataframe
selector = SelectFromModel(clf, prefit=True)
selector.transform(df1)

data_train = objects_train.loc[:,selector.get_support(indices=False)] 
print('After feature selection, data has shape              : ', data_train.shape)

# List of all the feature names
selected_features_names =list(data_train.columns.values)

data_train.to_csv('after_feature_selection_data.csv') 

#----- To be used as main data on http://projector.tensorflow.org -----#
numpy.savetxt( 'after_feature_selection_scaled_data.txt', scale(data_train), delimiter='\t')

After feature selection, data has shape              :  (33467, 187)


------------- Preprocessing testing data accordingly -------------

In [None]:
#----- Load testing data -----#
objects_test = pandas.read_csv(test_data_CSV, header=0)
print('Original test data has shape (rows, columns)                      : ', objects_test.shape)

#----- Apply feature selection rules learned from training data on test data -----#
selected_var_objects_test = objects_test.loc[:,selected_features_names + ['Metadata_Label'] + ['ImageNumber'] + ['Location_Center_X'] + ['Location_Center_Y']]

#----- Remove any row that has NA -----#
selected_var_objects_test = selected_var_objects_test.dropna()

#----- Create labels for test data -----#
test_labels = list(selected_var_objects_test.Metadata_Label)
test_ImageNumber = list(selected_var_objects_test.ImageNumber)
test_X_cords = list(selected_var_objects_test.Location_Center_X)
test_Y_cords = list(selected_var_objects_test.Location_Center_Y)


# Save labels, to be used as "metadata" on http://projector.tensorflow.org
save_metadata('test_labels.tsv', test_labels, test_ImageNumber, test_X_cords, test_Y_cords)


data_test = selected_var_objects_test.loc[:,selected_features_names]
print('After removing NA rows and feature selection, test data has shape : ', data_test.shape)


data_test.to_csv( 'after_feature_selection_testdata.csv' )
#----- To be used as main data on http://projector.tensorflow.org -----#
numpy.savetxt( 'after_feature_selection_scaled_testdata.txt', scale(data_test), delimiter='\t')