In [157]:
from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn import model_selection
import pandas as pd
import random
# Seed Python's global RNG so the randomly generated individuals are
# repeatable when the notebook is run top to bottom on a fresh kernel.
random.seed(42)

# Raise the pandas row-display limit so long frames are not truncated.
pd.set_option("display.max_rows", 999)

In [396]:
def fitness(individual, train_set, test_set):
    """Score a feature-selection mask by the test error of an SVM trained on it.

    ``individual`` is a list of 0s and 1s with one entry per feature
    column: a 1 keeps the column at that position, a 0 drops it. The same
    positional mask is applied to the train and the test features, an
    ``svm.SVC(gamma='scale')`` is fitted on the reduced training features,
    and the mean squared error of its predictions on the reduced test
    features is returned.

    Args:
        individual (list(int)): Mask of 1s and 0s. Its length must match
            the number of columns of both feature frames.
        train_set (tuple): ``(X_train, y_train)`` - the feature
            pd.DataFrame used for fitting and its labels.
        test_set (tuple): ``(X_test, y_test)`` - the feature pd.DataFrame
            used for prediction and its labels.

    Returns:
        float: Mean squared error between ``y_test`` and the predicted
        labels. When the labels are encoded as 0/1 this is the same as the
        misclassification rate. ``1.0`` is returned without fitting a model
        when the mask selects no columns at all.

    Example:
        >>> individual = [1, 0, 1]  # keep the 1st and 3rd feature columns
        >>> fitness(individual, (X_train, y_train), (X_test, y_test))  # doctest: +SKIP
    """
    # Boolean mask used to index the column Index positionally.
    mask = [bool(gene) for gene in individual]

    X_train, y_train = train_set
    X_test, y_test = test_set

    X_train = X_train[X_train.columns[mask]]
    X_test = X_test[X_test.columns[mask]]

    # An all-zero mask leaves nothing to train on; report the worst score
    # instead of letting the classifier fail on an empty feature matrix.
    if len(X_train.columns) == 0:
        return 1.0

    clf = svm.SVC(gamma='scale')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    return mean_squared_error(y_true=y_test, y_pred=y_pred)

In [401]:
# Load both splits, discard the PUBCHEM_COORDINATE_TYPE column and remove
# every row that still contains a missing value.
train_set = (
    pd.read_csv('BRAF_train_moe_class.csv')
    .drop('PUBCHEM_COORDINATE_TYPE', axis=1)
    .dropna(axis=0)
)
test_set = (
    pd.read_csv('BRAF_test_moe_class.csv')
    .drop('PUBCHEM_COORDINATE_TYPE', axis=1)
    .dropna(axis=0)
)

# The 'class' column is the target; every remaining column is a feature.
X_train = train_set.drop(columns=['class'])
y_train = train_set['class']

X_test = test_set.drop(columns=['class'])
y_test = test_set['class']

# Random individual: one 0/1 gene per feature column of the training set.
individual = [random.randint(0, 1) for _ in range(X_train.shape[1])]

# Bare last expression so the notebook displays the resulting error.
fitness(individual, train_set=(X_train, y_train), test_set=(X_test, y_test))

0.26666666666666666

0.1