In [41]:
import random

import pandas as pd
from sklearn import model_selection
from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, scale
# Fix the stdlib RNG so the randomly generated individuals are repeatable.
random.seed(42)
# Let pandas print up to 999 rows before truncating displayed output.
pd.set_option('display.max_rows', 999)

In [42]:
def parse_df_via_individual(individual, train_set, test_set):
    '''Restrict the train and test feature frames to the columns an
    individual selects.

    The individual holds one entry per feature column. A truthy entry (1)
    keeps the column at that position, a falsy entry (0) drops it. The same
    mask is applied to both feature frames; the targets are passed through
    untouched.

    Args:
        individual (list(int)): One 0/1 flag per feature column.
        train_set (tuple): The ``(X_train, y_train)`` pair, where X_train is
                           a pd.DataFrame of features.
        test_set (tuple): The ``(X_test, y_test)`` pair, where X_test is a
                          pd.DataFrame with the same column layout as X_train.

    Returns:
        (pd.DataFrame, y_train, pd.DataFrame, y_test): The column-filtered
        X_train and X_test together with the unchanged targets, in the order
        ``(X_train, y_train, X_test, y_test)``.
    '''
    column_mask = [bool(flag) for flag in individual]

    X_train, y_train = train_set
    X_test, y_test = test_set

    # Select by position with a boolean mask rather than by label, so a
    # frame with repeated column names still yields exactly the flagged
    # columns.
    X_train = X_train.loc[:, column_mask]
    X_test = X_test.loc[:, column_mask]

    return (X_train, y_train, X_test, y_test)


def calculate_correlation(arr):
    '''Collapse the pairwise column correlations of a feature matrix into
    one number.

    The input is wrapped in a pandas DataFrame, its column-by-column
    correlation matrix is computed with ``DataFrame.corr``, and every entry
    of that matrix (diagonal included) is added up.

    Args:
        arr (array-like): Two-dimensional feature data, e.g. the 'X_train'
                          produced by train_test_split.

    Returns:
        float: The grand total of the correlation matrix.

    '''
    pairwise = pd.DataFrame(arr).corr()

    # First reduce each column of the matrix, then reduce those totals.
    per_column_totals = pairwise.sum()
    return per_column_totals.sum()


def load_data(files=('BRAF_train_moe_class.csv', 'BRAF_test_moe_class.csv'), drop=True):
    '''Load the train and test CSV files and merge them into one
    numeric DataFrame.

    The first two entries of ``files`` are read, optionally stripped of
    the 'PUBCHEM_COORDINATE_TYPE' column, and concatenated keeping only the
    columns both files share. Rows containing any missing value are then
    removed and every remaining column is converted to float.

    Args:
        files (sequence(str))(optional): Paths of the two csv files to read;
                     only ``files[0]`` and ``files[1]`` are used.
        drop (bool): Whether to drop the 'PUBCHEM_COORDINATE_TYPE' column
                     from each file before concatenating. When False the
                     column is kept, so its values must be convertible to
                     float or the final conversion will raise.

    Returns:
        pd.DataFrame: The combined, NaN-free, float-typed DataFrame. The
                      original row labels of each file are kept, so index
                      values may repeat.
    '''
    dropped_column = 'PUBCHEM_COORDINATE_TYPE'

    frames = []
    for path in (files[0], files[1]):
        frame = pd.read_csv(path)
        if drop:
            frame = frame.drop(columns=dropped_column)
        frames.append(frame)

    # join='inner' keeps only the columns present in both files.
    df = pd.concat(objs=frames, join='inner')
    df = df.dropna(axis=0)
    df = df.astype(float)

    return df


def generate_individual(X_train):
    '''Build a random individual for the given feature frame.

    One gene is drawn per column of X_train; each gene is the integer 0 or
    1 taken from the module-level ``random`` generator, so the outcome
    depends on the current seed/state of that generator.

    Args:
        X_train (pd.DataFrame): The feature frame whose column count
                                determines the length of the individual.

    Returns:
        list(int): A list with one 0/1 entry per column of X_train.

    Example:
        >>> generate_individual(X_train)
        [ 1, 0, 1, 1, 1, ...]'''
    n_features = len(X_train.columns)

    genes = []
    for _ in range(n_features):
        genes.append(random.randint(0, 1))
    return genes


def fitness(individual, train_set, test_set):
    """Score an individual by how well its selected columns predict the
    test set.

    The individual is a list with one 0/1 entry per feature column; only
    the columns flagged with 1 are kept. The kept columns are standardised,
    an SVC is fitted on the train split and used to predict the test split.
    The score is the mean squared error of those predictions multiplied by
    the summed pairwise correlation of the scaled training columns.

    Args:
        individual (list(int)): The specific individual thats a list comprised of 1s or 0s
        train_set (tuple): The ``(X_train, y_train)`` pair used to fit the model
        test_set (tuple): The ``(X_test, y_test)`` pair used to evaluate the model

    Returns:
        float: ``error * total_correlation`` for the selected columns, or
               1.0 when the individual selects no columns at all.

    Example:
        >>> individual = [random.randint(0, 1) for x in range(len(X_train.columns))]
        >>> fitness(individual, (X_train, y_train), (X_test, y_test))
        0.30

    """

    X_train, y_train, X_test, y_test = parse_df_via_individual(individual, train_set, test_set)

    # Nothing to fit on: return the fixed score used for an empty selection.
    if len(X_train.columns) == 0:
        return 1.0

    # Learn the scaling statistics from the training split only and apply
    # them to both splits. Scaling the test split with its own mean/std
    # would put it in a different coordinate system from the one the model
    # was trained in.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    clf = svm.SVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    error = mean_squared_error(y_true=y_test, y_pred=y_pred)
    total_correlation = calculate_correlation(X_train)

    return error * total_correlation

In [43]:
# Load the combined dataset and split it into the 'class' target and the
# remaining feature columns.
df = load_data()
y = df['class']
X = df.drop(['class'], axis=1)

# Hold out 33% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Draw 200 random individuals (column masks) and collect the fitness score
# of each one in `result`, in the order they were generated.
result = []
for _ in range(200):
    individual = generate_individual(X_train)
    resulting_calculation = fitness(individual, (X_train, y_train), (X_test, y_test))
    result.append(resulting_calculation)