In [6]:
from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score
import pandas as pd
import random
# Seed the stdlib RNG so the randomly generated individuals are repeatable.
random.seed(42)
# Let pandas render up to 999 rows before truncating displayed output.
pd.set_option('display.max_rows', 999)

In [20]:
def parse_df_via_individual(individual):
    '''Select the feature columns that ``individual`` switches on.

    The individual is a sequence of 0/1 flags, one per column of the
    module-level ``X_train``. A truthy flag keeps the column at that
    position, a falsy flag drops it. The same positional mask is applied
    to the module-level ``X_test``. The module-level ``y_train`` and
    ``y_test`` are passed through unchanged.

    Args:
        individual (list(int)): One flag (1 or 0) per feature column.

    Returns:
        tuple: ``(X_train_parsed, y_train, X_test_parsed, y_test)`` where
        the two ``X`` entries are DataFrames restricted to the selected
        columns and the two ``y`` entries are the untouched globals.
    '''
    # Convert to real booleans: a list of raw 0/1 ints would be read as
    # column positions rather than as a keep/drop mask.
    column_mask = list(map(bool, individual))

    # X_train, X_test, y_train and y_test are only read here, so no
    # ``global`` declarations are required.
    X_train_parsed = X_train[X_train.columns[column_mask]]
    X_test_parsed = X_test[X_test.columns[column_mask]]

    return (X_train_parsed, y_train, X_test_parsed, y_test)


def calculate_correlation(arr):
    '''Add up every entry of the column-by-column correlation matrix.

    ``arr`` is wrapped in a pandas DataFrame, the pairwise correlation
    between its columns is computed with ``DataFrame.corr`` and all the
    cells of the resulting matrix are summed into one number.

    Args:
        arr: Tabular data accepted by the ``pd.DataFrame`` constructor,
             e.g. the 'X_train' produced by train_test_split.

    Returns:
        float: The grand total of the correlation matrix.

    '''
    corr_matrix = pd.DataFrame(arr).corr()
    # First pass collapses the matrix to one total per column,
    # second pass collapses those totals to a scalar.
    per_column_totals = corr_matrix.sum()
    return per_column_totals.sum()


def load_data(files=('BRAF_train_moe_class.csv', 'BRAF_test_moe_class.csv'), drop=True):
    '''Read the train and test CSVs and merge them into one float DataFrame.

    The first two entries of ``files`` are read with ``pd.read_csv``,
    concatenated row-wise keeping only the columns both files share,
    stripped of any row containing a missing value, and cast to float.
    Index labels from the two files are kept as read (they are not
    renumbered), so labels may repeat in the result.

    Args:
        files (sequence(str))(optional): Paths of the two CSV files, train
            first and test second. Any list or tuple works; the default is
            a tuple so no mutable object is shared between calls.
        drop (bool): When True, the 'PUBCHEM_COORDINATE_TYPE' column is
            removed from both frames before merging. When False the column
            is kept and must itself be convertible to float, otherwise the
            final cast fails.

    Returns:
        pd.DataFrame: The merged, NaN-free, all-float DataFrame.
    '''
    # Only the first two paths are used: (train, test).
    frames = [pd.read_csv(files[0]), pd.read_csv(files[1])]

    if drop:
        frames = [frame.drop('PUBCHEM_COORDINATE_TYPE', axis=1) for frame in frames]

    # join='inner' keeps just the columns present in both files.
    df = pd.concat(objs=frames, join='inner')
    df = df.dropna(axis=0)
    df = df.astype(float)

    return df


def generate_individual(X_train):
    '''Build a random 0/1 mask with one entry per column of ``X_train``.

    Each entry is drawn independently with ``random.randint(0, 1)``, so
    the result depends on the state of the stdlib ``random`` generator.

    Args:
        X_train (pd.DataFrame): Frame whose column count fixes the length
                                of the individual.

    Returns:
        list(int): A list of 1s and 0s, ``len(X_train.columns)`` long.

    Example:
        >>> generate_individual(X_train)
        [ 1, 0, 1, 1, 1, ...]'''
    n_features = len(X_train.columns)
    genes = []
    for _ in range(n_features):
        genes.append(random.randint(0, 1))
    return genes


def fitness(individual):
    """Score a feature mask as SVM accuracy times total feature correlation.

    The individual is a list of 0/1 flags, one per feature column; it is
    handed to ``parse_df_via_individual`` to obtain the reduced train and
    test frames together with their labels. The selected features are
    standardised, an ``svm.SVC`` with default settings is fitted on the
    train part and scored on the test part, and that accuracy is
    multiplied by ``calculate_correlation`` of the selected train columns.

    Args:
        individual (list(int)): One flag (1 or 0) per feature column.

    Returns:
        float: ``accuracy * total_correlation`` for the selected columns,
        or 1.0 when the individual selects no columns at all.

    Example:
        >>> individual = [random.randint(0, 1) for x in range(len(X_train.columns))]
        >>> fitness(individual)
        2745.19

    """
    X_train_in, y_train_in, X_test_in, y_test_in = parse_df_via_individual(individual)

    # Guard on the *selected* columns (not the full global frame): an
    # all-zero individual leaves nothing to scale or fit.
    if len(X_train_in.columns) == 0:
        return 1.0

    # NOTE(review): train and test are standardised by two separate
    # scale() calls, so each uses its own mean/variance - confirm this is
    # intended rather than reusing the train statistics for the test set.
    X_train_scaled, X_test_scaled = scale(X_train_in), scale(X_test_in)

    clf = svm.SVC()
    clf.fit(X_train_scaled, y_train_in)
    y_pred = clf.predict(X_test_scaled)

    acc = accuracy_score(y_true=y_test_in, y_pred=y_pred)
    total_correlation = calculate_correlation(X_train_in)

    return acc * total_correlation

In [21]:
# Combined train+test table; 'class' is the label, everything else a feature.
df = load_data()
X = df.drop(columns=['class'])
y = df['class']

# Hold out 33% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Score 200 randomly generated feature masks and collect their fitness values.
result = []
for _ in range(200):
    individual = generate_individual(X_train)
    resulting_calculation = fitness(individual)
    result.append(resulting_calculation)

















In [22]:
result

[1740.5209479637408,
 2745.1945516243577,
 2204.9837283691495,
 2579.735211624427,
 3283.169080043503,
 3206.813051024101,
 2005.1983949107491,
 2666.4377892392554,
 2087.8052986586567,
 2157.859510972201,
 1707.53661740655,
 2476.8420378208593,
 2248.219727601104,
 2518.8420558283847,
 2649.6645262621328,
 2089.982605463907,
 2710.4849719566737,
 2418.892382738026,
 2249.3595562739183,
 3391.9023619783043,
 2969.856571927635,
 3209.2784481706503,
 2141.926765042285,
 2738.5160448721954,
 2770.6186001291826,
 3141.5167434637333,
 2805.6176833734553,
 2740.385194997804,
 2875.9133213759637,
 2798.576140749969,
 2223.812883771019,
 1890.111413403158,
 2724.9471637180704,
 1876.0624087312253,
 3008.915609929697,
 3402.507439152365,
 3607.934079126031,
 2497.084907843545,
 1924.4278638498267,
 2361.9971417631623,
 2559.844511962996,
 2244.5724846330813,
 3079.6141302118067,
 1566.1892006619214,
 2812.4699570073453,
 2336.202840456551,
 2130.96124368397,
 2410.591706749581,
 3261.4799512270