In [212]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
import os

Since I am trying to predict a worm mutation based off the accelerations, I am making one large dataframe with a row for each worm, and the features would be each frame. This way I can compare one worm from start to finish, and be able to do a train test split on the worms.

In [None]:
files = os.listdir('../../data/frame_distances/')
files.remove('.DS_Store')

total = []
for file in files:
    path = '../../data/frame_distances/' + file
    csvs = os.listdir(path)
    for csv in csvs:
        df = pd.read_csv(path + '/' +csv)
        df.mask(df > 10, inplace=True)
        pct_null = df.isnull().sum() / len(df)
        missing_features = pct_null[pct_null > 0.05].index
        df.drop(missing_features, axis=1, inplace = True)
        df.fillna(method = 'ffill', inplace = True)
        df.rolling(15).mean()
        df = df.iloc[14:].copy()
        df.index = range(df.shape[0])
        df = df.T
        df['mutation'] = file
        total.append(df)

total_df = pd.concat(total)

In [122]:
total_df = total_df.loc[:,:'mutation']

In [133]:
X = total_df.drop(['mutation'], axis = 1)
y = total_df['mutation']

In [147]:
X = X.apply(lambda x: x.fillna(x.mean()))

0       0
1       0
2       0
3       0
4       0
       ..
1780    0
1781    0
1782    0
1783    0
1784    0
Length: 1785, dtype: int64

In [176]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 20, stratify = y)

Fitting the model using SVM for classification and a Gridsearch

In [204]:
svc = SVC()

grid_params = {
    'C' : np.linspace(.1,1,10),
    'degree' : [2,3,5],
}

gs = GridSearchCV(svc, grid_params, cv = 5)
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
                         'degree': [2, 3, 5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [178]:
gs.score(X_train, y_train)

0.8184931506849316

In [179]:
gs.score(X_test, y_test)

0.4864864864864865

In [180]:
gs.best_params_

{'C': 1.0, 'degree': 2}

In [None]:
gs.predict(X_test)

#### Predicting a mutated worm or not
Changing y variable to be 1 for worm and 0 for everything else

In [184]:
y_mut = np.where(y == 'worm', 1, 0)

In [188]:
X_train, X_test, y_mut_train, y_mut_test = train_test_split(X, y_mut, random_state = 20, test_size = .2, stratify = y)

In [189]:
gs.fit(X_train, y_mut_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
                         'degree': [2, 3, 5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [190]:
gs.score(X_test, y_mut_test)

0.7567567567567568

In [192]:
gs.score(X_train, y_mut_train)

0.7636986301369864

### Random Forest Classifier

In [198]:
rf = RandomForestClassifier()

param_grid = {
    'max_depth' : [5, 10],
    'max_features' : [1500, 900, 600],
    'n_estimators': [100, 150, 200]
}

rf_gs = GridSearchCV(rf, param_grid, cv = 5)

rf_gs.fit(X_train, y_mut_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [199]:
rf_gs.score(X_train, y_mut_train)

0.952054794520548

In [200]:
rf_gs.score(X_test, y_mut_test)

0.7567567567567568

In [202]:
y_mut_test

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1])

## Bagging Classifier

Since I am getting all mutations being predicted I figure I should use some bagging to help out the class imbalances

In [213]:
bag = BaggingClassifier()

In [215]:
bag.fit(X_train, y_mut_train)

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=10,
                  n_jobs=None, oob_score=False, random_state=None, verbose=0,
                  warm_start=False)

In [216]:
bag.score(X_test, y_mut_test)

0.7162162162162162

In [218]:
bag.score(X_train, y_mut_train)

0.9657534246575342

In [217]:
bag.predict(X_test)

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])