# AMP Feature Analysis For b278 VVV Tile

- **author:** JB Cabral (<jbc.develop@gmail.com>)

In [4]:
% matplotlib inline

import numpy as np

import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

from sklearn import feature_selection as fs
from sklearn import preprocessing as prp
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors

from libs import fourier_help

# Turn off pandas' chained-assignment check (the option behind
# SettingWithCopyWarning) for the whole session.
pd.set_option("mode.chained_assignment", None)

# Collapses the four labels onto two: -1 stays -1, while 1, 2 and 3 all map
# to 1.  Not referenced by any of the cells visible in this part of the notebook.
TWO_LABELS = {-1: -1}
TWO_LABELS.update(dict.fromkeys((1, 2, 3), 1))

## 1. Load Data

In [2]:
# Load the AMP feature table (path is relative to the notebook's directory).
df = pd.read_csv("data/amp.csv")

# Three columns are left out of the feature count -- presumably "vvv_id",
# "cls" and "scls", the same columns excluded from the feature matrix in
# section 2 (TODO confirm).
# A single pre-formatted argument makes print() emit the same text on
# Python 2 and Python 3; the old print statement is a SyntaxError on Python 3.
print("features: {}".format(len(df.columns) - 3))

features: 72


## 2. Remove all low-variance and "bad" (NaN-containing) features

In [5]:
# Drop every column that contains at least one NaN/null value.
df = df.loc[:, ~df.isnull().any()]

# Everything except the identifier/label columns is a candidate feature;
# the "scls" column is the classification target.
X_columns = df.columns[~df.columns.isin(["vvv_id", "cls", "scls"])]
y = df["scls"].values

# Low variance: VarianceThreshold with its default threshold (0.0) only
# discards features that take the same value in every sample.
vt = fs.VarianceThreshold()
vt.fit(df[X_columns].values, y)

# Keep the surviving columns and standardise them.
# NOTE(review): the scaler is fitted on all rows before the cross-validation
# in section 3, so each test fold contributes to the scaling statistics --
# confirm this is acceptable for the reported scores.
X_columns = X_columns[vt.get_support()]
X_scaled = prp.StandardScaler().fit_transform(df[X_columns].values)

# Single-argument print() gives identical output on Python 2 and Python 3.
print("total features: {}".format(len(X_columns)))

total features: 62


## 3. Machine Learning

In [6]:
def experiment(clf, x, y, nfolds=10):
    """Cross-validate ``clf`` and gather its out-of-fold results.

    The data are split with ``StratifiedKFold(n_splits=nfolds)``; for every
    split the classifier is re-fitted on the training part and evaluated on
    the held-out part.  The per-fold outputs are joined in fold order, a
    classification report is printed, and a ROC curve is computed from the
    pooled out-of-fold scores.

    Parameters
    ----------
    clf : estimator
        Object providing ``fit``, ``predict`` and ``predict_proba``.  It is
        fitted in place once per fold.
    x : array
        Feature matrix; must support indexing with integer index arrays.
    y : array
        Labels aligned with the rows of ``x``.
    nfolds : int, optional
        Number of stratified folds (default 10).

    Returns
    -------
    dict
        ``'fpr'``, ``'tpr'``, ``'thresh'`` -- output of ``metrics.roc_curve``;
        ``'roc_auc'`` -- area under that curve;
        ``'y_test'`` -- pooled true labels;
        ``'predictions'`` -- pooled predicted labels;
        ``'probabilities'`` -- pooled values of column 0 of ``predict_proba``;
        ``'confusion_matrix'`` -- confusion matrix of the pooled predictions.

    Notes
    -----
    The ROC score is ``1 - predict_proba[:, 0]``.  That equals the
    probability of the other class only when there are exactly two classes.
    """
    skf = StratifiedKFold(n_splits=nfolds)

    # Collect the per-fold pieces in lists and join them once at the end
    # instead of re-allocating the growing arrays on every iteration.
    fold_probs, fold_preds, fold_truth = [], [], []

    for train, test in skf.split(x, y):
        clf.fit(x[train], y[train])

        x_test = x[test]
        fold_preds.append(clf.predict(x_test))
        fold_probs.append(clf.predict_proba(x_test)[:, 0])
        fold_truth.append(y[test])

    def _stack(chunks):
        # The leading empty float array reproduces the dtype promotion of the
        # original incremental hstack (e.g. integer labels come back as
        # floats), so the returned arrays and the printed report keep their
        # previous form.
        return np.hstack([np.array([])] + chunks)

    probabilities = _stack(fold_probs)
    predictions = _stack(fold_preds)
    y_testing = _stack(fold_truth)

    # Function-call form with a single argument prints the same text on
    # Python 2 and Python 3 (the print statement is a SyntaxError on 3).
    print(metrics.classification_report(y_testing, predictions))

    fpr, tpr, thresholds = metrics.roc_curve(y_testing, 1. - probabilities)
    roc_auc = metrics.auc(fpr, tpr)
    return {'fpr': fpr,
            'tpr': tpr,
            'thresh': thresholds,
            'roc_auc': roc_auc,
            'y_test': y_testing,
            'predictions': predictions,
            'probabilities': probabilities,
            'confusion_matrix': metrics.confusion_matrix(y_testing, predictions),
            }

### 3.1. SVM - Linear

In [12]:
%time svc_linear = experiment(svm.SVC(kernel='linear', probability=True), X_scaled, y)

             precision    recall  f1-score   support

       -1.0       0.99      1.00      0.99     20000
        1.0       0.92      0.49      0.64       424

avg / total       0.99      0.99      0.99     20424

CPU times: user 4min 26s, sys: 2.45 s, total: 4min 28s
Wall time: 4min 30s


### 3.2. SVM - Polynomial

In [13]:
%time svc_poly = experiment(svm.SVC(kernel='poly', probability=True), X_scaled, y)

             precision    recall  f1-score   support

       -1.0       0.99      1.00      0.99     20000
        1.0       0.89      0.42      0.57       424

avg / total       0.99      0.99      0.98     20424

CPU times: user 2min 39s, sys: 1.61 s, total: 2min 41s
Wall time: 2min 41s


### 3.3. Random Forest

In [23]:
%time rf = experiment(RandomForestClassifier(n_estimators=500, criterion="entropy"), X_scaled, y)

             precision    recall  f1-score   support

       -1.0       0.99      1.00      1.00     20000
        1.0       0.97      0.54      0.70       424

avg / total       0.99      0.99      0.99     20424

CPU times: user 8min 45s, sys: 229 ms, total: 8min 45s
Wall time: 8min 45s


### 3.4. KNN

In [20]:
from sklearn.neighbors import KNeighborsClassifier

%time knn = experiment(KNeighborsClassifier(n_neighbors=3, weights='distance'), X_scaled, y)

             precision    recall  f1-score   support

       -1.0       0.99      1.00      0.99     20000
        1.0       0.83      0.45      0.58       424

avg / total       0.99      0.99      0.98     20424

CPU times: user 2min 31s, sys: 48 ms, total: 2min 31s
Wall time: 2min 32s
