In [2]:
%matplotlib inline
import os
import glob
import numpy as np
from scipy import io
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Read the training table from disk.
data = pd.read_csv('train.csv')

# The two target columns go into y_df; every remaining column stays in X_df.
y_df = data[['molecule', 'concentration']]
X_df = data.drop(['molecule', 'concentration'], axis=1)

# Each 'spectra' entry is a string: drop its first and last character
# (presumably the enclosing brackets), split on commas and convert the pieces
# to floats, producing one numeric row per sample.
spectra = np.array([
    np.array(entry[1:-1].split(',')).astype(float)
    for entry in X_df['spectra'].values
])

# Replace the string column with the parsed values (one Python list per row).
X_df['spectra'] = spectra.tolist()

In [4]:
# Target for classification
molecule = y_df['molecule'].values
# Target for regression
concentration = y_df['concentration'].values
# "Raw" features
X = spectra

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator

In [12]:
class FeatureExtractorClf(object):
    """Stateless feature extractor that exposes the raw spectra as a 2-D array.

    It follows the fit/transform protocol so it can be swapped for a more
    elaborate extractor without touching the calling code.
    """

    def __init__(self):
        pass

    def fit(self, X_df, y_df=None):
        """Nothing is learned from the data; present for API compatibility.

        Parameters
        ----------
        X_df : DataFrame with a 'spectra' column (unused here).
        y_df : targets (unused here, optional).

        Returns
        -------
        self, so that ``FeatureExtractorClf().fit(X).transform(X)`` works.
        """
        return self

    def transform(self, X_df):
        """Stack the per-row 'spectra' sequences into one numpy array.

        Parameters
        ----------
        X_df : DataFrame whose 'spectra' column holds one numeric sequence
            per row.

        Returns
        -------
        numpy array with one row per row of ``X_df``.
        """
        # Here we just pick the spectra feature, one array per sample.
        return np.array([np.asarray(row) for row in X_df['spectra']])

In [26]:
# Molecule labels used elsewhere in the notebook to decode predict_proba columns.
labels = np.array(['A', 'B', 'Q', 'R'])


class Classifier(BaseEstimator):
    """PCA followed by a random forest, wrapped as a scikit-learn estimator.

    Parameters
    ----------
    max_features : passed to ``RandomForestClassifier``; the legacy value
        ``"auto"`` is translated to its classifier equivalent ``"sqrt"``.
    max_depth : passed to ``RandomForestClassifier``.
    n_components : number of components kept by ``PCA``.

    The forest always uses ``n_estimators = 300`` and ``random_state=42``.
    """

    def __init__(self, max_features="auto", max_depth=None, n_components=10):
        self.n_components = n_components
        self.n_estimators = 300
        self.max_depth = max_depth
        self.max_features = max_features
        # Kept so that ``Classifier().clf`` exists right after construction;
        # fit() rebuilds it from the current parameter values.
        self.clf = self._build_pipeline()

    def _build_pipeline(self):
        """Create a fresh, unfitted PCA + random-forest pipeline from the current parameters."""
        # "auto" means "sqrt" for classifiers and is rejected by recent
        # scikit-learn releases, so pass the equivalent explicit value.
        max_features = "sqrt" if self.max_features == "auto" else self.max_features
        return Pipeline([
            ('pca', PCA(n_components=self.n_components)),
            ('clf', RandomForestClassifier(n_estimators=self.n_estimators,
                                           random_state=42,
                                           max_depth=self.max_depth,
                                           max_features=max_features)),
        ])

    @property
    def classes_(self):
        """Class labels in the column order of ``predict_proba`` (available after fit)."""
        return self.clf.named_steps['clf'].classes_

    def fit(self, X, y):
        """Fit the pipeline on ``X`` / ``y`` and return ``self``."""
        # GridSearchCV (and any other caller) changes parameters through
        # set_params() *after* construction, which only updates the attributes.
        # Rebuilding here is what makes those values actually reach the model.
        self.clf = self._build_pipeline()
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        """Return the predicted label for each row of ``X``."""
        return self.clf.predict(X)

    def predict_proba(self, X):
        """Return class probabilities, one column per entry of ``classes_``."""
        return self.clf.predict_proba(X)

    def score(self, X, y):
        """Return the accuracy of the most probable class against ``y``."""
        # Imported locally so the class does not depend on a later cell
        # having already imported accuracy_score.
        from sklearn.metrics import accuracy_score

        y_proba_clf = self.predict_proba(X)
        # Decode with the labels the model was fitted on rather than a fixed list.
        y_pred_clf = self.classes_[np.argmax(y_proba_clf, axis=1)]
        return accuracy_score(y, y_pred_clf)

In [17]:
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Molecule labels; kept at module level for cells that still reference it.
labels = np.array(['A', 'B', 'Q', 'R'])


def train_test_model_clf(X_df, y_df, skf_is, FeatureExtractor, Classifier):
    """Train and evaluate one classifier on a single train/test split.

    Parameters
    ----------
    X_df : DataFrame of features.
    y_df : DataFrame of targets; the 'molecule' column is the class label.
    skf_is : pair ``(train_is, test_is)`` of positional row indices.
    FeatureExtractor : class with ``fit(X_df, y_df)`` and ``transform(X_df)``.
    Classifier : class with ``fit(X, y)`` and ``predict_proba(X)``.

    Prints the error rate, the classification report and the confusion
    matrix for the test rows. Returns nothing.
    """
    train_is, test_is = skf_is
    X_train_df = X_df.iloc[train_is].copy()
    y_train_df = y_df.iloc[train_is].copy()
    y_train_clf = y_train_df['molecule'].values
    X_test_df = X_df.iloc[test_is].copy()
    y_test_df = y_df.iloc[test_is].copy()
    y_test_clf = y_test_df['molecule'].values

    # Feature extraction: fitted on the training rows only.
    fe_clf = FeatureExtractor()
    fe_clf.fit(X_train_df, y_train_df)
    X_train_array_clf = fe_clf.transform(X_train_df)
    X_test_array_clf = fe_clf.transform(X_test_df)

    # Train; y_train_clf is an array of string labels ("A", "B", ...).
    clf = Classifier()
    clf.fit(X_train_array_clf, y_train_clf)

    # The columns of predict_proba follow the classes seen during fit, so
    # decode with those rather than a fixed label list. Classifiers that do
    # not expose classes_ fall back to the sorted unique training labels,
    # which is the ordering scikit-learn classifiers use.
    class_labels = getattr(clf, 'classes_', None)
    if class_labels is None:
        class_labels = np.unique(y_train_clf)
    class_labels = np.asarray(class_labels)

    # Test
    y_proba_clf = clf.predict_proba(X_test_array_clf)
    y_pred_clf = class_labels[np.argmax(y_proba_clf, axis=1)]
    error = 1 - accuracy_score(y_test_clf, y_pred_clf)
    print('error = %s' % error)
    print('classification report:\n %s' % classification_report(y_test_clf, y_pred_clf))
    print('confusion matrix:\n %s' % confusion_matrix(y_test_clf, y_pred_clf))

In [18]:
skf = ShuffleSplit(n_splits=2, test_size=0.2, random_state=57)  
skf_is = list(skf.split(X_df))[0]

train_test_model_clf(X_df, y_df, skf_is, FeatureExtractorClf, Classifier)

error = 0.065
classification report:
              precision    recall  f1-score   support

          A       0.91      0.95      0.93        63
          B       0.91      0.93      0.92        45
          Q       1.00      0.93      0.96        40
          R       0.94      0.92      0.93        52

avg / total       0.94      0.94      0.94       200

confusion matrix:
 [[60  2  0  1]
 [ 2 42  0  1]
 [ 1  1 37  1]
 [ 3  1  0 48]]


In [27]:
# sklearn.cross_validation and sklearn.grid_search were removed from
# scikit-learn; model_selection (already used earlier in this notebook)
# provides both classes.
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

# Classification target and the array form of the spectra features.
y = y_df['molecule'].values

fe_clf1 = FeatureExtractorClf()
fe_clf1.fit(X_df, y)
X = fe_clf1.transform(X_df)

# Candidate values for the PCA + random-forest pipeline.
max_depth_range = [None, 10, 6]
# "sqrt" is what the legacy "auto" meant for classifiers.
max_features_range = [None, "sqrt"]
n_components_range = [20, 50]

param_grid = dict(n_components=n_components_range,
                  max_depth=max_depth_range,
                  max_features=max_features_range)

# With model_selection the splitter no longer takes y at construction time,
# and n_iter is called n_splits.
cv = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)

# 12 candidates x 3 splits = 36 fits (plus the final refit), each training a
# 300-tree forest: expect this cell to take a while.
grid = GridSearchCV(Classifier(), param_grid=param_grid, cv=cv)
grid.fit(X, y)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

KeyboardInterrupt: 