In [47]:
import time, datetime
from contextlib import contextmanager
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib


In [48]:
@contextmanager
def measure_time(label):
    """
    Context manager to measure time of computation.
    """
    start = time.time()
    yield
    end = time.time()
    print('Duration of [{}]: {}'.format(label, datetime.timedelta(seconds=end-start)))

# Load data

In [49]:
def load_data(path, to_split=True):
    """
    Load the csv file and returns (X,y).
    """
    # Read the csv file
    df = pd.read_csv(path, header=0, index_col=0)

    # Get the output values
    y = df['crowd_class'].values.squeeze()

    # Get the input values
    feature = 'cluster'
    X = df[feature].values.squeeze()
    X = X.reshape(-1, 1)
    
    # Split train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    if to_split:
        return X_train, X_test, y_train, y_test
    else:
        return X, y


# Load the cleaned data
path = './data/cleaned_HSL_data.csv'
X_train, X_test, y_train, y_test = load_data(path)

# Logistic Regression

In [62]:
from sklearn.linear_model import LogisticRegression


def train(path, to_split=True):
    """
    Train the model.
    """
    filename = "models/lin_reg.pkl"
    
    # Load the training (and testing) set(s)
    if to_split:
        X_train, X_test, y_train, y_test = load_data(path, to_split=to_split)
    else:
        X_train, y_train = load_data(path, to_split=to_split)

    with measure_time('Training...'):
        model = LogisticRegression(random_state=42)
        model.fit(X_train, y_train)
        joblib.dump(model, filename) 
        
    y_pred = model.predict(X_train)
    print("=================================================================")
    print("Logistic Regression Training set accuracy: {}".format(accuracy_score(y_train, y_pred)))
    print("=================================================================")
    
    
    if to_split:
        y_pred = model.predict(X_test)
        print("Logistic Regression Test set accuracy: {}".format(accuracy_score(y_test, y_pred)))
        print("=================================================================")
        
        
# Train our model
train(path)

Duration of [Training...]: 0:00:00.018964
Logistic Regression Training set accuracy: 0.3699217294073798
Logistic Regression Test set accuracy: 0.353204172876304




# Support Vector Classifier

In [51]:
from sklearn.svm import SVC


def train(path, to_split=True):
    """
    Train the model.
    """
    filename = "models/svc.pkl"
    
    # Load the training (and testing) set(s)
    if to_split:
        X_train, X_test, y_train, y_test = load_data(path, to_split=to_split)
    else:
        X_train, y_train = load_data(path, to_split=to_split)

    with measure_time('Training...'):
        model = SVC(kernel='rbf', max_iter=100000)
        model.fit(X_train, y_train)
        joblib.dump(model, filename) 
        
    y_pred = svm_model.predict(X_train)
    print("=================================================================")
    print("SVM Training set accuracy: {}".format(accuracy_score(y_train, y_pred)))
    print("=================================================================")
    
    if to_split:
        y_pred = svm_model.predict(X_test)
        print("SVM Test set accuracy: {}".format(accuracy_score(y_test, y_pred)))
        print("=================================================================")
        
        
# Train our model
train(path)



Duration of [Training...]: 0:00:00.533622
SVM Training set accuracy: 0.39619828550130454
SVM Test set accuracy: 0.36065573770491804


# Knn

In [61]:
from sklearn.neighbors import KNeighborsClassifier


def train(path, to_split=True):
    """
    Train the model.
    """
    filename = "models/knn.pkl"
    
    # Load the training (and testing) set(s)
    if to_split:
        X_train, X_test, y_train, y_test = load_data(path, to_split=to_split)
    else:
        X_train, y_train = load_data(path, to_split=to_split)

    with measure_time('Training...'):
        model = KNeighborsClassifier(n_neighbors=100)
        model.fit(X_train, y_train)
        joblib.dump(model, filename) 
        
    y_pred = model.predict(X_train)
    print("=================================================================")
    print("Knn Training set accuracy: {}".format(accuracy_score(y_train, y_pred)))
    print("=================================================================")
    
    if to_split:
        y_pred = model.predict(X_test)
        print("Knn Test set accuracy: {}".format(accuracy_score(y_test, y_pred)))
        print("=================================================================")
        
        
# Train our model
train(path)

Duration of [Training...]: 0:00:00.018732
Knn Training set accuracy: 0.37700335445396943
Knn Test set accuracy: 0.36363636363636365


# Multi-layer Perceptron

In [64]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV


def tune_hyperparameter(path):
    """
    Get the best hyperparameters.
    """
   # Load the training set
    X, y = load_data(path, to_split=False)
        
    # Create the random grid
    random_grid = {'hidden_layer_sizes': [(20,), (50,), (100,), (150,)],
                    'activation': ['tanh', 'relu', 'logistic', 'identity'],
                    'learning_rate_init': [0.005, 0.01, 0.05, 0.1, 0.2, 0.3],
                    'learning_rate': ['constant','adaptive'],
                    'momentum': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}
    
    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    mlp = MLPClassifier(solver='sgd', early_stopping=True)
    # Random search of parameters, using 5 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    mlp_random = RandomizedSearchCV(estimator = mlp,
                                   param_distributions = random_grid,
                                   n_iter = 100,
                                   cv = 5,
                                   verbose=2,
                                   random_state=42,
                                   n_jobs = -1)
    # Fit the random search model
    mlp_random.fit(X, y)

    print("Best parameters", mlp_random.best_params_)
    
    
tune_hyperparameter(path)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:   49.0s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.2min finished


Best parameters {'momentum': 0.9, 'learning_rate_init': 0.1, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (150,), 'activation': 'tanh'}


In [65]:
def create_estimator(path, to_split=True):
    """
    Train the model.
    """
    filename = "models/mlp.pkl"
    
    # Load the training (and testing) set(s)
    if to_split:
        X_train, X_test, y_train, y_test = load_data(path, to_split=to_split)
    else:
        X_train, y_train = load_data(path, to_split=to_split)

    with measure_time('Training...'):
        model = MLPClassifier(solver='sgd', 
                                hidden_layer_sizes = (15,), 
                                early_stopping=True,
                                learning_rate_init= 0.1,
                                learning_rate = 'adaptive',
                                activation='tanh',
                                momentum=0.9)
        model.fit(X_train, y_train)
        joblib.dump(model, filename) 
        
    y_pred = model.predict(X_train)
    print("=================================================================")
    print("MLP Training set accuracy: {}".format(accuracy_score(y_train, y_pred)))
    print("=================================================================")
    
    if to_split:
        y_pred = model.predict(X_test)
        print("MLP Test set accuracy: {}".format(accuracy_score(y_test, y_pred)))
        print("=================================================================")


create_estimator(path)

Duration of [Training...]: 0:00:00.665886
MLP Training set accuracy: 0.3740216175922475
MLP Test set accuracy: 0.36736214605067063
