# Import dependencies

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC  
from sklearn.model_selection import train_test_split  
import matplotlib.pyplot as plt  
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import collections
%matplotlib inline

# Declare configurations

In [21]:
cows = ['Cow_A', 'Cow_B', 'Cow_C']
folder2save = 'extracted_features/'
folder2save_heat_labels = 'heat_labels/'
scorer = 'sky'
cow2model = ['Cow_A', 'Cow_B', 'Cow_C']
feature_types = ['ExtractedFeatures_Sky_', 'Mechanics_previous_Features_Sky_', 'CoordinateFeatures_Sky_']

# Read extracted feature files

In [29]:
extracted_features = collections.defaultdict(dict)
for cow in cows:
    for feature_type in feature_types:
        extracted_features[cow][feature_type] = pd.read_hdf(folder2save + feature_type + cow + '.h5')

# Read label files

In [56]:
heat_labels = dict()
for cow in cows:
    heat_labels[cow] = pd.read_csv(folder2save_heat_labels + "ImpliedHeatLabels_" + scorer + "_" + cow + ".csv", index_col=0)

for cow in cows:
    for index in heat_labels[cow]['implying'].index:
        label = heat_labels[cow]['implying'][index]
        if (label == 2) or (label == 3):
            heat_labels[cow]['implying'][index] = 1
        elif (label == 4):
            heat_labels[cow]['implying'][index] = 0

# Define computable variables

In [57]:
for cow in cows:
    for feature_type in feature_types:
        for feature in extracted_features[cow][feature_type].columns:
            extracted_features[cow][feature_type][feature][extracted_features[cow][feature_type][feature] == np.Infinity] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


# Combine data

In [58]:
dataset = dict()
for cow in cows:
    data2concat = list()
    for feature_type in feature_types:
        data2concat.append(extracted_features[cow][feature_type])
    data2concat.append(heat_labels[cow])
    dataset[cow] = pd.concat(data2concat, axis=1)

In [59]:
dataset['Cow_A'].head(5)

Unnamed: 0,nearest_dist_head2head,nearest_cow_head2head,nearest_dist_head2bottom,nearest_cow_head2bottom,nearest_dist_head2body,nearest_cow_head2body,head_appearance,nearest_dist_bottom2head,nearest_cow_bottom2head,nearest_dist_bottom2bottom,...,distance_Body_,velocity_Body_,acceleration_Body_,Coordi_Nose_x,Coordi_Nose_y,Coordi_Bottom_x,Coordi_Bottom_y,Coordi_Body_x,Coordi_Body_y,implying
data/img0000.png,648.901134,3,125.241924,2,138.939669,2,1,574.188793,3,265.719328,...,0.0,0.0,0.0,645.333,289.067,54.4,388.267,234.24,307.2,1
data/img0250.png,0.0,0,0.0,0,0.0,0,0,0.0,0,190.097238,...,493.559104,246.779552,123.389776,0.0,0.0,987.733,362.667,719.36,398.08,0
data/img0251.png,0.0,0,0.0,0,0.0,0,0,0.0,0,189.471139,...,1.28,0.426667,0.142222,0.0,0.0,987.733,362.667,719.36,399.36,0
data/img0252.png,0.0,0,0.0,0,0.0,0,0,0.0,0,186.080342,...,0.0,0.0,0.0,0.0,0.0,987.733,362.667,719.36,399.36,0
data/img0253.png,0.0,0,0.0,0,0.0,0,0,0.0,0,191.25507,...,0.0,0.0,0.0,0.0,0.0,987.733,362.667,719.36,399.36,0


In [42]:
for cow in cows:
    dataset[cow].rename(columns={
        'implying': 'implying_' + cow
    }, inplace=True)

# Remove unusable samples

In [60]:
dataset_arranged = dict()
for cow in cows:
    desired_samples = list()
    for index in range(len(dataset[cow].index)):
        if (dataset[cow].iloc[index]['head_appearance'] != 0) or (dataset[cow].iloc[index]['bottom_appearance'] != 0) or (dataset[cow].iloc[index]['body_appearance'] != 0):
            desired_samples.append(index)
            
    dataset_arranged[cow] = dataset[cow].iloc[desired_samples]

In [63]:
dataset_arranged['Cow_A'].head(5)

Unnamed: 0,nearest_dist_head2head,nearest_cow_head2head,nearest_dist_head2bottom,nearest_cow_head2bottom,nearest_dist_head2body,nearest_cow_head2body,head_appearance,nearest_dist_bottom2head,nearest_cow_bottom2head,nearest_dist_bottom2bottom,...,distance_Body_,velocity_Body_,acceleration_Body_,Coordi_Nose_x,Coordi_Nose_y,Coordi_Bottom_x,Coordi_Bottom_y,Coordi_Body_x,Coordi_Body_y,implying
data/img0000.png,648.901134,3,125.241924,2,138.939669,2,1,574.188793,3,265.719328,...,0.0,0.0,0.0,645.333,289.067,54.4,388.267,234.24,307.2,1
data/img0250.png,0.0,0,0.0,0,0.0,0,0,0.0,0,190.097238,...,493.559104,246.779552,123.389776,0.0,0.0,987.733,362.667,719.36,398.08,0
data/img0251.png,0.0,0,0.0,0,0.0,0,0,0.0,0,189.471139,...,1.28,0.426667,0.142222,0.0,0.0,987.733,362.667,719.36,399.36,0
data/img0252.png,0.0,0,0.0,0,0.0,0,0,0.0,0,186.080342,...,0.0,0.0,0.0,0.0,0.0,987.733,362.667,719.36,399.36,0
data/img0253.png,0.0,0,0.0,0,0.0,0,0,0.0,0,191.25507,...,0.0,0.0,0.0,0.0,0.0,987.733,362.667,719.36,399.36,0


# Split data for modelling

In [None]:
X = dict()
y = dict()
for cow in cows:
    X[cow] = dataset_arranged[cow].drop('implying', axis=1)
    y[cow] = dataset_arranged[cow]['implying']

In [None]:
X_train = dict()
X_test = dict()
y_train = dict()
y_test = dict()
for cow in cows:
    X_train[cow], X_test[cow], y_train[cow], y_test[cow] = train_test_split(X[cow], y[cow], test_size = 0.20)

# Multiple Models

### Step 1: Normalization

In [119]:
X_train_scaled = dict()
X_test_scaled = dict()
for cow in cows:
    scaler = preprocessing.StandardScaler().fit(X_train[cow])
    X_train_scaled[cow] = scaler.transform(X_train[cow])
    X_test_scaled[cow] = scaler.transform(X_test[cow])

  return self.partial_fit(X, y)
  """
  
  return self.partial_fit(X, y)
  """
  
  return self.partial_fit(X, y)
  """
  


### Step 2: Modelling

In [121]:
C_range = [0.1, 1, 10, 100, 1000]
gamma_range = [0.001, 0.0001]
param_grid = dict(gamma=gamma_range, C=C_range)

In [122]:
# GridSearch
best_parameters = dict()
svc = SVC(kernel='rbf')
for cow in cow2model:
    grid = GridSearchCV(svc, param_grid, cv=5)
    grid.fit(X_train_scaled[cow], y_train[cow])
    best_parameters[cow] = grid.best_params_

In [123]:
# Specify cow to be modeled
svclassifiers = dict()
for cow in cow2model:
    svclassifiers[cow] = SVC(kernel='rbf', C=best_parameters[cow]['C'], gamma=best_parameters[cow]['gamma'])  
    svclassifiers[cow].fit(X_train_scaled[cow], y_train[cow])

In [124]:
# Get predictions with unseen data
y_pred = dict()
for cow in cow2model:
    y_pred[cow] = svclassifiers[cow].predict(X_test_scaled[cow])

### Step 3: Evaluation

In [126]:
# # Configurations
print("**********************************************************************")
print("Number of features:", X_train_scaled['Cow_A'].shape[1])
print("Training set:")
for cow in cow2model:
    print("    " + cow + " -> " + str(X_train_scaled[cow].shape[0]) + " samples")
print("Testing set:")
for cow in cow2model:
    print("    " + cow + " -> " + str(X_test_scaled[cow].shape[0]) + " samples")
print("Type of Model: Multiple Models")
print("**********************************************************************\n")
# for cow in cow2model:
#     print(cow, '-----------------------------------------')
#     print(classification_report(sm_y_test[cow], sm_y_pred[cow]))
    
# # Optional
for cow in cow2model:
    print("Prediction results:", cow, '-----------------------------')
    print("Confusion Matrix :")
    print(confusion_matrix(y_test[cow], y_pred[cow]))
    print('Accuracy =', accuracy_score(y_test[cow], y_pred[cow]))
    print('F1 score (binary) =', f1_score(y_test[cow], y_pred[cow]))
    print('')

**********************************************************************
Number of features: 36
Training set:
    Cow_A -> 1178 samples
    Cow_B -> 1105 samples
    Cow_C -> 1134 samples
Testing set:
    Cow_A -> 295 samples
    Cow_B -> 277 samples
    Cow_C -> 284 samples
Type of Model: Multiple Models
**********************************************************************

Prediction results: Cow_A -----------------------------
Confusion Matrix :
[[174  16]
 [ 16  89]]
Accuracy = 0.8915254237288136
F1 score (binary) = 0.8476190476190476

Prediction results: Cow_B -----------------------------
Confusion Matrix :
[[163  13]
 [ 23  78]]
Accuracy = 0.8700361010830325
F1 score (binary) = 0.8125

Prediction results: Cow_C -----------------------------
Confusion Matrix :
[[177  13]
 [ 15  79]]
Accuracy = 0.9014084507042254
F1 score (binary) = 0.849462365591398



# Single Model

### Step 1: Concat the training sets

In [83]:
X2concat = list()
y2concat = list()
for cow in cows:
    X2concat.append(X_train[cow])
    y2concat.append(y_train[cow])
sm_X_train = pd.concat(X2concat)
sm_y_train = pd.concat(y2concat)

# -- Define test set for single model
sm_X_test = X_test
sm_y_test = y_test

### Step 2: Normalization

In [84]:
# 2nd: Normalization
scaler = preprocessing.StandardScaler().fit(sm_X_train)
sm_X_train_scaled = scaler.transform(sm_X_train)
sm_X_test_scaled = dict()
for cow in cows:
    sm_X_test_scaled[cow] = scaler.transform(sm_X_test[cow])

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  
  
  


### Step 3: Modelling

In [91]:
C_range = [0.1, 1, 10, 100, 1000]
gamma_range = [0.001, 0.0001]
param_grid = dict(gamma=gamma_range, C=C_range)

In [92]:
# GridSearch
best_parameters = dict()
svc = SVC(kernel='rbf')
grid = GridSearchCV(svc, param_grid, cv=5)
grid.fit(sm_X_train_scaled, sm_y_train)
best_parameters = grid.best_params_

In [93]:
# Show best params
best_parameters

{'C': 1000, 'gamma': 0.001}

In [94]:
# Model
classifier = SVC(kernel='rbf', C=best_parameters['C'], gamma=best_parameters['gamma'])  
classifier.fit(sm_X_train_scaled, sm_y_train)  

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [100]:
# Get predictions with unseen data
sm_y_pred = dict()
for cow in cow2model:
    sm_y_pred[cow] = classifier.predict(sm_X_test_scaled[cow])

### Step 4: Evaluation

In [118]:
# # Configurations
print("**********************************************************************")
print("Number of features:", sm_X_train_scaled.shape[1])
print("Training set:", sm_X_train_scaled.shape[0], "samples")
print("Testing set:")
for cow in cow2model:
    print("    " + cow + " -> " + str(sm_X_test_scaled[cow].shape[0]) + " samples")
print("Model:", classifier)
print("Type of Model: Single Model")
print("**********************************************************************\n")
# for cow in cow2model:
#     print(cow, '-----------------------------------------')
#     print(classification_report(sm_y_test[cow], sm_y_pred[cow]))
    
# # Optional
for cow in cow2model:
    print("Prediction results:", cow, '-----------------------------')
    print("Confusion Matrix :")
    print(confusion_matrix(sm_y_test[cow], sm_y_pred[cow]))
    print('Accuracy =', accuracy_score(sm_y_test[cow], sm_y_pred[cow]))
    print('F1 score (binary) =', f1_score(sm_y_test[cow], sm_y_pred[cow]))
    print('')

**********************************************************************
Number of features: 36
Training set: 3417 samples
Testing set:
    Cow_A -> 295 samples
    Cow_B -> 277 samples
    Cow_C -> 284 samples
Model: SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Type of Model: Single Model
**********************************************************************

Prediction results: Cow_A -----------------------------
Confusion Matrix :
[[169  21]
 [ 18  87]]
Accuracy = 0.8677966101694915
F1 score (binary) = 0.8169014084507044

Prediction results: Cow_B -----------------------------
Confusion Matrix :
[[163  13]
 [ 31  70]]
Accuracy = 0.8411552346570397
F1 score (binary) = 0.7608695652173914

Prediction results: Cow_C -----------------------------
Confusion Matrix :
[[170  20]
 [ 11  83]]
Accuracy = 0.890845070422

### Special step: Implementation of k-fold cross validation

In [None]:
kf = KFold(n_splits=5)
for cow in cow2model:
    print(cow, '----------------------')
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[cow][train_index], X_scaled[cow][test_index]
        y_train, y_test = y[cow][train_index], y[cow][test_index]
        
        clf = SVC(kernel='rbf', C=best_parameters[cow]['C'], gamma=best_parameters[cow]['gamma'])
        clf.fit(X_train, y_train)
        
        y_pred = clf.predict(X_test)
        
        print(classification_report(y_test,y_pred))

# Count heat-implying events

In [56]:
all_heat_implies = 0
for cow in cows:
    specified_cow_implies = 0
    for imply in y[cow]:
        if imply == 1:
            specified_cow_implies += 1
    print(cow, ':', specified_cow_implies, "(2000 images)")
    all_heat_implies += specified_cow_implies
print('All heat implying events:', all_heat_implies)

Cow_A : 580 (2000 images)
Cow_B : 488 (2000 images)
Cow_C : 485 (2000 images)
All heat implying events: 1553
