#  Project 1 - Bee Subspecie Classification (logistic regression vs neural network approach) 

### Import relevant libraries

In [47]:
import sys
import numpy as np
np.set_printoptions(threshold=sys.maxsize)
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import plotly.express as px
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold, LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#to load matlab mat files
from scipy.io import loadmat

size = 50

## PART 2: Load data

In [51]:
ds = pd.read_csv('bee_dataset/ml_data.csv')
features=ds['features']
y1=ds['class']
y2 = {'Italian honey bee':1,'Carniolan honey bee':2,
      'Russian honey bee':3,'1 Mixed local stock 2':4,'Western honey bee':5}
y = (pd.Series(y1)).map(y2)
y = np.array(y)
#y = np.reshape(y, (-1, 1))
#print(type(y))
#print(y)
n_examples = features.shape[0]
feature_size = size*size
X = np.empty((n_examples,feature_size), float)
for idx,f in enumerate(features):
        arr_f = np.fromstring(f, dtype=float, sep=' ')
        arr_f = np.reshape(arr_f, (1,arr_f.shape[0]))
        X[idx,:] = np.copy(arr_f)
#X = np.copy(arr)
#print(y.shape)
#print(X.shape)

# 3 way split: Train/Dev/Test Sets
# As we use a small data set (up to 10000 examples) we will use this division: 60%/20%/20%
# One approach to dividing the dataset into train, test, cv with 0.6, 0.2, 0.2 would be to use the train_test_split method twice.

x, x_test, y, y_test = train_test_split(X,y,test_size=0.2,train_size=0.8)
x_train, x_cv, y_train, y_cv = train_test_split(x,y,test_size = 0.25,train_size =0.75)

print('X_train.shape: ' + str(x_train.shape))
print('y_train.shape: ' + str(y_train.shape))
print('x_cv.shape: ' + str(x_cv.shape))
print('y_cv.shape: ' + str(y_cv.shape))
print('X_test.shape: ' + str(x_test.shape))
print('y_test.shape: ' + str(y_test.shape))

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = LogisticRegression(max_iter=7000)
model.fit(x_train, y_train)
result = model.score(x_test, y_test)
print("Accuracy score: {}".format(result))
#print (X_train.shape, y_train.shape)
#print (X_test.shape, y_test.shape)

X_train.shape: (304, 2500)
y_train.shape: (304,)
x_cv.shape: (102, 2500)
y_cv.shape: (102,)
X_test.shape: (102, 2500)
y_test.shape: (102,)
Accuracy score: 0.6666666666666666


In [54]:
#K-Fold Cross-Validation
#Pros: The whole dataset is used as both a training set and validation set
#Cons: Not to be used for imbalanced datasets

kf=KFold(n_splits=5)
score=cross_val_score(model,x_train,y_train,cv=kf)
print("Cross Validation Scores: {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))

Cross Validation Scores: [0.75409836 0.67213115 0.6557377  0.67213115 0.78333333]
Average Cross Validation score :0.7074863387978142


In [55]:
#Stratified K-Fold Cross-Validation
#Pros: Works perfectly well for Imbalanced Data
#Cons: Not suitable for Time Series data

stratifiedkf=StratifiedKFold(n_splits=5)
score=cross_val_score(model,x_train,y_train,cv=stratifiedkf)
print("Cross Validation Scores: {}".format(score))
print("Average Cross Validation score:{}".format(score.mean()))

Cross Validation Scores: [0.7704918  0.73770492 0.67213115 0.59016393 0.78333333]
Average Cross Validation score:0.7107650273224044


In [None]:
#Leave One Out cross-validation

loo=LeaveOneOut()
score=cross_val_score(model,x_train,y_train,cv=loo)
print("Cross Validation Scores: {}".format(score))
print("Average Cross Validation score:{}".format(score.mean()))