In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# Import the data set

In [21]:
lab_data = pd.read_csv('../../data/83_Loeschcke_et_al_2000_Thorax_&_wing_traits_lab pops.csv')

# Data cleaning
# TODO: Instead of deleting this row:
# Make the Thorax lenght the median for the classification
# Calculate the wing index as per the paper defini
lab_data = lab_data[lab_data['Thorax_length'] != '.']

In [22]:
lab_data['Thorax_length'] = pd.to_numeric(lab_data['Thorax_length'])
lab_data['wing_loading'] = pd.to_numeric(lab_data['wing_loading'])

In [23]:
lab_data.head()

Unnamed: 0,Species,Population,Latitude,Longitude,Year_start,Year_end,Temperature,Vial,Replicate,Sex,Thorax_length,l2,l3p,l3d,lpd,l3,w1,w2,w3,wing_loading
0,D._aldrichi,Binjour,-25.52,151.45,1994,1994,20,1,1,female,1.238,2.017,0.659,1.711,2.37,2.37,1.032,1.441,1.192,1.914
1,D._aldrichi,Binjour,-25.52,151.45,1994,1994,20,1,1,male,1.113,1.811,0.609,1.539,2.148,2.146,0.938,1.299,1.066,1.928
2,D._aldrichi,Binjour,-25.52,151.45,1994,1994,20,1,2,female,1.215,1.985,0.648,1.671,2.319,2.319,0.991,1.396,1.142,1.908
3,D._aldrichi,Binjour,-25.52,151.45,1994,1994,20,1,2,male,1.123,1.713,0.596,1.495,2.091,2.088,0.958,1.286,1.062,1.86
4,D._aldrichi,Binjour,-25.52,151.45,1994,1994,20,2,1,female,1.218,1.938,0.641,1.658,2.298,2.298,1.01,1.418,1.148,1.886


In [24]:
lab_data = shuffle(lab_data)
lab_data['class'] = lab_data.apply(
    lambda x: f"{x['Species']}, {x['Sex']}",
    axis=1
)
lab_data.head()

Unnamed: 0,Species,Population,Latitude,Longitude,Year_start,Year_end,Temperature,Vial,Replicate,Sex,...,l2,l3p,l3d,lpd,l3,w1,w2,w3,wing_loading,class
542,D._aldrichi,Oxford_Downs,-21.77,148.85,1994,1994,20,6,3,female,...,1.901,0.638,1.623,2.261,2.259,1.031,1.382,1.185,1.908,"D._aldrichi, female"
938,D._buzzatii,Binjour,-25.52,151.45,1994,1994,25,7,2,male,...,1.659,0.572,1.398,1.97,1.969,0.877,1.196,1.006,1.757,"D._buzzatii, male"
718,D._aldrichi,Wahruna,-25.2,151.17,1994,1994,20,9,1,female,...,1.814,0.647,1.554,2.201,2.2,0.988,1.34,1.109,1.906,"D._aldrichi, female"
737,D._aldrichi,Wahruna,-25.2,151.17,1994,1994,25,2,1,male,...,1.605,0.512,1.448,1.959,1.959,0.892,1.211,0.986,1.661,"D._aldrichi, male"
1702,D._buzzatii,Wahruna,-25.2,151.17,1994,1994,30,6,1,female,...,1.672,0.592,1.373,1.965,1.965,0.877,1.223,0.974,1.754,"D._buzzatii, female"


In [25]:

train, test = train_test_split(lab_data, test_size=0.3)

In [26]:
number_columns = train.select_dtypes(include=['int64', 'float64']).columns

scaler = MinMaxScaler(feature_range=(0,1))
scaler.fit(train[number_columns])

train[number_columns] = scaler.transform(train[number_columns])
test[number_columns] = scaler.transform(test[number_columns])

In [27]:
print(number_columns)
train.info()

Index(['Latitude', 'Longitude', 'Year_start', 'Year_end', 'Temperature',
       'Vial', 'Replicate', 'Thorax_length', 'l2', 'l3p', 'l3d', 'lpd', 'l3',
       'w1', 'w2', 'w3', 'wing_loading'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 1211 entries, 125 to 839
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Species        1211 non-null   object 
 1   Population     1211 non-null   object 
 2   Latitude       1211 non-null   float64
 3   Longitude      1211 non-null   float64
 4   Year_start     1211 non-null   float64
 5   Year_end       1211 non-null   float64
 6   Temperature    1211 non-null   float64
 7   Vial           1211 non-null   float64
 8   Replicate      1211 non-null   float64
 9   Sex            1211 non-null   object 
 10  Thorax_length  1211 non-null   float64
 11  l2             1211 non-null   float64
 12  l3p            1211 non-null   float64
 13  l3d            12

In [28]:
classifier = LogisticRegression(multi_class='multinomial')

classifier.fit(
    train[['Thorax_length', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'wing_loading']],
    train[['class']]
)

  y = column_or_1d(y, warn=True)


In [29]:
train['class_predictions'] = classifier.predict(train[['Thorax_length', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'wing_loading']])

In [30]:
train.head()

Unnamed: 0,Species,Population,Latitude,Longitude,Year_start,Year_end,Temperature,Vial,Replicate,Sex,...,l3p,l3d,lpd,l3,w1,w2,w3,wing_loading,class,class_predictions
125,D._aldrichi,Binjour,0.365482,0.722222,0.0,0.0,1.0,0.333333,0.5,male,...,0.09894,0.756507,0.742042,0.741522,0.730627,0.189824,0.715434,0.866798,"D._aldrichi, male","D._aldrichi, male"
1205,D._buzzatii,Grandchester,0.0,1.0,0.0,0.0,0.0,0.111111,0.5,male,...,0.639576,0.857143,0.877222,0.877585,0.854244,0.569472,0.855305,0.929946,"D._buzzatii, male","D._buzzatii, male"
1482,D._buzzatii,Oxford_Downs,1.0,0.0,0.0,0.0,0.5,0.888889,0.0,male,...,0.473498,0.824176,0.833816,0.83416,0.801661,0.420744,0.799035,0.901825,"D._buzzatii, male","D._buzzatii, male"
1702,D._buzzatii,Wahruna,0.419628,0.644444,0.0,0.0,1.0,0.555556,0.0,female,...,0.469965,0.794101,0.812319,0.812655,0.809041,0.430528,0.782958,0.865318,"D._buzzatii, female","D._aldrichi, female"
1712,D._buzzatii,Wahruna,0.419628,0.644444,0.0,0.0,1.0,0.666667,1.0,male,...,0.176678,0.742626,0.741215,0.741108,0.76845,0.242661,0.743569,0.867292,"D._buzzatii, male","D._aldrichi, male"


In [31]:
#Correct predictions
train.apply(
    lambda x: 1 if x['class_predictions'] == x['class'] else 0,
    axis=1
).sum()

585

In [32]:
#Incorrect predictions
train.apply(
    lambda x: 1 if x['class_predictions'] != x['class'] else 0,
    axis=1
).sum()

626

In [33]:
# Retry with different parameters and regularization


In [34]:
regularised_classifier = LogisticRegression(C=1e5, solver='newton-cg', multi_class='multinomial')

regularised_classifier.fit(
    train[['Thorax_length', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'wing_loading']],
    train[['class']]
)

  y = column_or_1d(y, warn=True)


In [35]:
train['regularised_class_predictions'] = regularised_classifier.predict(train[['Thorax_length', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'wing_loading']])

In [36]:
#Correct deep predictions
train.apply(
    lambda x: 1 if x['regularised_class_predictions'] == x['class'] else 0,
    axis=1
).sum()

732

In [37]:
test['softmax_predictions'] = classifier.predict(test[['Thorax_length', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'wing_loading']])
test['regularised_softmax_predictions'] = regularised_classifier.predict(test[['Thorax_length', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'l2', 'l3p', 'wing_loading']])


correct_softmax_classification_test = test.apply(
    lambda x: 1 if x['softmax_predictions'] == x['class'] else 0,
    axis=1
).sum()

print("Softmax correct predictions test set:" + str(correct_softmax_classification_test))

correct_regularised_classification_test = test.apply(
    lambda x: 1 if x['regularised_softmax_predictions'] == x['class'] else 0,
    axis=1
).sum()

print("Regularised softmax predictions test set:" + str(correct_regularised_classification_test))


Softmax correct predictions test set:239
Regularised softmax predictions test set:294


In [38]:
print(f"accuracy: {correct_regularised_classification_test / len(test)}")

accuracy: 0.5664739884393064
