### Ship Images Classification Demo

Demo for simple classifiction of set of ship images from Kaggle
- images and data obtained from https://www.kaggle.com/arpitjain007/game-of-deep-learning-ship-datasets
- use 6252 images in 'train' as whole dataset
- images contain 5 classes of ships, {'Cargo': 1, 'Military': 2, 'Carrier': 3, 'Cruise': 4, 'Tankers': 5}
- extract deep features for each image using function in ImageFeatureExtractor
- split images into train and test sets and perform logistic regression and random forest classification


In [1]:
#import relevant libraries and function
import pandas as pd
import numpy as np
import os
import time

from image_feature_extractor import ImageFeatureExtractor

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
#install pillow if necessary
!pip install pillow



In [3]:
#create image_extractor object
image_extractor = ImageFeatureExtractor()

In [4]:
#read labels into dataframe(make sure file path is correct)
ships = pd.read_csv('./ships/train.csv')

In [5]:
#view dataframe
ships.head()

Unnamed: 0,image,category
0,2823080.jpg,1
1,2870024.jpg,1
2,2662125.jpg,2
3,2900420.jpg,3
4,2804883.jpg,2


In [6]:
ships.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6252 entries, 0 to 6251
Data columns (total 2 columns):
image       6252 non-null object
category    6252 non-null int64
dtypes: int64(1), object(1)
memory usage: 97.8+ KB


In [7]:
#count number of each class of ships
ships['category'].value_counts()

1    2120
5    1217
2    1167
3     916
4     832
Name: category, dtype: int64

In [8]:
#set file path for each image
ships['image_path'] = ships['image'].apply(lambda x: os.path.join('./ships/images', x))

In [9]:
ships.head()

Unnamed: 0,image,category,image_path
0,2823080.jpg,1,./ships/images/2823080.jpg
1,2870024.jpg,1,./ships/images/2870024.jpg
2,2662125.jpg,2,./ships/images/2662125.jpg
3,2900420.jpg,3,./ships/images/2900420.jpg
4,2804883.jpg,2,./ships/images/2804883.jpg


In [10]:
#extract deep features for each image based on file path
#may take 3-5 minutes to extract features from all images
start = time.time()

ships_dp_feat = image_extractor.transform(ships['image_path'])

end = time.time()

print(end-start)

204.29581093788147


In [11]:
#set target variable
labels = ships['category']

In [12]:
#split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(ships_dp_feat, labels, test_size=0.3, random_state=88)

In [13]:
#train logistic regression
#may take some time due to high number of iterations required for convergence

lr = LogisticRegression(C=1, solver='lbfgs', max_iter=2000, multi_class='multinomial')
lr.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
#predict for training and test sets
train_preds = lr.predict(X_train)
test_preds = lr.predict(X_test)

In [15]:
#view model performance for test set
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

           1       0.81      0.82      0.81       642
           2       0.93      0.90      0.92       359
           3       0.93      0.93      0.93       269
           4       0.95      0.91      0.93       246
           5       0.70      0.73      0.71       360

    accuracy                           0.84      1876
   macro avg       0.86      0.86      0.86      1876
weighted avg       0.85      0.84      0.85      1876



In [16]:
#view model performance for train set(overfit)
print(classification_report(y_train, train_preds))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1478
           2       1.00      1.00      1.00       808
           3       1.00      1.00      1.00       647
           4       1.00      1.00      1.00       586
           5       0.99      0.99      0.99       857

    accuracy                           1.00      4376
   macro avg       1.00      1.00      1.00      4376
weighted avg       1.00      1.00      1.00      4376



In [17]:
#train RF classifier
rfc = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [18]:
#predict for train and test sets
train_preds = rfc.predict(X_train)
test_preds = rfc.predict(X_test)

In [19]:
#view model performance for test set
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

           1       0.71      0.90      0.80       642
           2       0.90      0.92      0.91       359
           3       0.95      0.85      0.90       269
           4       0.96      0.89      0.93       246
           5       0.81      0.51      0.63       360

    accuracy                           0.82      1876
   macro avg       0.87      0.82      0.83      1876
weighted avg       0.83      0.82      0.82      1876



In [20]:
#view model performance for train set(overfit)
print(classification_report(y_train, train_preds))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1478
           2       1.00      1.00      1.00       808
           3       1.00      1.00      1.00       647
           4       1.00      1.00      1.00       586
           5       1.00      1.00      1.00       857

    accuracy                           1.00      4376
   macro avg       1.00      1.00      1.00      4376
weighted avg       1.00      1.00      1.00      4376

