In [1]:
import os
from uuid import uuid4

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from shapely.geometry import Polygon
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm import tqdm

## Extract Objects

In [2]:
# Read the per-image annotation statistics (one row per source image) into a
# DataFrame and preview the first rows.
annotations_path = './segmentations/annotation_statistics.json'
df = pd.read_json(annotations_path)

df.head(n=5)

Unnamed: 0,name,height,width,area,river_area,river_relative_to_image,water_hyacinths,water_hyacinth_bounds,wh_count,garbage,garbage_bounds,g_count,boats,b_count,others,o_count
0,098a1d68568bc.jpg,2250,4000,9000000,7109300.0,0.789922,"1601.45,700.06,2158.59,212.82,988.12,702.63","2248.29,505.18,2328.55,543.52|262.39,481.43,30...",6,"450.63,112.58","409.12,689.54,450.03,706.51|2405.75,520.60,241...",2,,0,161.35,1
1,0bcf8ec7197b6.jpg,2250,4000,9000000,5248960.0,0.583218,"3491.98,3261.25","436.94,834.35,599.95,871.16|2475.63,1007.40,26...",2,"250.78,2111.14,372.59","1381.88,1933.80,1407.25,1951.39|1953.04,1396.7...",3,,0,,0
2,10b9ed1b6f7a1.jpg,6000,8000,48000000,33001060.0,0.687522,"504.41,10393862.49,3545.15,9387.88,1263.40,303...","1788.27,1903.47,1838.92,1937.82|3285.56,2639.7...",7,1904.18,"510.80,3191.46,628.97,3271.91",1,,0,,0
3,1488a698e63bd.jpg,2250,4000,9000000,6838209.0,0.759801,"833.01,673.73,731.12","3808.70,726.13,3859.95,754.70|1448.01,768.40,1...",3,,,0,,0,,0
4,154f0cb77c45e.jpg,2250,4000,9000000,7526385.0,0.836265,"492.94,222.14","3486.60,397.46,3523.62,415.46|1356.79,467.61,1...",2,,,0,,0,,0


In [3]:
# Crop every annotated object out of its downscaled source image, save it as
# a fixed-size JPEG, and record the written file together with its class label.
metadata = {
    'filename': [],
    'type': [],
}

FRAME_SIZE = (640, 360)   # (width, height) every source image is resized to
CROP_SIZE = (128, 128)    # (width, height) of every saved object crop
MIN_CROP_AREA = 50        # boxes covering <= this many frame pixels are skipped

# NOTE: crop paths are absolute (built from the current working directory),
# so the resulting metadata is only valid on the machine that produced it.
WATER_HYACINTH_DIR = '%s/images/water_hyacinths' % os.getcwd()
GARBAGE_DIR = '%s/images/garbages' % os.getcwd()


def _extract_objects(img, bounds, scale_x, scale_y, out_dir, label):
    """Save one crop per usable bounding box and record it in ``metadata``.

    Parameters
    ----------
    img : image array already resized to ``FRAME_SIZE``.
    bounds : '|'-separated boxes, each 'minx,miny,maxx,maxy'. The values are
        multiplied by ``scale_x`` / ``scale_y``, so they are presumably given
        in original-image pixels -- confirm against the annotation export.
        Anything that is not a string (e.g. NaN for a missing cell) is
        treated as "no boxes".
    scale_x, scale_y : factors mapping box coordinates onto ``img``.
    out_dir : directory the crops are written to.
    label : value appended to ``metadata['type']`` for every saved crop.
    """
    if not isinstance(bounds, str):
        return

    for bound in bounds.split('|'):
        if bound == '':
            continue

        minx, miny, maxx, maxy = [round(float(val)) for val in bound.split(',')]
        # A negative start index would wrap around in the slice below.
        minx = max(round(minx * scale_x), 0)
        miny = max(round(miny * scale_y), 0)
        maxx = round(maxx * scale_x)
        maxy = round(maxy * scale_y)

        box_w = maxx - minx
        box_h = maxy - miny
        # Checking each side separately rejects inverted boxes, whose two
        # negative sides would otherwise multiply to a positive area and
        # yield an empty crop that cv2.resize cannot handle.
        if box_w <= 0 or box_h <= 0 or box_w * box_h <= MIN_CROP_AREA:
            continue

        filename = '%s/%s.jpg' % (out_dir, uuid4())
        crop = cv2.resize(img[miny:maxy, minx:maxx].copy(), dsize=CROP_SIZE, interpolation=cv2.INTER_LINEAR)
        # Only record crops that were actually written to disk.
        if cv2.imwrite(filename, crop):
            metadata['filename'].append(filename)
            metadata['type'].append(label)


# Make sure the output directories exist so cv2.imwrite has somewhere to write.
for out_dir in (WATER_HYACINTH_DIR, GARBAGE_DIR):
    os.makedirs(out_dir, exist_ok=True)

# iterrows() has no length, so pass the total explicitly to get a real progress bar.
for _, row in tqdm(df.iterrows(), total=len(df)):
    img = cv2.imread('./images/images/%s' % row['name'])
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None.
        tqdm.write('Skipping unreadable image: %s' % row['name'])
        continue

    img = cv2.resize(img, dsize=FRAME_SIZE, interpolation=cv2.INTER_LINEAR)
    scaleX, scaleY = (FRAME_SIZE[0] / row['width'], FRAME_SIZE[1] / row['height'])

    _extract_objects(img, row['water_hyacinth_bounds'], scaleX, scaleY, WATER_HYACINTH_DIR, 'water_hyacinth')
    _extract_objects(img, row['garbage_bounds'], scaleX, scaleY, GARBAGE_DIR, 'garbage')

36it [00:05,  7.02it/s]


KeyboardInterrupt: 

In [None]:
# Turn the collected crop paths and labels into a two-column table
# ('filename', 'type') and preview it.
new_df = pd.DataFrame(data=metadata)

new_df.head(n=5)

In [None]:
# Number of extracted object crops (rows in the metadata table).
new_df.shape[0]

In [26]:
# Class balance of the extracted crops.
labels = new_df['type']
labels.value_counts()

water_hyacinth    653
garbage           205
Name: type, dtype: int64

In [27]:
# Persist the crop metadata (without the row index) for the training section.
new_df.to_csv(path_or_buf='objects_metadata.csv', index=False)

## Import Dataset

In [2]:
# Combine the extracted-object metadata with the additional 'misclassified'
# samples (presumably curated separately -- the producer of
# misclassified_metadata.csv is not part of this notebook).
metadata_frames = [
    pd.read_csv('objects_metadata.csv'),
    pd.read_csv('misclassified_metadata.csv'),
]

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# both inputs keep their own labels and the result has duplicate index values.
new_df = pd.concat(metadata_frames, ignore_index=True)

new_df.head()

Unnamed: 0,filename,type
0,C:\Users\chris\OneDrive\Documents\School\Thesi...,water_hyacinth
1,C:\Users\chris\OneDrive\Documents\School\Thesi...,water_hyacinth
2,C:\Users\chris\OneDrive\Documents\School\Thesi...,water_hyacinth
3,C:\Users\chris\OneDrive\Documents\School\Thesi...,water_hyacinth
4,C:\Users\chris\OneDrive\Documents\School\Thesi...,garbage


In [4]:
# Class balance of the combined dataset.
labels = new_df['type']
labels.value_counts()

water_hyacinth    653
garbage           205
misclassified      45
Name: type, dtype: int64

In [5]:
# Load every crop listed in the metadata into memory.
filenames = new_df['filename'].tolist()
images = [cv2.imread(path) for path in filenames]

# cv2.imread returns None instead of raising when a file is missing or
# unreadable. Fail fast here: a None entry would otherwise corrupt the array
# below and surface later as a confusing reshape/fit error.
unreadable = [path for path, img in zip(filenames, images) if img is None]
if unreadable:
    raise FileNotFoundError(
        '%d image(s) could not be read, e.g. %s' % (len(unreadable), unreadable[0])
    )

# Stack the equally sized crops into one array: (n_images, height, width, channels).
images = np.array(images)
images.shape

(903, 128, 128, 3)

In [6]:
# Flatten each image into a single feature vector (one row per image) and
# use the 'type' column as the target labels.
n_samples = len(images)
X = images.reshape((n_samples, -1))
print(X.shape)
y = new_df['type']

(903, 49152)


In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

In [26]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier trained on the flattened pixels.
svc_params = dict(kernel='linear', gamma='auto', random_state=0)
model = SVC(**svc_params)
model.fit(X_train, y_train)

SVC(gamma='auto', kernel='linear', random_state=0)

In [27]:
# Evaluate on the held-out split: per-class precision / recall / F1.
pred = model.predict(X_test)

report = classification_report(y_true=y_test, y_pred=pred)
print(report)

                precision    recall  f1-score   support

       garbage       0.53      0.57      0.55        79
 misclassified       0.00      0.00      0.00        16
water_hyacinth       0.77      0.81      0.79       176

      accuracy                           0.69       271
     macro avg       0.43      0.46      0.45       271
  weighted avg       0.66      0.69      0.67       271

