In [93]:
from tqdm import tqdm
import os
import skimage
import skimage.io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# 1. `input`

In [2]:
indoor = 'indoor/'
outdoor = 'outdoor/'

In [3]:
files_indoor = os.listdir(indoor)
files_outdoor = os.listdir(outdoor)

In [4]:
len(files_indoor), len(files_outdoor)

(28106, 27216)

In [5]:
f_indoor = [indoor+i for i in files_indoor]
f_outdoor = [outdoor+i for i in files_outdoor]
indoor_images = skimage.io.imread_collection(f_indoor)
outdoor_images = skimage.io.imread_collection(f_outdoor)
assert len(indoor_images) == len(files_indoor)
assert len(outdoor_images) == len(files_outdoor)

# 2. `Histograms: features`

Возьмём следующие признаки: гистограммы по компонентам, среднее значение гистограмм, максимум и минимум.

In [108]:
columns = [f'red_{i}' for i in np.arange(256)] + \
       [f'green_{i}' for i in np.arange(256)] + \
       [f'blue_{i}' for i in np.arange(256)] + ['mean', 'max', 'min']
df = pd.DataFrame(columns=columns)

In [109]:
def get_histograms(image):
    red, green, blue = image[:, :, 0], image[:, :, 1], image[:, :, 2]
    red_freq = np.bincount(red.ravel(), minlength=256)
    green_freq = np.bincount(green.ravel(), minlength=256)
    blue_freq = np.bincount(blue.ravel(), minlength=256)
    features = np.hstack((red_freq, green_freq, blue_freq, image.mean(), image.max(), image.min()))
    return features

In [110]:
for image in tqdm(indoor_images, desc="Indoor Images"):
    df.loc[len(df)] = get_histograms(image)

Indoor Images: 100%|█████████████████████████████████████████████████████████████| 28106/28106 [31:12<00:00, 15.01it/s]


In [111]:
for image in tqdm(outdoor_images, desc="Outdoor Images"):
    df.loc[len(df)] = get_histograms(image)

Outdoor Images: 100%|██████████████████████████████████████████████████████████| 27216/27216 [1:30:44<00:00,  5.00it/s]


У меня недолго обучалось - использовал просто более быструю функцию `np.bincount`

In [112]:
df.shape

(55322, 771)

# 3. `Train`

In [113]:
df['class'] = np.r_[np.ones(len(files_indoor)), np.zeros(len(files_outdoor))]

In [114]:
X = df.drop('class', axis=1).values
y = df['class'].values

In [117]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [122]:
results = {}
cl = KFold(n_splits=10)

In [123]:
for splt, (train_index, test_index) in enumerate(cl.split(X), 1):
    results[f'fold {splt}'] = {'RandomForestClassifier(100)': 0}
    print('Fitting')
    rfc = RandomForestClassifier(n_estimators=100, random_state=np.random.randint(0, 1000))
    rfc.fit(X[train_index], y[train_index])
   
    results[f'fold {splt}']['RandomForestClassifier(100)'] = accuracy_score(rfc.predict(X[test_index]), y[test_index])
    print(f'Done Fitting fold {splt}')

Fitting
Done Fitting fold 1
Fitting
Done Fitting fold 2
Fitting
Done Fitting fold 3
Fitting
Done Fitting fold 4
Fitting
Done Fitting fold 5
Fitting
Done Fitting fold 6
Fitting
Done Fitting fold 7
Fitting
Done Fitting fold 8
Fitting
Done Fitting fold 9
Fitting
Done Fitting fold 10


In [124]:
results

{'fold 1': {'RandomForestClassifier(100)': 0.8129405385866618},
 'fold 2': {'RandomForestClassifier(100)': 0.8140249412615218},
 'fold 3': {'RandomForestClassifier(100)': 0.8206796818510484},
 'fold 4': {'RandomForestClassifier(100)': 0.8333333333333334},
 'fold 5': {'RandomForestClassifier(100)': 0.8302603036876356},
 'fold 6': {'RandomForestClassifier(100)': 0.8199566160520607},
 'fold 7': {'RandomForestClassifier(100)': 0.8049530007230657},
 'fold 8': {'RandomForestClassifier(100)': 0.8022415039768619},
 'fold 9': {'RandomForestClassifier(100)': 0.7977223427331888},
 'fold 10': {'RandomForestClassifier(100)': 0.789587852494577}}

In [127]:
np.mean([results[k]['RandomForestClassifier(100)'] for k in results.keys()])

0.8125700114699954

Получили вполне неплохую точность верных ответов.