# IMAGE CLASSIFICATION MINI PROJECT

#### THIS PROGRAM CLASSIFIES IMAGES INTO CATS AND DOGS

In [93]:
import os
from bing_image_downloader import downloader
from PIL import Image
import numpy as np
import requests
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split as tts
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report

## GATHERING DATA

In [2]:
path = os.getcwd()
os.mkdir('images')
os.chdir(os.path.join(path, 'images'))

In [128]:
labels = ['cat', 'dog']
animals = {}
for i, an in enumerate(labels):
    animals[i] = an

#DOWNLOADING IMAGES (ALREADY DOWNLOADED ONCE)

'''for label in labels:
    downloader.download(label, limit = 1000)'''


'for label in labels:\n    downloader.download(label, limit = 1000)'

## PREPROCESSING DATA

In [4]:
path = 'C:\\Users\\anike\\Desktop\\SmartKnower Mini Project\\images\\dataset'
os.chdir(path)

In [5]:
# 0 : cat
# 1 : dog

img_X = []
img_Y = []
for label in os.listdir():
    new_path = os.path.join(path, label)
    for img in os.listdir(new_path):
        img_path = os.path.join(new_path, img)
        im = Image.open(img_path)
        im = im.resize((256, 256))
        img_arr = np.asarray(im)
        img_X.append(img_arr.flatten())
        img_Y.append(label == 'dog')

### DIMENSIONALITY REDUCTION

In [6]:
pca = PCA()
x_pca = pca.fit_transform(img_X)

In [7]:
#choosing optimal k

k = 0
var = 0
total_var = pca.explained_variance_.sum()
while var / total_var < 0.99 :
    var += pca.explained_variance_[k]
    k += 1
k   #clearly, only 171 dimensions are required to retain 99% of original data

257

In [8]:
pca = PCA(n_components = k)
X = pca.fit_transform(img_X)
Y = np.asarray(img_Y, dtype = int)

## SPLITTING DATA

In [9]:
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size = 0.2, random_state = 0)

## APPLYING VARIOUS CLASSIFICATION ALGORITHMS

### SVM

In [14]:
clf_svm = SVC()
clf_svm.fit(X_train, Y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [15]:
clf_svm.score(X_test, Y_test)

1.0

### KNN

In [23]:
clf_knn = KNeighborsClassifier(n_neighbors = 3)
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [24]:
clf_knn.score(X_test, Y_test)

0.9975

### DECISION TREES

In [27]:
clf_tree = DecisionTreeClassifier()
clf_tree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [28]:
clf_tree.score(X_test, Y_test)

1.0

### RANDOM FORESTS

In [30]:
clf_forest = RandomForestClassifier()
clf_forest.fit(X_train, Y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [31]:
clf_forest.score(X_test, Y_test)

1.0

### LOGISTIC REGRESSION

In [33]:
clf_lr = LogisticRegression()
clf_lr.fit(X_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
clf_lr.score(X_test, Y_test)

1.0

### NAIVE BAYES

In [36]:
clf_nb = GaussianNB()
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [37]:
clf_nb.score(X_test, Y_test)

0.82

## ANALYSING PREDICTIONS

In [83]:
def cm_and_cr(clf):
    pred = clf.predict(X_test)
    print('CONFUSION MATRIX : ')
    print(confusion_matrix(Y_test, pred))
    print('\n\nCLASSIFICATION REPORT : ')
    print(classification_report(pred, Y_test))

In [84]:
cm_and_cr(clf_svm)

CONFUSION MATRIX : 
[[200   0]
 [  0 200]]


CLASSIFICATION REPORT : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       200
           1       1.00      1.00      1.00       200

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [85]:
cm_and_cr(clf_knn)

CONFUSION MATRIX : 
[[199   1]
 [  0 200]]


CLASSIFICATION REPORT : 
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       199
           1       1.00      1.00      1.00       201

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [86]:
cm_and_cr(clf_tree)

CONFUSION MATRIX : 
[[200   0]
 [  0 200]]


CLASSIFICATION REPORT : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       200
           1       1.00      1.00      1.00       200

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [87]:
cm_and_cr(clf_forest)

CONFUSION MATRIX : 
[[200   0]
 [  0 200]]


CLASSIFICATION REPORT : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       200
           1       1.00      1.00      1.00       200

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [88]:
cm_and_cr(clf_lr)

CONFUSION MATRIX : 
[[200   0]
 [  0 200]]


CLASSIFICATION REPORT : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       200
           1       1.00      1.00      1.00       200

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [89]:
cm_and_cr(clf_nb)

CONFUSION MATRIX : 
[[164  36]
 [ 36 164]]


CLASSIFICATION REPORT : 
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       200
           1       0.82      0.82      0.82       200

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



## PREDICT FOR NEW IMAGE

In [95]:
#it is an image of a cat

im = Image.open(requests.get(r'https://img.webmd.com/dtmcms/live/webmd/consumer_assets/site_images/article_thumbnails/other/cat_relaxing_on_patio_other/1800x1200_cat_relaxing_on_patio_other.jpg', stream=True).raw)

In [133]:
im = im.resize((256, 256))
p = np.asarray(im).flatten()
l = p.reshape(1, -1)
a = pca.transform(l)
pred_new = clf_svm.predict(a)[0]

In [134]:
animals[pred_new]  #images has been correctly classified

'cat'