In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
import time
import pickle
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

In [19]:
#CSV file reading
"""
Generating dictionaries
for photo_id ---> buisness_id
"""
photo_to_bus_dict = {}
with open('train_photo_to_biz_ids.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            print(f'Column names are {", ".join(row)}')
            line_count += 1
        else:
            photo_to_bus_dict[row[0]] = row[1]
            line_count += 1
    print(f'Processed {line_count} lines.')
print(len(photo_to_bus_dict))
    
"""
Generating dictionaries
for business_id ---> labels
"""
bus_to_labels_dict = {}     
with open('train.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            print(f'Column names are {", ".join(row)}')
            line_count += 1
        else:
            bus_to_labels_dict[row[0]] = [row[1]]
            line_count += 1
    print(f'Processed {line_count} lines.')  
print(len(bus_to_labels_dict))

"""
Checking the distribution of labels
in the training set
"""
hist_label = [0]*9
for filename in os.listdir('training_set'):
    photo_id = filename.split('.')
    lst_labels = bus_to_labels_dict[photo_to_bus_dict[photo_id[0]]]
#     print(lst_labels)
    labels = lst_labels[0].split(' ')
#     print(labels)
    if len(labels)!=0:
        for i in labels:
            if i is not '':
                hist_label[int(i)]+=1

print("Histogram distribution for the labels in the training_set")
print(hist_label)

Column names are photo_id, business_id
Processed 234843 lines.
234842
Column names are business_id, labels
Processed 2001 lines.
2000
Histogram distribution for the labels in the training_set
[9163, 23679, 25603, 19453, 16829, 29454, 30772, 14825, 20300]


In [20]:
"""
Get the training data
Returns images and respective photo ids
"""
def get_train_data():
    data = []
    photo_ids = []
    count=0
    for filename in sorted(os.listdir('training_set/'),key=lambda x: int(os.path.splitext(x)[0])):
        if count%5000==4999:
            print(filename)
        photo_ids.append(filename.split('.')[0])
        data.append(cv2.imread('training_set/'+filename,0))
        count+=1
    return data,photo_ids
    
train_data, train_photo_ids = get_train_data()
print(len(train_data))

10195.jpg
20062.jpg
29986.jpg
40137.jpg
50092.jpg
59987.jpg
70098.jpg
79934.jpg
40000


In [12]:
"""
Get the testing data
Returns images and respective photo ids
"""
def get_test_data():
    data = []
    photo_ids = []
    count=0
    for filename in sorted(os.listdir('testing_set/'),key=lambda x: int(os.path.splitext(x)[0])):
        if count%5000==4999:
            print(filename)
        photo_ids.append(filename.split('.')[0])
        data.append(cv2.imread('testing_set/'+filename,0))
        count+=1
    return data,photo_ids
    
test_data, test_photo_ids = get_test_data()
print(len(test_data))

10195.jpg
20062.jpg
10000


In [13]:
"""
Creates no_images*labels_size matrix
with each row indicating what labels
are assigned to each photo
"""
def id_to_label(photo_ids):
    photo_labels = np.empty(shape=[0,9])
    for id in photo_ids:
        lst_labels = bus_to_labels_dict[photo_to_bus_dict[id]]
        ls = lst_labels[0].split(' ')
        labels = [0]*9
        for label in ls:
            if label !='':
                labels[int(label)] = 1
        labels = np.array(labels).reshape(1,-1)
        photo_labels = np.append(photo_labels,labels,axis=0)
    return photo_labels

In [21]:
#Training set
"""
Contains no_images*labels_size matrix
with each row indicating what labels
are assigned to each photo
"""
train_photo_labels = id_to_label(train_photo_ids)
print(train_photo_labels[0])
print(bus_to_labels_dict[photo_to_bus_dict[train_photo_ids[0]]])
print(train_photo_labels.shape)
print(train_photo_labels[:,0].shape)

[ 0.  1.  1.  0.  1.  1.  1.  1.  0.]
['1 2 4 5 6 7']
(40000, 9)
(40000,)


In [16]:
#Testing set
"""
Contains no_images*labels_size matrix
with each row indicating what labels
are assigned to each photo
"""
test_photo_labels = id_to_label(test_photo_ids)
print(test_photo_labels[0])
print(bus_to_labels_dict[photo_to_bus_dict[test_photo_ids[0]]])
print(test_photo_labels.shape)
print(test_photo_labels[:,0].shape)

[ 0.  1.  1.  0.  1.  1.  1.  1.  0.]
['1 2 4 5 6 7']
(10000, 9)
(10000,)


In [3]:
"""
Extracting descriptors for training data
"""
t0 = time.time()
with open("descriptors_train.sav",'rb') as fp:
    train_individual_des, train_des=pickle.load(fp)

print("Shape of overall training images descriptors:-")
print(train_des.shape)
print("Extracting training images descriptors took %0.3fs." % (time.time() - t0))

Shape of overall training images descriptors:-
(3132260, 128)
Extracting training images descriptors took 94.181s.


In [4]:
"""
Extracting descriptors for testing data
"""
t0 = time.time()
with open("descriptors_test.sav",'rb') as fp:
    test_individual_des, test_des =pickle.load(fp)

print("Shape of overall test images descriptors:-")
print(test_des.shape)
print("Extracting test images descriptors took %0.3fs." % (time.time() - t0))

Shape of overall test images descriptors:-
(782829, 128)
Extracting test images descriptors took 86.721s.


In [88]:
"""
Generating cluster using K-Means
"""
t0 = time.time()
no_clusters = 50
kmeans = KMeans(no_clusters).fit(train_des)
print("Training the K-Means clustering took %0.3fs." % (time.time() - t0))

filename = "clusters/bov_pickle_"+str(no_clusters)+"_step_"+str(step)+"_scale_"+str(scale)+".sav"
pickle.dump(kmeans, open(filename, 'wb'))
print("Model saved to file: " + str(filename))

Training the K-Means clustering took 2821.547s.
Model saved to file: clusters/bov_pickle_50_step_15_scale_20.sav


In [5]:
"""
Generating histograms for train/test
based on the clusters that we get K-Means
"""
def create_histogram(individual_data, kmeans,no_clusters):
    hist = np.empty(shape=[0,no_clusters])
    for i in individual_data:
        pred_te = kmeans.predict(i)
        h = np.histogram(pred_te,bins = np.arange(0,no_clusters+1))
        total = i.shape[0]
        tmp = []
        for j in range(0,no_clusters):
            tmp.append(h[0][j])
        tmp = np.asarray(tmp)
        tmp = tmp.reshape(1, no_clusters)
        tmp = tmp/(total*1.0)
        hist= np.append(hist,tmp,axis=0)
    return hist

In [22]:
#For loading the model

with open("descriptors_test_cluster.sav",'rb') as fp:
    kmeans=pickle.load(fp)
no_clusters = 50

"""
Creating histograms or Bag of visual words
for training and testing descriptors using K-Means cluster from before
"""
t0 = time.time()
train_hist = create_histogram(train_individual_des, kmeans, no_clusters)
test_hist = create_histogram(test_individual_des, kmeans, no_clusters)
print("Calculating histograms took %0.3fs." % (time.time() - t0))



Calculating histograms took 1005.718s.


In [7]:
from sklearn.metrics import accuracy_score, average_precision_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.externals import joblib
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn import linear_model
import seaborn as sns
sns.set()

In [130]:
def verify_correctness(pred, test_labels, threshold=0):
    cnt=0
    for x, y in zip(pred, test_labels):
        x1= np.array(x)
        y1 = np.array(y)
        c=0
        for a, b in zip(x1, y1):
            if(a!=b): c+=1
        if c<=threshold:
            cnt+=1
    accuracy = cnt/len(test_labels)
    return accuracy


In [178]:
def precision(y_true, y_pred):
    val=set(y_true).intersection(y_pred)
    length= len(y_pred)
    if length==0:
        return 0
    else:
        return len(val)/length

def recall(y_true, y_pred):
    val= set(y_true).intersection(y_pred)
    return len(val)/len(y_true)

def f1(y_true, y_pred):
    p =precision(y_true, y_pred)
    r =recall(y_true, y_pred)
    if (p+r)==0:
        return 0
    else:
        return 2*(p*r)/(p+r)

def Evaluate_accuracy(pred, true_value, model_name):
    
    print("Accuracy score for strict match is ", verify_correctness(pred, true_value, 0)*100)
    print("Accuracy score for maximum of 1 mistmatch is ", verify_correctness(pred, true_value, 1)*100)
    print("Accuracy score for maximum of 2 mistmatch is ", verify_correctness(pred, true_value, 2)*100)

    print("Average Precision score ", average_precision_score(true_value, pred, average='macro'))

    f_ans = 0
    for p, tl in zip(pred, true_value):
        f_ans+=f1(tl, p)
    

In [179]:
print("Nearest Neighbor model: ")

nnb = NearestNeighbors(n_neighbors=1, metric='euclidean')
nnb.fit(train_hist, train_photo_labels)
#Saving the trained model
joblib.dump(nnb,'nnbr1.model')
#Loading from the trained model
nnb = joblib.load('nnbr1.model')
dist, predictions = nnb.kneighbors(test_hist)
pred1 = []

for each in predictions:
    pred1.append(train_photo_labels[each[0]])

Evaluate_accuracy(pred1, test_photo_labels, "nn")

print("Decision Tree model: ")

pred2 = []
dt = DecisionTreeClassifier(max_depth=5)
#Fitting the training data over the regression model
dt.fit(train_hist, train_photo_labels)
#Saving the trained model
joblib.dump(dt,'dtr1.model')
#Loading from the trained model
dtr1 = joblib.load('dtr1.model')
# Make predictions on the test set using the fit model.
pred2 = dtr1.predict(test_hist)

Evaluate_accuracy(pred2, test_photo_labels, "dtc")

print("Random Forest model: ")

pred3=[]
rfc = RandomForestClassifier(n_estimators = 100,min_samples_split=2,max_depth=10, n_jobs=-1)
rfc.fit(train_hist, train_photo_labels)
#Saving the trained model
joblib.dump(rfc,'rfc1.model')
#Loading from the trained model
rfc1 = joblib.load('rfc1.model')
# Make predictions on the test set using the fit model.
pred3 = rfc1.predict(test_hist)

Evaluate_accuracy(pred3, test_photo_labels, "rfc")

print("Gradient Boost model: ")

pred4=[]
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
multi = MultiOutputClassifier(gbc, n_jobs=-1)
multi.fit(train_hist, train_photo_labels)
#Saving the trained model
joblib.dump(multi,'gbc1.model')
#Loading from the trained model
gbc1 = joblib.load('gbc1.model')
# Make predictions on the test set using the fit model.
pred4 =gbc1.predict(test_hist)

Evaluate_accuracy(pred4, test_photo_labels, "gbc")

print("SGD Classifier model: ")

pred5=[]
sgd = SGDClassifier()
multi = MultiOutputClassifier(sgd, n_jobs=-1)
multi.fit(train_hist, train_photo_labels)
pred5 =multi.predict(test_hist)

Evaluate_accuracy(pred5, test_photo_labels, "sgd")

Nearest Neighbor model: 
Accuracy score for strict match is  36.47
Accuracy score for maximum of 1 mistmatch is  44.32
Accuracy score for maximum of 2 mistmatch is  54.620000000000005
Average Precision score  0.761746369515
F1 score: 0.22207777777774018
Decision Tree model: 
Accuracy score for strict match is  40.53
Accuracy score for maximum of 1 mistmatch is  47.699999999999996
Accuracy score for maximum of 2 mistmatch is  57.78
Average Precision score  0.779817148984
F1 score: 0.2221111111110735
Random Forest model: 
Accuracy score for strict match is  35.449999999999996
Accuracy score for maximum of 1 mistmatch is  43.4
Accuracy score for maximum of 2 mistmatch is  54.54
Average Precision score  0.759855796946
F1 score: 0.2220888888888513
Gradient Boost model: 
Accuracy score for strict match is  39.410000000000004
Accuracy score for maximum of 1 mistmatch is  46.56
Accuracy score for maximum of 2 mistmatch is  57.67
Average Precision score  0.793875850777
F1 score: 0.2221444444444