In [25]:
## packages
import collections
import numpy as np
import textblob as tb
from textblob import TextBlob as TB
import math
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from scipy.spatial import distance
stopwords = set(stopwords.words('english'))

import pprint as pp
import pandas as pd

[nltk_data] Downloading package stopwords to /Users/Shawn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [87]:
# Regressors

from sklearn.ensemble import RandomForestRegressor
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.linear_model import Lasso
# from sklearn.linear_model import ElasticNet
# from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

# from sklearn.ensemble import ExtraTreesRegressor
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.ensemble import AdaBoostRegressor

# from sklearn.tree import DecisionTreeRegressor

from sklearn.multioutput import MultiOutputRegressor

from sklearn.cross_decomposition import PLSRegression

### Splitting training and validation set

In [28]:
## random split to 8000:2000 (train:validate) -- currently disabled
# training_indice = np.random.choice(10000, size=8000, replace=False)
# validation_indice = [item for item in range(10000) if item not in training_indice]

## fixed split: as written, all 10000 training items are used for training and
## no validation subset is held out (the 8000:10000 validation range is disabled)
train_indice = range(10000)
# validate_indice = range(8000,10000)
# ids of the 2000 test items
test_indice = range(2000)

### Reform each tag file into a dictionary (key: category, value: tags array), and collect all these dictionaries

In [29]:
## translate all tag_file into a dictionary and collect them into an array
def tag_reform_collection(source,indice):
    """Parse one tag file per index into a ``{category: [tag, ...]}`` dict.

    Each file is read from ``source + str(i) + ".txt"`` and is expected to hold
    one ``category:tag`` pair per line.

    Args:
        source: Path prefix (including the trailing separator) of the tag files.
        indice: Iterable of integer file ids to read.

    Returns:
        A list with one plain dict per id, in the order of ``indice``. Tags keep
        the order in which they appear in the file.

    Raises:
        IndexError: if a non-blank line contains no ``:`` separator.
    """
    tags_records = []
    for i in indice:
        current_dict = collections.defaultdict(list)
        path = source + str(i) + ".txt"
        # The context manager closes the handle; thousands of files are opened here.
        with open(path, "r") as fo:
            for line in fo:
                line = line.strip()
                if not line:
                    # Blank (e.g. trailing) lines carry no tag.
                    continue
                fields = line.split(":")
                current_dict[fields[0]].append(fields[1])
        tags_records.append(dict(current_dict))
    return tags_records
        
# Parse the tag files of every training and test item (one dict per item).
train_tags_records = tag_reform_collection("data/tags_train/",train_indice)
# validate_tags_records = tag_reform_collection("data/tags_train/",validate_indice)
test_tags_records = tag_reform_collection("data/tags_test/",test_indice)

In [217]:
## output
# pp.pprint(train_tags_records[0:2])
# pp.pprint(validate_tags_records[0:2])
# pp.pprint(test_tags_records[0:2])

### Build a set of categories (12 categories) for training, validation, and testing set (should be same)

In [30]:
def cate_set(source,indice):
    """Collect the set of tag categories that occur in the given tag files.

    The category is the text before the first ``:`` on each line of
    ``source + str(i) + ".txt"``.

    Args:
        source: Path prefix (including the trailing separator) of the tag files.
        indice: Iterable of integer file ids to read.

    Returns:
        A set of category names.
    """
    categories = set()
    for i in indice:
        path = source + str(i) + ".txt"
        # The context manager closes the handle after each file.
        with open(path, "r") as fo:
            for line in fo:
                line = line.strip()
                if line:
                    # Blank lines are skipped so they do not add an empty category.
                    categories.add(line.split(":")[0])
    return categories

# Category names seen in the training and test tag files.
train_cate_set = cate_set("data/tags_train/",train_indice)
# validate_cate_set = cate_set("data/tags_train/",validate_indice)
test_cate_set = cate_set("data/tags_test/",test_indice)

In [31]:
## output
# pp.pprint(train_cate_set)
# pp.pprint(validate_cate_set)
# pp.pprint(test_cate_set)

# these category sets turn out to be exactly the same!
# NOTE: this rebinds the name `cate_set` from the function defined earlier to a
# set, so the cell that defines the function must be re-run before calling it again.
cate_set = train_cate_set
# pp.pprint(cate_set)

### Build category-tags dictionary in training, validation, and testing set (should be same)

In [32]:
def cate_tags_dictionary(indice, tags_records):
    """Map every category in the global ``cate_set`` to its sorted list of distinct tags.

    Only the first ``len(indice)`` entries of ``tags_records`` are inspected.

    Args:
        indice: Sized collection; only its length is used.
        tags_records: List of ``{category: [tag, ...]}`` dicts, one per item.

    Returns:
        Dict ``{category: sorted list of unique tags}``.
    """
    record_count = len(indice)
    tags_by_category = {}
    for category in cate_set:
        seen = set()
        for position in range(record_count):
            record = tags_records[position]
            if category in record:
                seen.update(record[category])
        tags_by_category[category] = sorted(seen)
    return tags_by_category

# Tag vocabulary per category, built separately from the training and test records.
train_tags_dic = cate_tags_dictionary(train_indice, train_tags_records)
# validate_tags_dic = cate_tags_dictionary(validate_indice, validate_tags_records)
test_tags_dic = cate_tags_dictionary(test_indice, test_tags_records)

In [33]:
## train_tags_dic, validate_tags_dic, and test_tags_dic are the same!
# The training vocabulary is used as the shared vocabulary (read as a global by build_labels).
tags_dic = train_tags_dic
# tags_dic

### *Building labels for each category!

In [34]:
def build_labels(indice,tags_records):
    """Build 0/1 label vectors per category for the first ``len(indice)`` records.

    For each category of the global ``tags_dic``, every record gets a vector
    with one slot per tag of that category (in ``tags_dic`` order); a slot is 1
    when the record lists that tag under the category, else 0.

    Args:
        indice: Sized collection; only its length is used.
        tags_records: List of ``{category: [tag, ...]}`` dicts.

    Returns:
        Dict ``{category: list of label vectors}``, one vector per record.
    """
    image_labels = {category: [] for category in tags_dic}
    for position in range(len(indice)):
        for category, vocabulary in tags_dic.items():
            record = tags_records[position]
            # A record without this category yields an all-zero vector.
            present = record[category] if category in record else ()
            image_labels[category].append([1 if tag in present else 0 for tag in vocabulary])
    return image_labels
            
# Label vectors per category for the training and test items.
train_image_labels = build_labels(train_indice,train_tags_records)
# validate_image_labels = build_labels(validate_indice,validate_tags_records)
test_image_labels = build_labels(test_indice,test_tags_records)

In [35]:
## All label-matrix for all tag-categories (ready for train)
# train_image_labels

### *Extracting features!

#### 1) Build feature space (and count the word frequency)

In [36]:
%%time
# Count how often each singularized noun occurs across all training
# descriptions. Only noun POS tags are counted (adjectives are not), and
# stop words are skipped.
total_count = collections.defaultdict(int)

for i in train_indice:
    path = "data/descriptions_train/" + str(i) + ".txt"
    # The context manager closes each description file as soon as it is read.
    with open(path, "r") as fo:
        for line in fo:
            ret = TB(line.strip()).lower()
            for word, pos in ret.tags:
                if pos in ('NN', 'NNP', 'NNS', 'NNPS') and word not in stopwords:
                    total_count[tb.blob.Word(word).singularize()] += 1

CPU times: user 1min 6s, sys: 3.47 s, total: 1min 9s
Wall time: 1min 21s


###### reduce the feature set

In [37]:
# final feature set: keep only single-token nouns that were counted at least 3 times
reduced_nouns_dic = dict((k, v) for k, v in total_count.items() if (len(k.split(" "))==1 and v>=3))

#### 2) count the nouns for each description

In [38]:
%%time
## -> noun : count
def generate_nouns(file_path, indice):
    """Compute log-scaled noun counts for every description file.

    For each id ``i`` the file ``file_path + str(i) + ".txt"`` is read line by
    line. Nouns (POS NN/NNP/NNS/NNPS) that are not stop words are singularized
    and counted; nouns found inside TextBlob noun phrases are counted again
    (without the stop-word check). The stored weight of a noun is
    ``log2(count + 1)``.

    Note: a later cell of this notebook defines another function with the same
    name, which replaces this one once that cell has been executed.

    Args:
        file_path: Path prefix (including the trailing separator).
        indice: Iterable of integer file ids.

    Returns:
        A list with one ``{noun: weight}`` dict per id, in the order of ``indice``.
    """
    noun_tags = ('NN', 'NNP', 'NNS', 'NNPS')
    image_nouns = []
    for i in indice:
        current_count = collections.defaultdict(int)
        current_nouns = collections.defaultdict(float)
        path = file_path + str(i) + ".txt"
        # The context manager closes the handle after each file.
        with open(path, "r") as fo:
            for line in fo:
                ret = TB(line.strip()).lower()

                # Collect the sentence's nouns first, then count them.
                sentence_nouns = [
                    tb.blob.Word(word).singularize()
                    for word, pos in ret.tags
                    if pos in noun_tags and word not in stopwords
                ]
                for noun in sentence_nouns:
                    current_count[noun] += 1
                    current_nouns[noun] = math.log2(current_count[noun] + 1)

                # Nouns that are part of a noun phrase get an extra count.
                for word_phrase in ret.noun_phrases:
                    ret_phrase = TB(word_phrase).lower()
                    for word, pos in ret_phrase.tags:
                        if pos in noun_tags:
                            temp_noun = tb.blob.Word(word).singularize()
                            current_count[temp_noun] += 1
                            current_nouns[temp_noun] = math.log2(current_count[temp_noun] + 1)

        image_nouns.append(dict(current_nouns))
    return image_nouns

# Per-item noun weights for the training and test descriptions.
train_image_nouns = generate_nouns("data/descriptions_train/",train_indice)
# validate_image_nouns = generate_nouns("data/descriptions_train/",validate_indice)
test_image_nouns = generate_nouns("data/descriptions_test/",test_indice)


CPU times: user 1min 55s, sys: 3.19 s, total: 1min 58s
Wall time: 2min 1s


#### 3) prepare inputs

In [39]:
## generate training data for each image (feature)
# Column order of the feature vectors: the keys of the reduced noun dictionary.
# This is a live dict view, read as a global by generate_input_features.
feature_name_list = reduced_nouns_dic.keys()
def generate_input_features(indice, image_nouns):
    """Turn per-item ``{word: weight}`` dicts into dense feature rows.

    Each row has one entry per name in the global ``feature_name_list`` (in its
    iteration order): the item's weight for that name, or 0 when absent.

    Args:
        indice: Sized collection; only its length is used.
        image_nouns: List of ``{word: weight}`` dicts, indexed by position.

    Returns:
        List of ``len(indice)`` rows (lists).
    """
    return [
        [image_nouns[row][name] if name in image_nouns[row] else 0 for name in feature_name_list]
        for row in range(len(indice))
    ]


# Dense feature rows for the training and test items.
train_image_nouns_features=generate_input_features(train_indice, train_image_nouns)
# validate_image_nouns_features=generate_input_features(validate_indice, validate_image_nouns)
test_image_nouns_features=generate_input_features(test_indice, test_image_nouns)


In [40]:
# train_image_nouns_features
# Sanity check: number of training rows, then number of feature columns.
print (len(train_image_nouns_features))
print (len(train_image_nouns_features[0]))
# print (len(test_image_nouns_features))
# print (len(test_image_nouns_features[0]))
# np.matrix(train_image_nouns_features).shape
# np.matrix(train_image_labels['accessory']).shape

10000
2423


### Train models!

In [41]:
%%time

def train_RF():
    """Fit one random-forest regressor per tag category.

    Reads the globals ``cate_set``, ``train_image_nouns_features`` and
    ``train_image_labels``. The features are the same for every category; the
    targets are that category's label vectors.

    Returns:
        List of fitted regressors, in the iteration order of ``cate_set``
        (``concatenate_outputs`` indexes the list in that same order).
    """
    # Built once: the feature matrix does not depend on the category. A plain
    # ndarray is used because recent scikit-learn releases reject np.matrix.
    features = np.asarray(train_image_nouns_features)
    regressor_list = []
    for cate in cate_set:
        print(cate)
        labels = np.asarray(train_image_labels[cate])

        regressor = RandomForestRegressor(n_estimators=5, max_depth=10, random_state=0)
        regressor.fit(features, labels)
        regressor_list.append(regressor)
    return regressor_list
        
# Fit the per-category random forests; the list order follows cate_set iteration order.
regressor_list = train_RF()

outdoor
sports
food
appliance
furniture
indoor
person


  # This is added back by InteractiveShellApp.init_path()


accessory
vehicle
kitchen
animal
electronic
CPU times: user 2min 12s, sys: 1.57 s, total: 2min 14s
Wall time: 2min 14s


In [188]:
def train_PLSR():
    """Fit one PLS regression model per tag category.

    Reads the globals ``cate_set``, ``train_image_nouns_features`` and
    ``train_image_labels``.

    Returns:
        List of fitted ``PLSRegression`` models, in the iteration order of
        ``cate_set``.
    """
    # Built once: the feature matrix does not depend on the category. A plain
    # ndarray is used because recent scikit-learn releases reject np.matrix.
    features = np.asarray(train_image_nouns_features)
    regressor_list = []
    for cate in cate_set:
        print(cate)
        labels = np.asarray(train_image_labels[cate])

        regressor = PLSRegression(n_components=200, scale=True, max_iter=500, tol=1e-06, copy=True)
        regressor.fit(features, labels)
        regressor_list.append(regressor)
    return regressor_list

# regressor_list = train_PLSR()

food
animal
electronic
accessory
furniture
person
vehicle
kitchen
sports
indoor
appliance
outdoor




In [204]:
def train_MOR(estimator=None):
    """Fit one ``MultiOutputRegressor`` per tag category.

    Reads the globals ``cate_set``, ``train_image_nouns_features`` and
    ``train_image_labels``.

    Args:
        estimator: Base estimator wrapped by ``MultiOutputRegressor``. The
            previous version read an undefined name ``estimator`` and raised
            NameError; it is now a parameter. When omitted, ``Ridge()`` is
            used -- NOTE(review): this default is an assumption, confirm the
            intended base model.

    Returns:
        List of fitted ``MultiOutputRegressor`` models, in the iteration order
        of ``cate_set``.
    """
    if estimator is None:
        estimator = Ridge()
    # Built once: the feature matrix does not depend on the category. A plain
    # ndarray is used because recent scikit-learn releases reject np.matrix.
    features = np.asarray(train_image_nouns_features)
    regressor_list = []
    for cate in cate_set:
        print(cate)
        labels = np.asarray(train_image_labels[cate])

        regressor = MultiOutputRegressor(estimator, n_jobs=None)
        regressor.fit(features, labels)
        regressor_list.append(regressor)
    return regressor_list

# regressor_list = train_PLSR()

food
animal
electronic
accessory
furniture
person
vehicle
kitchen
sports
indoor
appliance
outdoor




In [42]:
def concatenate_labels(image_labels):
    """Stack the per-category label matrices side by side.

    Categories are visited in the iteration order of the global ``cate_set``.

    Args:
        image_labels: Dict ``{category: list of label vectors}``.

    Returns:
        An ``np.matrix`` with one row per item and the category blocks joined
        column-wise, or ``None`` when ``cate_set`` is empty.
    """
    final_labels = None
    for cate in cate_set:
        block = np.matrix(image_labels[cate])
        # The first category seeds the result; later ones are appended on the right.
        final_labels = block if final_labels is None else np.hstack((final_labels, block))
    return final_labels

# One wide label matrix per split (all categories joined column-wise).
train_final_labels=concatenate_labels(train_image_labels)
# validate_final_labels=concatenate_labels(validate_image_labels)
test_final_labels=concatenate_labels(test_image_labels)

In [44]:
# Shapes of the combined label matrices.
print(train_final_labels.shape)
# print(validate_final_labels.shape)
print(test_final_labels.shape)

(10000, 80)
(2000, 80)


In [45]:
def concatenate_outputs(indice, image_nouns_features, image_labels):
    """Predict every category's tag vector and join the predictions column-wise.

    Uses the globals ``cate_set`` and ``regressor_list``; the model at position
    ``k`` of ``regressor_list`` is applied for the ``k``-th category yielded by
    ``cate_set``.

    Args:
        indice: Sized collection; its length is the number of rows predicted.
        image_nouns_features: Feature rows, one per item.
        image_labels: Unused; kept so existing calls keep working.

    Returns:
        Array of shape ``(len(indice), total number of predicted columns)``, or
        ``None`` when ``cate_set`` is empty.
    """
    # Built once: the features are identical for every category. A plain
    # ndarray is used because recent scikit-learn releases reject np.matrix.
    features = np.asarray(image_nouns_features)
    prediction = None
    for index, cate in enumerate(cate_set):
        # reshape keeps a 2-D block even if a model returns a flat vector.
        block = regressor_list[index].predict(features).reshape(len(indice), -1)
        prediction = block if prediction is None else np.hstack((prediction, block))
    return prediction
            
# Predicted tag vectors for the training and test items.
train_predict = concatenate_outputs(train_indice ,train_image_nouns_features, train_image_labels)
# validate_predict = concatenate_outputs(validate_indice, validate_image_nouns_features, validate_image_labels)
test_predict = concatenate_outputs(test_indice, test_image_nouns_features, test_image_labels)

In [46]:
# print(train_predict.shape)
# print(validate_predict.shape)
# print(test_predict.shape)

In [57]:
def ordering_node(node, nodes):
    """Rank all rows of ``nodes`` by distance for every row of ``node``.

    Args:
        node: 2-D array of query vectors.
        nodes: 2-D array of candidate vectors.

    Returns:
        Integer array of shape ``(len(node), len(nodes))``; each row lists the
        candidate indices from nearest to farthest (``cdist`` default metric).
    """
    pairwise = distance.cdist(node, nodes)
    return np.argsort(pairwise)

In [47]:
def closest_node(node, nodes):
    """Return, for every row of ``node``, the indices of its nearest rows in ``nodes``.

    The number of neighbours kept is ``int(len(node)/100)``; that number is
    also printed.

    Args:
        node: 2-D array of query vectors.
        nodes: 2-D array of candidate vectors.

    Returns:
        Integer array of shape ``(len(node), int(len(node)/100))``, nearest first.
    """
    top_k = int(len(node)/100)
    print(top_k)
    ranked = np.argsort(distance.cdist(node, nodes))
    return ranked[:, :top_k]

In [50]:
# Queries are the predicted tag vectors; candidates are the ground-truth label matrices.
train_node = train_predict
train_nodes = train_final_labels

# validate_node = validate_predict
# validate_nodes = validate_final_labels

test_node = test_predict
test_nodes = test_final_labels

In [None]:
# train_match = closest_node(train_node, train_nodes)
# validate_match = closest_node(validate_node, validate_nodes)

# print (train_match)
# print (validate_match)

In [259]:
# train_match_index=[]
# train_score = 0
# for i in range(len(train_indice)):
#     if i in train_match[i]:
#         total=(len(train_indice)/100.0)
#         train_score += (total-list(train_match[i]).index(i))/total
#         train_match_index.append(i)

In [281]:
# validate_match_index=[]
# validate_score = 0

# print (validate_match.shape)

# for i in range(0, len(validate_indice)):
#     if i in validate_match[i]:
#         total=(len(validate_indice)/100.0)
#         validate_score += (total-list(validate_match[i]).index(i))/total
# #         print(list(validate_match[i]).index(i))

# print(validate_score/2000)


(2000, 20)
0.4143250000000001


In [242]:
# print(validate_match[5])
# count = 0
# score = 0
# for i in range(2000):
#     if i in validate_match[i]:
#         count = count + 1
#         score = score + (20  - list(validate_match[i]).index(i)) / 20.0
# print(count)
# print(score/2000)
# print(validate_match.shape)

[1999  724 1633  549 1775 1070 1494  763 1758 1940 1665   91  874  513
 1248  384  183 1890 1746 1257]
1207
0.4143250000000001
(2000, 20)


In [243]:
# print (train_score/len(train_indice))

# print(validate_score/len(validate_indice))

0.8009203125000103
0.4143250000000001


In [248]:
# print(validate_match[1270])
# validate_match[2]

array([   2, 1255,  716, 1904, 1661, 1718, 1663, 1346, 1501, 1898,  974,
       1897,  785,  423,  186,  313, 1931, 1750, 1951, 1567])

In [60]:
# Full candidate ranking for the training queries, top matches for the test queries.
train_match_rank_tag = ordering_node(train_node, train_nodes)
test_match = closest_node(test_node, test_nodes)
# Display the matches of the first test query.
test_match[0]

20


array([1231,   36,  942, 1131,  292,  359, 1743, 1816, 1698, 1862, 1117,
        949,  255, 1092,  760, 1258, 1105, 1480,   50,  594])

In [52]:
# Build one output row per test description: "<id>.txt" plus the matched
# image file names, each followed by a single space. The number of rows and of
# matches per row comes from test_match instead of hard-coded 2000 / 20.
result = []
for description_id, matches in enumerate(test_match):
    image_ids = ''.join(str(image_id) + '.jpg ' for image_id in matches)
    result.append([str(description_id) + '.txt', image_ids])

In [55]:
# Write the rows to new.csv with a header line and no comment prefix.
# NOTE(review): the header spelling 'Descritpion_ID' is kept as-is; presumably it
# must match the expected submission format -- confirm before changing it.
np.savetxt('new.csv', result, delimiter = ',', fmt='%s', comments='', header='Descritpion_ID,Top_20_Image_IDs')

In [None]:
# train_match_rank_tag !!!!!!

In [77]:
# Display the ranking matrix computed by ordering_node for the training queries.
train_match_rank_tag

array([[7644, 8885, 4847, ..., 7662, 5553, 9844],
       [   1, 5749, 6345, ..., 9138, 4821, 8890],
       [6235, 4735, 2181, ..., 2097, 9844, 5553],
       ...,
       [4787, 2093, 3735, ..., 9844, 8890, 5411],
       [9109, 4566, 7301, ..., 3519, 8890, 5411],
       [9999, 8290, 3103, ..., 9844, 5411, 5553]])

In [None]:
##################  tag  #################

In [None]:
##################  pool5  #################

In [78]:
def reorder_pd(label, indice):
    """Replace image paths in column 0 by their numeric id and sort by that id.

    Column 0 of the first ``indice`` rows is expected to hold a path such as
    ``"some_dir/123.jpg"``; the text between the last ``/`` and the first
    following ``.`` is converted to ``int``. Ids of any length are supported
    (the previous hand-written scan only handled 1 to 4 digits) and the input
    frame is no longer modified in place.

    Args:
        label: DataFrame whose column labelled ``0`` holds the image paths.
        indice: Number of leading rows to convert.

    Returns:
        A new DataFrame sorted by column ``0``.

    Raises:
        ValueError: if the extracted file stem is not an integer.
        IndexError: if ``indice`` exceeds the number of rows.
    """
    reordered = label.copy()
    ids = reordered[0].tolist()
    for i in range(indice):
        file_name = str(ids[i]).rsplit("/", 1)[-1]
        ids[i] = int(file_name.split(".", 1)[0])
    # Whole-column assignment replaces DataFrame.set_value, which pandas removed.
    reordered[0] = ids
    return reordered.sort_values(by=[0])

In [79]:
# sort train_fc1000
# header=None: the CSV has no header row (negative header values are rejected by pandas).
train_fc1 = pd.read_csv("data/features_train/features_resnet1000intermediate_train.csv", header=None)
train_sorted_fc1 = reorder_pd(train_fc1, 10000)
# Drop the id column and keep the numeric features as an ndarray
# (to_numpy replaces the removed DataFrame.as_matrix).
train_sorted_fc1000 = train_sorted_fc1.drop(columns=0).to_numpy()

  after removing the cwd from sys.path.


In [80]:
# sort test_fc1000
# header=None: the CSV has no header row (negative header values are rejected by pandas).
test_fc1 = pd.read_csv("data/features_test/features_resnet1000intermediate_test.csv", header=None)
test_sorted_fc1 = reorder_pd(test_fc1, 2000)
# Drop the id column and keep the numeric features as an ndarray
# (to_numpy replaces the removed DataFrame.as_matrix).
test_sorted_fc1000 = test_sorted_fc1.drop(columns=0).to_numpy()

  after removing the cwd from sys.path.


In [81]:
# training, size of 10000
def generate_word_space():
    """Count stemmed content words over all training descriptions.

    Reads ``data/descriptions_train/<i>.txt`` for every ``i`` in the global
    ``train_indice``. A word is counted when its POS tag is a noun, adjective
    or verb tag and it is not a stop word; the counted key is the word's stem.

    Returns:
        ``collections.defaultdict(int)`` mapping stem to number of occurrences.
    """
    content_tags = ['NN','NNP','NNS','NNPS','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP','VBZ']
    total_count = collections.defaultdict(int)
    for i in train_indice:
        path = "data/descriptions_train/" + str(i) + ".txt"
        # The context manager closes the handle after each file.
        with open(path, "r") as fo:
            for line in fo:
                ret = TB(line.strip()).lower()
                for word, pos in ret.tags:
                    if pos in content_tags and word not in stopwords:
                        total_count[tb.blob.Word(word).stem()] += 1
    return total_count

## -> noun : count
def generate_nouns(file_path, indice):
    """Build a binary bag of stemmed content words for every description file.

    For each id ``i`` the file ``file_path + str(i) + ".txt"`` is read. A word
    is kept when its POS tag is a noun, adjective or verb tag and it is not a
    stop word; its stem is stored with value 1 (presence only, no counts).

    The previous version tested the tag list itself instead of ``pos in ...``,
    which is always true, so words of every part of speech were kept. The
    filter now matches the one used by ``generate_word_space``.

    Note: this definition replaces the earlier ``generate_nouns`` of this
    notebook once its cell has been executed.

    Args:
        file_path: Path prefix (including the trailing separator).
        indice: Iterable of integer file ids.

    Returns:
        A list with one ``{stem: 1}`` dict per id, in the order of ``indice``.
    """
    content_tags = ['NN','NNP','NNS','NNPS','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP','VBZ']
    image_nouns = []
    for i in indice:
        current_nouns = collections.defaultdict(float)
        path = file_path + str(i) + ".txt"
        # The context manager closes the handle after each file.
        with open(path, "r") as fo:
            for line in fo:
                ret = TB(line.strip()).lower()
                for word, pos in ret.tags:
                    if pos in content_tags and word not in stopwords:
                        current_nouns[tb.blob.Word(word).stem()] = 1

        image_nouns.append(dict(current_nouns))
    return image_nouns

In [82]:
%%time
# Recompute the per-item word dicts with the generate_nouns defined in the cell above;
# this overwrites the train_image_nouns / test_image_nouns built earlier in the notebook.
train_image_nouns = generate_nouns("data/descriptions_train/",train_indice)
test_image_nouns = generate_nouns("data/descriptions_test/",test_indice)

# Word space: stems counted at least 3 times in the training descriptions.
total_count = generate_word_space()
reduced_word_space = dict((k, v) for k, v in total_count.items() if v>=3)
# Rebinds the global read by generate_input_features (new column order).
feature_name_list = reduced_word_space.keys()

CPU times: user 2min 15s, sys: 6.87 s, total: 2min 22s
Wall time: 2min 46s


In [83]:
# Number of entries in the reduced word space.
len(reduced_word_space)

3138

In [84]:
## 
def generate_input_features(indice, image_nouns):
    """Expand per-item ``{word: weight}`` dicts into dense rows.

    Columns follow the iteration order of the global ``feature_name_list``; a
    column holds the item's weight for that name, or 0 when the item lacks it.

    Args:
        indice: Sized collection; only its length is used.
        image_nouns: List of ``{word: weight}`` dicts, indexed by position.

    Returns:
        List of ``len(indice)`` rows (lists).
    """
    image_nouns_features = []
    for row in range(len(indice)):
        vector = []
        for name in feature_name_list:
            vector.append(image_nouns[row][name] if name in image_nouns[row] else 0)
        image_nouns_features.append(vector)
    return image_nouns_features

In [85]:
# Dense word-space rows for both splits (overwrites the earlier noun-based features).
train_image_nouns_features=generate_input_features(train_indice, train_image_nouns)
test_image_nouns_features=generate_input_features(test_indice, test_image_nouns)

In [88]:
# train model

# Ridge regression from the sorted image features to the bag-of-words rows.
# The target is passed as a plain ndarray because recent scikit-learn
# releases reject np.matrix inputs.
ridge = Ridge(alpha=115)
ridge.fit(train_sorted_fc1000, np.asarray(train_image_nouns_features))
train_prediction = ridge.predict(train_sorted_fc1000)
test_prediction = ridge.predict(test_sorted_fc1000)

In [89]:
def closest_node(node, nodes):
    """Pick the nearest rows of ``nodes`` for each row of ``node``.

    Keeps ``int(len(node)/100)`` neighbours per query and prints that number.

    Args:
        node: 2-D array of query vectors.
        nodes: 2-D array of candidate vectors.

    Returns:
        Integer array of shape ``(len(node), int(len(node)/100))``, nearest first.
    """
    keep = int(len(node)/100)
    print(keep)
    order = np.argsort(distance.cdist(node, nodes))
    return order[:, :keep]

In [90]:
def ordering_node_Pool5(node, nodes):
    """Rank all rows of ``nodes`` by distance for every row of ``node``.

    Prints ``int(len(node)/100)``; the printed number does not limit the result.

    Args:
        node: 2-D array of query vectors.
        nodes: 2-D array of candidate vectors.

    Returns:
        Integer array of shape ``(len(node), len(nodes))``, nearest first.
    """
    print(int(len(node)/100))
    pairwise = distance.cdist(node, nodes)
    return np.argsort(pairwise)

In [91]:
%%time
# train_match = closest_node(np.matrix(train_image_nouns_features), train_prediction)

# Queries are the description word vectors; candidates are the word vectors
# predicted by the ridge model from the image features.
test_match = closest_node(np.matrix(test_image_nouns_features), test_prediction)
train_match_rank_pool5 = ordering_node_Pool5(np.matrix(train_image_nouns_features), train_prediction)

20
100
CPU times: user 5min 59s, sys: 4.83 s, total: 6min 4s
Wall time: 6min 11s


In [None]:
# output to file

# One row per test description: "<id>.txt" plus the matched image file names,
# each followed by a single space. Row and match counts come from test_match
# instead of hard-coded 2000 / 20.
result = []
for description_id, matches in enumerate(test_match):
    image_ids = ''.join(str(image_id) + '.jpg ' for image_id in matches)
    result.append([str(description_id) + '.txt', image_ids])

# Overwrites new.csv if an earlier cell already wrote it. The header string is
# reproduced exactly as before.
np.savetxt('new.csv', result, delimiter = ',', fmt='%s', comments='', header='Descritpion_ID,Top_20_Image_IDs')
# list(map(lambda x: validate_indice[x],validate_match[0]))
# list(map(lambda x: validate_indice[x],validate_match[0]))