In [311]:
## packages
import collections
import numpy as np
import textblob as tb
from textblob import TextBlob as TB
import math
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from scipy.spatial import distance
stopwords = set(stopwords.words('english'))

import pprint as pp
import pandas as pd

from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/megaandy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [198]:
# Regressors
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge

In [49]:
##################  description - tag (start)  #################

In [199]:
## random split to 8000:2000 (train:validate)
training_indice = np.random.choice(10000, size=8000, replace=False)
validation_indice = [item for item in range(10000) if item not in training_indice]

## fix split to 8000:2000 (train:validate)
train_indice = range(10000)
# validate_indice = range(8000,10000)
test_indice = range(2000)

In [284]:
## translate all tag_file into a dictionary and collect them into an array
def tag_reform_collection(source,indice):
    tags_records = []
    for i in indice:
        current_dict = collections.defaultdict(list)
        path = source + str(i) +".txt"
        fo = open(path, "r")
        for line in fo.readlines():
            line = line.strip()
            current=line.split(":")[0]
            current_dict[current].append(line.split(":")[1])
        tags_records.append(dict(current_dict))
    return tags_records
        
train_tags_records = tag_reform_collection("data/tags_train/",train_indice)
# validate_tags_records = tag_reform_collection("data/tags_train/",validate_indice)
test_tags_records = tag_reform_collection("data/tags_test/",test_indice)

In [285]:
def cate_set(source,indice):
    cate_set = set([])
    for i in indice:
        path = source + str(i) +".txt"
        fo = open(path, "r")
        for line in fo.readlines():
            line = line.strip()
            cate_set.add(line.split(":")[0])
    return cate_set

train_cate_set = cate_set("data/tags_train/",train_indice)
# validate_cate_set = cate_set("data/tags_train/",validate_indice)
test_cate_set = cate_set("data/tags_test/",test_indice)
cate_set = train_cate_set

In [286]:
def cate_tags_dictionary(indice, tags_records):
    tags_dic = {}
    for cate in cate_set:
        tags=set()
        for i in range(len(indice)):
            if cate in tags_records[i]:
                tags=tags.union(tags_records[i][cate])
        tags=list(tags)
        tags.sort()
        tags_dic[cate] = tags
    return tags_dic

train_tags_dic = cate_tags_dictionary(train_indice, train_tags_records)
# validate_tags_dic = cate_tags_dictionary(validate_indice, validate_tags_records)
test_tags_dic = cate_tags_dictionary(test_indice, test_tags_records)

In [287]:
## train_tags_dic, validate_tags_dic, and test_tags_dic are the same!
tags_dic = train_tags_dic
# tags_dic

In [288]:
def build_labels(indice,tags_records):
    image_labels = dict([(key, []) for key in tags_dic])
    for i in range(len(indice)):
        for k in tags_dic.keys():
            label=[0 for d in range(len(tags_dic[k]))]
            if k in tags_records[i]:
                for index,elt in enumerate(tags_dic[k]):
                    if elt in tags_records[i][k]:
                        label[index]=1
            image_labels[k].append(label)
    return image_labels
            
train_image_labels = build_labels(train_indice,train_tags_records)
# validate_image_labels = build_labels(validate_indice,validate_tags_records)
test_image_labels = build_labels(test_indice,test_tags_records)

In [289]:
%%time
total_count = collections.defaultdict(int)

for i in train_indice:
    path = "data/descriptions_train/" + str(i) + ".txt"
    fo = open(path, "r")
    for line in fo.readlines():
        line = line.strip()
        ret = TB(line).lower()
        
        sentence_nouns=[]
        sentence_adjs=[]
        for word,pos in ret.tags:
            if (pos == 'NN' or pos == 'NNS') and word not in stopwords:
                sentence_nouns.append(tb.blob.Word(word).stem())
        
        for index in range(len(sentence_nouns)):
            total_count[sentence_nouns[index]] += 1

CPU times: user 31.1 s, sys: 737 ms, total: 31.9 s
Wall time: 31.9 s


In [290]:
# final feature set
reduced_nouns_dic = dict((k, v) for k, v in total_count.items() if v>=2)

In [291]:
len(reduced_nouns_dic)

2982

In [292]:
%%time
## -> noun : count
def generate_nouns(file_path, indice):
    image_nouns = []
    for i in indice:
        current_count = collections.defaultdict(int)
        current_nouns = collections.defaultdict(float)
        path = file_path + str(i) + ".txt"
        fo = open(path, "r")
        for line in fo.readlines():
            line = line.strip()
            ret = TB(line).lower()
            
            sentence_nouns=[]
            for word,pos in ret.tags:
                if (pos == 'NN' or pos == 'NNS') and word not in stopwords:
                    sentence_nouns.append(tb.blob.Word(word).stem())

            for index in range(len(sentence_nouns)):
                current_count[sentence_nouns[index]] = 1
                current_nouns[sentence_nouns[index]] = 1

        image_nouns.append(dict(current_nouns))
    return image_nouns

train_image_nouns = generate_nouns("data/descriptions_train/",train_indice)
# validate_image_nouns = generate_nouns("data/descriptions_train/",validate_indice)
test_image_nouns = generate_nouns("data/descriptions_test/",test_indice)



CPU times: user 35.7 s, sys: 834 ms, total: 36.5 s
Wall time: 36.5 s


In [293]:
## generate training data for each image (feature)
feature_name_list = reduced_nouns_dic.keys()
def generate_input_features(indice, image_nouns):
    image_nouns_features=[]
    for i in range(len(indice)):
        label=[0 for d in range(len(feature_name_list))]
        for index,k in enumerate(feature_name_list):
            if k in image_nouns[i]:
                label[index]=image_nouns[i][k]
        image_nouns_features.append(label)
    return image_nouns_features


train_image_nouns_features=generate_input_features(train_indice, train_image_nouns)
# validate_image_nouns_features=generate_input_features(validate_indice, validate_image_nouns)
test_image_nouns_features=generate_input_features(test_indice, test_image_nouns)



In [294]:
np.matrix(train_image_nouns_features).shape

(10000, 2982)

In [227]:
%%time

def train_RF():
    regressor_list=[]
    for cate in cate_set:
        print(cate)
        features = np.matrix(train_image_nouns_features)
        labels = np.matrix(train_image_labels[cate])

        regressor = RandomForestRegressor(n_estimators = 10)

        regressor.fit(features, labels)
        regressor_list.append(regressor)
    return regressor_list
        
regressor_list = train_RF()

vehicle
furniture
outdoor
indoor
food
accessory
kitchen
electronic
sports
appliance
person


  if sys.path[0] == '':


animal
CPU times: user 55min 28s, sys: 3.39 s, total: 55min 32s
Wall time: 55min 33s


In [283]:
regressor_list_Tag=regressor_list

In [295]:
def concatenate_labels(image_labels):
    final_labels=None
    for index,cate in enumerate(cate_set):
        temp_matrix = np.matrix(image_labels[cate])
        temp_matrix = temp_matrix
        if index == 0:
            final_labels=temp_matrix
        else:
            final_labels = np.hstack((final_labels,temp_matrix))
    return final_labels

train_final_labels=concatenate_labels(train_image_labels)
# validate_final_labels=concatenate_labels(validate_image_labels)
test_final_labels=concatenate_labels(test_image_labels)

In [296]:
def concatenate_outputs(indice, image_nouns_features, image_labels):
    prediction = None
    for index,cate in enumerate(cate_set):
        features = np.matrix(image_nouns_features)
        labels = np.matrix(image_labels[cate])

        temp_matrix = regressor_list[index].predict(features).reshape(len(indice),-1)
        temp_matrix = temp_matrix
        if index == 0:
            prediction = temp_matrix
        else:
            prediction = np.hstack((prediction,temp_matrix))
    return prediction
            
train_predict = concatenate_outputs(train_indice ,train_image_nouns_features, train_image_labels)
# validate_predict = concatenate_outputs(validate_indice, validate_image_nouns_features, validate_image_labels)
test_predict = concatenate_outputs(test_indice, test_image_nouns_features, test_image_labels)

In [298]:
# def closest_node(node, nodes):
#     print(int(len(node)/100))
#     closest_indice = np.argsort(distance.cdist(node, nodes))[:,:int(len(node)/100)]
#     return closest_indice

In [297]:
# def ordering_node(node, nodes):
#     ordered_indice = np.argsort(distance.cdist(node, nodes))#
#     return ordered_indice

In [299]:
def closest_node_cosine(node, nodes):
    closest_indice = np.argsort(-cosine_similarity(node, nodes))[:,:int(len(node)/100)]
    return closest_indice

In [316]:
def ordering_node_cosine(node, nodes):
    ordered_indice = np.argsort(-cosine_similarity(node, nodes))
    return ordered_indice

In [301]:
train_node = train_predict
train_nodes = train_final_labels

# validate_node = validate_predict
# validate_nodes = validate_final_labels

test_node = test_predict
test_nodes = test_final_labels

In [317]:
train_match_cosine = ordering_node_cosine(train_node, train_nodes)
test_match_cosine = closest_node_cosine(test_node, test_nodes)

In [305]:
result = []
for i in range(2000):
    a = [str(i) + '.txt']
    b = ''
    for j in range(20):
        b += str(test_match_cosine[i,j]) + '.jpg '
    a.append(b)
    result.append(a)

In [306]:
np.savetxt('new.csv', result, delimiter = ',', fmt='%s', comments='', header='Descritpion_ID,Top_20_Image_IDs')

In [None]:
##################  description - tag (end)  #################

In [None]:
##################  pool5 - description (start) #################

In [321]:
def reorder_pd(label, indice):
    new_label=label
    for i in range(indice):
        img=new_label.iloc[i][0]
        # find t, image #
        t=''
        for j in range(len(img)):
            if img[j]=='/':
                if img[j+4]=='.':
                    t+=img[j+1:j+4]
                elif img[j+3]=='.':
                    t+=img[j+1:j+3]
                elif img[j+5]=='.':
                    t+=img[j+1:j+5]
                elif img[j+2]=='.':
                    t+=img[j+1:j+2]
                else:
                    t+=img[j+1]
                
        new_label.set_value(i,0,int(t))
    return new_label.sort_values(by=[0])

In [322]:
# sort train_fc1000
train_fc1 = pd.DataFrame(pd.read_csv("data/features_train/features_resnet1000intermediate_train.csv", header=-1))
train_sorted_fc1=reorder_pd(train_fc1, 10000)
train_sorted_fc1000 = train_sorted_fc1.drop(0, 1).as_matrix()

  after removing the cwd from sys.path.


In [323]:
# sort test_fc1000
test_fc1 = pd.DataFrame(pd.read_csv("data/features_test/features_resnet1000intermediate_test.csv", header=-1))
test_sorted_fc1=reorder_pd(test_fc1, 2000)
test_sorted_fc1000 = test_sorted_fc1.drop(0, 1).as_matrix()

  after removing the cwd from sys.path.


In [324]:
# training, size of 10000
def generate_word_space():
    total_count = collections.defaultdict(int)
    for i in train_indice:
        path = "data/descriptions_train/" + str(i) + ".txt"
        fo = open(path, "r")
        for line in fo.readlines():
            line = line.strip()
            ret = TB(line).lower()

            sentence_words=[]
            for word,pos in ret.tags:
                if (pos in ['NN','NNS','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP','VBZ']) and word not in stopwords:
                    sentence_words.append(tb.blob.Word(word).stem())

            for index in range(len(sentence_words)):
                total_count[sentence_words[index]] += 1
    return total_count

## -> noun : count
def generate_nouns(file_path, indice):
    image_nouns = []
    for i in indice:
#         current_count = collections.defaultdict(int)
        current_nouns = collections.defaultdict(float)
        path = file_path + str(i) + ".txt"
        fo = open(path, "r")
        for line in fo.readlines():
            line = line.strip()
            ret = TB(line).lower()
            
            sentence_nouns=[]
            for word,pos in ret.tags:
                if (['NN','NNS','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP','VBZ']) and word not in stopwords:
                    sentence_nouns.append(tb.blob.Word(word).lemmatize())

            for index in range(len(sentence_nouns)):
                current_nouns[sentence_nouns[index]] = 1

        image_nouns.append(dict(current_nouns))
    return image_nouns

In [325]:
%%time
train_image_nouns = generate_nouns("data/descriptions_train/",train_indice)
test_image_nouns = generate_nouns("data/descriptions_test/",test_indice)

total_count = generate_word_space()
reduced_word_space = dict((k, v) for k, v in total_count.items() if v>=3)
feature_name_list = reduced_word_space.keys()

CPU times: user 1min 12s, sys: 1.85 s, total: 1min 14s
Wall time: 1min 14s


In [326]:
len(reduced_word_space)

3138

In [327]:
## 
def generate_input_features(indice, image_nouns):
    image_nouns_features=[]
    for i in range(len(indice)):
        label=[0 for d in range(len(feature_name_list))]
        for index,k in enumerate(feature_name_list):
            if k in image_nouns[i]:
                label[index]=image_nouns[i][k]
        image_nouns_features.append(label)
    return image_nouns_features

In [328]:
train_image_nouns_features=generate_input_features(train_indice, train_image_nouns)
test_image_nouns_features=generate_input_features(test_indice, test_image_nouns)

In [329]:
# train model

ridge = Ridge(alpha=115)
ridge.fit(train_sorted_fc1000, np.matrix(train_image_nouns_features))
train_prediction = ridge.predict(train_sorted_fc1000)
test_prediction = ridge.predict(test_sorted_fc1000)

In [330]:
def closest_node(node, nodes):
    print(int(len(node)/100))
    closest_indice = np.argsort(distance.cdist(node, nodes))[:,:int(len(node)/100)]#[:,::-1]
    return closest_indice

In [331]:
def ordering_node_Pool5(node, nodes):
    print(int(len(node)/100))
    closest_indice = np.argsort(distance.cdist(node, nodes))
    return closest_indice

In [332]:
%%time
# train_match = closest_node(np.matrix(train_image_nouns_features), train_prediction)

test_match_pool5 = closest_node(np.matrix(test_image_nouns_features), test_prediction)
train_match_rank_pool5 = ordering_node_Pool5(np.matrix(train_image_nouns_features), train_prediction)

20
100
CPU times: user 5min 36s, sys: 1.39 s, total: 5min 37s
Wall time: 5min 37s


In [None]:
result = []
for i in range(2000):
    a = [str(i) + '.txt']
    b = ''
    for j in range(20):
        b += str(test_match_pool5[i,j]) + '.jpg '
    a.append(b)
    result.append(a)

In [None]:
np.savetxt('new.csv', result, delimiter = ',', fmt='%s', comments='', header='Descritpion_ID,Top_20_Image_IDs')

In [None]:
##################  pool5 - description (end) #################

In [None]:
################## further try for combing two models (not finished) ####################

In [333]:
labels = [0 for d in range(10000)]
for i in range(10000):
    index1 = list(train_match_rank_tag[i]).index(i)
    index2 = list(train_match_rank_pool5[i]).index(i)
    if index1<index2 :
        labels[i] = 0
    else:
        labels[i] = 1

In [40]:
len([i for i in labels if i==0])

572

In [41]:
train_image_nouns_features=np.matrix(train_image_nouns_features)

In [42]:
logisticRegr = LogisticRegression()
logisticRegr.fit(train_image_nouns_features, labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [43]:
model_picking_prediction = logisticRegr.predict(test_image_nouns_features)

In [44]:
[i for i in model_picking_prediction if i==0]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
for i in range(2000):
    test_image_nouns_features