In [20]:
import numpy as np
import glob
import os
import re
import nltk
import time
import csv
from nltk.corpus import stopwords  # Stopwords: ‘the’, ‘is’, ‘are’...
from nltk.stem.porter import * # Stem: gamer, gaming, game -> game
from nltk.tokenize import RegexpTokenizer # Regexp: set rule to just tokenize word

### Load Feature

In [3]:
train_feature, test_feature = [], []

In [4]:
for line in open("./data/features_train/features_resnet1000_train.csv"):
    tokens = line.strip().split(",")
    train_feature.append(tokens)
train_feature = np.array(train_feature)

In [5]:
for line in open("./data/features_test/features_resnet1000_test.csv"):
    tokens = line.strip().split(",")
    test_feature.append(tokens)
test_feature = np.array(test_feature)

### Load Description

#### Global Dictionary

In [6]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

In [7]:
def create_dict(path):
    dictionary = {}
    
    ## glob.glob return paths matching given pattern
    for file in glob.glob(os.path.join(path, "*.txt")):
        with open(file, "r") as file_content:
            content = file_content.read()
            content = np.char.lower(content) # Lowercase
            content = re.sub('[^\w\s]', ' ', str(content)) # Define split
            
            for word in tokenizer.tokenize(content): # Remove punctuation
                try:
                    if word not in stop_words: # Remove stopwords
                        w = stemmer.stem(word) # Stem
                        if w in dictionary:
                            dictionary[w] += 1
                        else:
                            dictionary[w] = 1
                except:
                    pass 
    return dictionary

In [8]:
global_desc_dict = create_dict("./data/descriptions_train/")

#### Bag Of Word

In [11]:
def create_bow_vector(file_path, global_dict, threshold):
    i = 0
    dict_thresh = {}
    for w in global_dict:
        if global_dict[w] >= threshold:
            dict_thresh[w] = i
            i = i + 1
    
    vec = {}
    for file_name in glob.glob(os.path.join(file_path, "*.txt")):
        with open(file_name, "r") as file_content:
            freq = [0] * len(dict_thresh)
            
            content = file_content.read()
            content = np.char.lower(content) # Lowercase
            content = re.sub('[^\w\s]', ' ', str(content)) # Define split
            
            for word in tokenizer.tokenize(content): # Remove punctuation
                try:
                    if word not in stop_words: # Remove stopwords
                        w = stemmer.stem(word) # Stem
                        if w in dict_thresh:
                            freq[dict_thresh[w]] += 1
                except:
                    pass
            vec[file_name.split('/')[-1]] = freq
    return vec, dict_thresh               

In [32]:
train_desc_bow, dict_thresh = create_bow_vector("./data/descriptions_train/", global_desc_dict, 20)

### Load Tags

In [62]:
def create_dict_tags(path):
    tag_dict = {}
    for i in range(10000):
        filename = str(i) + ".txt"
        file = os.path.join(path, filename)
        with open(file, "r") as file_content:
            content = file_content.read()
            
            for line in content.splitlines():
                for idx,token in enumerate(line.split(':')):
                    if idx > 0 and token not in stop_words:
                        if token in tag_dict:
                            tag_dict[token] += 1
                        else:
                            tag_dict[token] = 1
    return tag_dict

### Mapping Description BoW to Tags

In [18]:
tag_dict = ['car', 'people', 'table', 'shit']

In [85]:
def mapping_to_vector(tag_dict, word_set):
    tags_vector = np.zeros(len(tag_dict), dtype=np.int)
    for word in word_set:
        if word in tag_dict:
            tags_vector[tag_dict.index(word)] = 1
    return tags_vector

In [61]:
def get_top20_indexs(similarity_array):
    sorted_indexs = np.argsort(similarity_array)
    return sorted_indexs[:20]

### Save Output

In [None]:
def get_timestamp():
    millis = int(round(time.time() * 1000))
    return millis

def create_submission_csv(predictions):
    filename = 'submission' + '_' + str(get_timestamp()) + '.csv'
    with open(filename, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['Descritpion_ID', 'Top_20_Image_IDs'])
        for i in range (0, len(predictions)):
            description_id = str(i) + ".txt"
            top20_image_ids = " ".join(str(index) + '.jpg' for index in predictions[i])
            writer.writerow([description_id, top20_image_ids])

---
## MAIN FUNCTION

In [53]:
def main(file_path):
    for file_name in glob.glob(os.path.join(file_path, "*.txt")):
        with open(file_name, "r") as file_content:
            desc_BoW = train_desc_bow[file_name.split('/')[-1]]
            tags_vector = mapping_to_tags(tag_dict, desc_BoW)

In [43]:
main('./data/descriptions_test/')

In [82]:
def get_candidate_tags(path):
    vectors = []
    for i in range(10000):
        vector = []
        
        filename = str(i) + ".txt"
        file = os.path.join(path, filename)
        with open(file, "r") as file_content:
            content = file_content.read()
            
            for line in content.splitlines():
                for idx,token in enumerate(line.split(':')):
                    if idx > 0:
                        vector.append(token)
        vectors.append(vector)
    return vectors

In [90]:
def build_candidate_vectors(tag_dict, path):
    candidate_tags = get_candidate_tags(path)
    candidate_vectors = []
    
    for tags in candidate_tags:
        candidate_vectors.append(mapping_to_vector(tag_dict, tags))
    return candidate_vectors

In [None]:
# def build_input_vector

In [88]:
tag_dict = create_dict_tags("./data/tags_train")

In [94]:
dic = []

type(tag_dict.keys())

dict_keys

In [91]:
vectors = build_candidate_vectors(tag_dict, "./data/descriptions_train")

In [92]:
print(vectors)

[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 