In [9]:
import numpy as np
import glob
import os
import re
# nltk.download() if not installed nltk
from nltk.corpus import stopwords  # Stopwords: ‘the’, ‘is’, ‘are’...
from nltk.stem.porter import * # Stem: gamer, gaming, game -> game
from nltk.tokenize import RegexpTokenizer # Regexp: set rule to just tokenize word

### Load Feature

In [3]:
train_feature, test_feature = [], []

In [4]:
for line in open("./data/features_train/features_resnet1000_train.csv"):
    tokens = line.strip().split(",")
    train_feature.append(tokens)
train_feature = np.array(train_feature)

In [5]:
for line in open("./data/features_test/features_resnet1000_test.csv"):
    tokens = line.strip().split(",")
    test_feature.append(tokens)
test_feature = np.array(test_feature)

### Load Description

#### Global Dictionary

In [10]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

In [23]:
def create_dict(path):
    dictionary = {}
    
    ## glob.glob return paths matching given pattern
    for file in glob.glob(os.path.join(path, "*.txt")):
        with open(file, "r") as file_content:
            content = file_content.read()
            content = np.char.lower(content) # Lowercase
            content = re.sub('[^\w\s]', ' ', str(content)) # Define split
            
            for word in tokenizer.tokenize(content): # Remove punctuation
                try:
                    if word not in stop_words: # Remove stopwords
                        w = stemmer.stem(word) # Stem
                        if w in dictionary:
                            dictionary[w] += 1
                        else:
                            dictionary[w] = 1
                except:
                    pass 
    return dictionary

In [24]:
global_desc_dict = create_dict("./data/descriptions_train/")

In [25]:
global_desc_dict

{'teddi': 402,
 'bear': 960,
 'cloth': 141,
 'hang': 367,
 'line': 381,
 'outsid': 708,
 'window': 687,
 'stuf': 341,
 'toy': 204,
 'laundri': 5,
 'item': 245,
 'left': 75,
 'air': 615,
 'dri': 116,
 'pin': 21,
 'outdoor': 216,
 'cat': 1591,
 'ground': 388,
 'shoe': 128,
 'kitten': 77,
 'play': 1437,
 'lace': 3,
 'pair': 266,
 'blue': 1174,
 'shoelac': 2,
 'floor': 492,
 'grey': 143,
 'tabbi': 25,
 'navi': 12,
 'string': 36,
 'gray': 168,
 'tiger': 17,
 'walk': 1547,
 'across': 300,
 'brick': 299,
 'two': 4028,
 'women': 380,
 'togeth': 462,
 'near': 1684,
 'tree': 1190,
 'pretti': 76,
 'exot': 7,
 'dress': 327,
 'robe': 19,
 'orient': 17,
 'costum': 46,
 'onlook': 25,
 'nearbi': 80,
 'walkway': 38,
 'geisha': 1,
 'tourist': 23,
 'area': 810,
 'coupl': 837,
 'rocki': 50,
 'surfac': 104,
 'wooden': 697,
 'cut': 396,
 'board': 755,
 'knife': 169,
 'carrot': 215,
 'onion': 105,
 'assort': 108,
 'veget': 434,
 'made': 152,
 'interest': 13,
 'shape': 126,
 'brown': 727,
 'tabl': 2606,
 'blo

#### Bag Of Word

In [27]:
len(global_desc_dict)

6456

In [43]:
def create_bow_vector(file_path, global_dict, threshold):
    i = 0
    dict_thresh = {}
    for w in global_dict:
        if global_dict[w] >= threshold:
            dict_thresh[w] = i
            i = i + 1
    
    vec = {}
    for file_name in glob.glob(os.path.join(file_path, "*.txt")):
        with open(file_name, "r") as file_content:
            freq = [0] * len(dict_thresh)
            
            content = file_content.read()
            content = np.char.lower(content) # Lowercase
            content = re.sub('[^\w\s]', ' ', str(content)) # Define split
            
            for word in tokenizer.tokenize(content): # Remove punctuation
                try:
                    if word not in stop_words: # Remove stopwords
                        w = stemmer.stem(word) # Stem
                        if w in dict_thresh:
                            freq[dict_thresh[w]] += 1
                except:
                    pass
            vec[file_name.split('/')[-1]] = freq
    return vec, dict_thresh               

In [44]:
train_desc_bow, dict_thresh = create_bow_vector("./data/descriptions_train/", global_desc_dict, 20)

In [45]:
type(train_desc_bow)

dict

In [47]:
len(dict_thresh)

1163

In [46]:
train_desc_bow["1009.txt"]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [42]:
len(train_desc_bow)

10000

### Load Tag

In [176]:
def create_dict_tags(path):
    tag_dict = {}
    for i in range(10000):
        filename = str(i) + ".txt"
        file = os.path.join(path, filename)
        with open(file, "r") as file_content:
            content = file_content.read()
            
            for line in content.splitlines():
                for idx,token in enumerate(line.split(':')):
                    if idx > 0 and token not in stop_words:
                        if token in tag_dict:
                            tag_dict[token] += 1
                        else:
                            tag_dict[token] = 1
    return tag_dict

In [177]:
tag_dict = create_dict_tags("./data/tags_train")

In [178]:
len(list_tags)

10000

In [188]:
s

[('hair drier', 12),
 ('toaster', 24),
 ('parking meter', 58),
 ('scissors', 73),
 ('toothbrush', 80),
 ('bear', 87),
 ('apple', 107),
 ('microwave', 113),
 ('donut', 118),
 ('hot dog', 122),
 ('sheep', 123),
 ('snowboard', 129),
 ('orange', 131),
 ('carrot', 139),
 ('stop sign', 142),
 ('fire hydrant', 152),
 ('mouse', 159),
 ('broccoli', 163),
 ('zebra', 169),
 ('keyboard', 170),
 ('banana', 170),
 ('cow', 178),
 ('elephant', 179),
 ('teddy bear', 184),
 ('suitcase', 189),
 ('baseball bat', 194),
 ('frisbee', 195),
 ('baseball glove', 199),
 ('kite', 201),
 ('refrigerator', 205),
 ('sandwich', 208),
 ('wine glass', 215),
 ('cake', 217),
 ('giraffe', 218),
 ('horse', 219),
 ('airplane', 222),
 ('skis', 238),
 ('remote', 240),
 ('oven', 247),
 ('skateboard', 257),
 ('pizza', 266),
 ('fork', 268),
 ('boat', 277),
 ('bicycle', 285),
 ('toilet', 288),
 ('spoon', 293),
 ('tennis racket', 294),
 ('laptop', 304),
 ('tie', 306),
 ('bird', 306),
 ('vase', 311),
 ('bed', 315),
 ('surfboard', 32

In [187]:
s[-10:]

[('bench', 476),
 ('truck', 499),
 ('bowl', 593),
 ('handbag', 598),
 ('bottle', 708),
 ('cup', 747),
 ('dining table', 960),
 ('car', 1022),
 ('chair', 1030),
 ('person', 5323)]

In [189]:
def getTop(dict,n):
    s = sorted(tag_dict.items(), key=lambda x:x[1])
    return s[-n:]

In [None]:
test = getTop()