In [5]:
import numpy as np
from numpy import *
import time
import csv
import glob
import os
import re
# Run nltk.download('stopwords') once if the NLTK stopwords corpus is not installed
from nltk.corpus import stopwords  # Stopwords: ‘the’, ‘is’, ‘are’...
from nltk.stem.porter import * # Stem: gamer, gaming, game -> game
from nltk.tokenize import RegexpTokenizer # Regexp: set rule to just tokenize word

### Load Features

In [6]:
# Start with empty containers for the train and test image features.
train_feat = []
test_feat = []

In [7]:
# Each CSV line is split on commas into one row of string fields; later cells
# read field 0 as the image path and the remaining fields as numeric features.
# Rows are collected into fresh lists so that re-running this cell does not
# depend on `train_feat` / `test_feat` still being empty lists, and the `with`
# blocks close both files deterministically.
with open("./data/features_train/features_resnet1000_train.csv") as train_file:
    train_feat = np.array([line.strip().split(",") for line in train_file])
with open("./data/features_test/features_resnet1000_test.csv") as test_file:
    test_feat = np.array([line.strip().split(",") for line in test_file])

### Load Description

#### Global Dictionary

In [8]:
# Shared text-normalisation helpers for building the vocabulary.
tokenizer = RegexpTokenizer(r'\w+')  # a token is a run of word characters
stop_words = set(stopwords.words("english"))  # set gives fast membership tests
stemmer = PorterStemmer()

In [9]:
def create_dict(path, file_count=10000):
    """Count stemmed, non-stopword tokens across numbered description files.

    Reads ``0.txt`` .. ``<file_count - 1>.txt`` from ``path``. Each file is
    lowercased, every character that is neither a word character nor
    whitespace is replaced by a space, and the text is tokenized with the
    module-level ``tokenizer``. Tokens found in the module-level
    ``stop_words`` are dropped; the rest are stemmed with the module-level
    ``stemmer`` and counted.

    Args:
        path: Directory containing the numbered ``.txt`` files.
        file_count: Number of files to read, starting at ``0.txt``. Defaults
            to 10000, the value that used to be hard-coded.

    Returns:
        dict mapping each stem to its total number of occurrences.

    Raises:
        OSError: If one of the expected files cannot be opened.
    """
    dictionary = {}

    for i in range(file_count):
        file = os.path.join(path, str(i) + ".txt")
        with open(file, "r") as file_content:
            content = file_content.read().lower()
        # Raw string: '\w' / '\s' are regex classes, not Python string escapes.
        content = re.sub(r'[^\w\s]', ' ', content)

        for word in tokenizer.tokenize(content):
            if word in stop_words:
                continue
            try:
                w = stemmer.stem(word)
            except Exception:
                # Best effort: a token the stemmer cannot handle is skipped.
                # Unlike a bare ``except``, this lets KeyboardInterrupt stop
                # the loop.
                continue
            dictionary[w] = dictionary.get(w, 0) + 1
    return dictionary

In [10]:
# The vocabulary counts come from the training descriptions.
global_desc_dict = create_dict(path="./data/descriptions_train/")

#### Bag of Words

In [15]:
def create_desc_bow(file_path, global_dict, threshold, file_count):
    """Build a bag-of-words count vector for each numbered description file.

    The vocabulary is every key of ``global_dict`` whose count is at least
    ``threshold``; each kept stem gets a column index in the iteration order
    of ``global_dict``. Files ``0.txt`` .. ``<file_count - 1>.txt`` in
    ``file_path`` are lowercased, stripped of characters that are neither
    word characters nor whitespace, split on whitespace, filtered against the
    NLTK English stopword list and Porter-stemmed.

    Args:
        file_path: Directory containing the numbered ``.txt`` files.
        global_dict: Mapping from stem to corpus-wide count.
        threshold: Minimum count for a stem to enter the vocabulary.
        file_count: Number of files to read, starting at ``0.txt``.

    Returns:
        dict mapping each file name (e.g. ``"0.txt"``) to a list of ints with
        one entry per vocabulary stem, giving that stem's count in the file.

    Raises:
        OSError: If one of the expected files cannot be opened.
    """
    # Column index for every stem that is frequent enough.
    dict_thresh = {}
    for w in global_dict:
        if global_dict[w] >= threshold:
            dict_thresh[w] = len(dict_thresh)

    # Loop-invariant helpers. The stopword list used to be rebuilt and scanned
    # linearly for every single word, and the stemmer rebuilt for every file.
    english_stopwords = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    dict_freq = {}
    for i in range(file_count):
        file_name = str(i) + ".txt"
        with open(os.path.join(file_path, file_name), "r") as content_file:
            content = content_file.read().lower()
        # Raw string: '\w' / '\s' are regex classes, not Python string escapes.
        content = re.sub(r'[^\w\s]', ' ', content)

        cur = [0] * len(dict_thresh)
        for word in content.split():
            if word in english_stopwords:
                continue
            try:
                w = stemmer.stem(word)
            except Exception:
                # Best effort: skip a token the stemmer cannot handle, while
                # still letting KeyboardInterrupt through.
                continue
            if w in dict_thresh:
                cur[dict_thresh[w]] += 1
        dict_freq[file_name] = cur
    return dict_freq

In [16]:
# Bag-of-words vectors for the training descriptions; stems seen fewer than
# 20 times in the global dictionary are left out of the vocabulary.
train_desc_dict_freq = create_desc_bow(
    file_path="./data/descriptions_train/",
    global_dict=global_desc_dict,
    threshold=20,
    file_count=10000,
)

In [None]:
# Bag-of-words vectors for the test descriptions, using the same vocabulary
# and threshold as the training vectors so the columns line up.
# NOTE(review): file_count=10000 mirrors the training call. Confirm the test
# directory really holds 0.txt .. 9999.txt; a missing file makes open() raise.
test_desc_dict_freq = create_desc_bow(
    file_path="./data/descriptions_test/",
    global_dict=global_desc_dict,
    threshold=20,
    file_count=10000,
)

---
## KNN with description to feature

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import csv
import time

In [None]:
# Classifier configured with a single neighbour (n_neighbors=1).
knn = KNeighborsClassifier(n_neighbors=1)

#### Fit KNN on the description vectors

In [None]:
# One sample per training description: its bag-of-words vector is the feature
# row and its file name is the label. Keys and values of the same dict iterate
# in the same order, so rows and labels stay aligned.
X = np.array(list(train_desc_dict_freq.values()))
y = np.array(list(train_desc_dict_freq.keys()))
knn.fit(X, y)

In [None]:
# Index the training features by integer image id. Field 0 of each row is a
# path whose second "/"-separated component is "<id>.<ext>"; the remaining
# fields are parsed as floats.
train_feat_dict = {
    int(row[0].split("/")[1].split(".")[0]): np.array(row[1:], dtype=float)
    for row in train_feat
}

def norm_1(feat1, feat2):
    """Return the distance between two feature vectors.

    Both arguments are converted to float arrays first, so sequences of
    numeric strings are accepted. Despite the name, the result is
    ``np.linalg.norm`` of the difference with its default order, i.e. the
    Euclidean (2-)norm for 1-D input, not the L1 norm.
    """
    difference = np.array(feat1, dtype=float) - np.array(feat2, dtype=float)
    return np.linalg.norm(difference)

def candidate_images(target_feat, feats, top_k=20):
    """Return the names of the images whose features are closest to a target.

    Args:
        target_feat: Sequence of numbers (or numeric strings) to compare to.
        feats: Iterable of rows. Field 0 of a row is a path of the form
            ``"<dir>/<name>"``; the remaining fields are that image's
            features as numbers or numeric strings.
        top_k: Maximum number of names to return. Defaults to 20, the value
            that used to be hard-coded.

    Returns:
        numpy array of image names (the part of the path after the first
        ``/``), ordered from smallest to largest Euclidean distance to
        ``target_feat``. It holds fewer than ``top_k`` names when fewer
        distinct names are available.
    """
    # Convert the target once instead of once per candidate row.
    target = np.asarray(target_feat, dtype=float)

    feat_score = {}
    for f in feats:
        name = f[0].split("/")[1]
        feat_score[name] = np.linalg.norm(target - np.asarray(f[1:], dtype=float))

    # Sort names by their score. Sorting on the dict itself avoids the
    # `operator` module, which this notebook never imports. Slicing instead of
    # indexing 0..19 keeps short candidate lists from raising IndexError.
    ranked_names = sorted(feat_score, key=feat_score.get)
    return np.array(ranked_names[:top_k])

In [None]:
def predict(filename):
    """Return candidate test images for one test description file.

    The description's bag-of-words vector is looked up in
    ``test_desc_dict_freq`` and passed to the fitted ``knn``. The predicted
    label (a training file name such as ``"12.txt"``) is reduced to its
    integer id, that id's features are read from ``train_feat_dict``, and the
    test images are ranked against them with ``candidate_images``.
    """
    bow_vector = np.array(test_desc_dict_freq[filename]).reshape(1, -1)
    nearest_label = knn.predict(bow_vector)[0]
    train_image_id = int(nearest_label.split(".")[0])
    return candidate_images(train_feat_dict[train_image_id], test_feat)

#### Create Submission

In [None]:
def get_timestamp():
    """Return the current Unix time in milliseconds, rounded to an int."""
    now_ms = time.time() * 1000
    return int(round(now_ms))

def create_submission_csv():
    """Write the predictions for every test description to a timestamped CSV.

    For each key of ``test_desc_dict_freq`` one row is written holding the
    key and the space-joined result of ``predict`` for it. The file is named
    ``submission_<millisecond timestamp>.csv`` and is created in the current
    working directory.

    Returns:
        The name of the file that was written.
    """
    # NOTE(review): the header spelling "Descritpion_ID" is kept verbatim;
    # confirm it against the expected submission format before changing it.
    submission = [["Descritpion_ID", "Top_20_Image_IDs"]]
    for d in test_desc_dict_freq:
        submission.append([d, " ".join(predict(d))])

    filename = 'submission' + '_' + str(get_timestamp()) + '.csv'
    # newline='' lets the csv writer control line endings itself; without it
    # an extra blank line follows every row on platforms that translate "\n".
    with open(filename, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(submission)
    return filename

In [None]:
# Run the prediction for every test description and write the submission CSV.
create_submission_csv()