In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import PCA

import nltk
import string
import os
from collections import Counter

import tensorflow_hub as hub
import gensim

In [2]:
### Description for train data
# Files are read as 0.txt, 1.txt, ...; count only the .txt entries so that
# unrelated directory entries do not inflate the range below.
desc_files = sum(name.endswith('.txt') for name in os.listdir('../descriptions_train'))
all_desc_train = []

for i in range(desc_files):
    # `with` closes every handle promptly instead of leaving thousands of
    # open files for the garbage collector.
    with open(f'../descriptions_train/{i}.txt') as f:
        # Flatten the multi-line description into a single line of text.
        all_desc_train.append(f.read().replace('\n', ' '))

In [3]:
### Tags for train data
# Files are read as 0.txt, 1.txt, ...; count only the .txt entries so that
# unrelated directory entries do not inflate the range below.
tag_files = sum(name.endswith('.txt') for name in os.listdir('../tags_train'))
all_tags_train = []

for i in range(tag_files):
    # `with` closes every handle promptly instead of leaking it.
    with open(f'../tags_train/{i}.txt') as f:
        # Turn the ':' separators and the line breaks into spaces so that each
        # tag file becomes one whitespace-separated string.
        all_tags_train.append(f.read().replace(':', ' ').replace('\n', ' '))

In [4]:
### Description for test data
# Files are read as 0.txt, 1.txt, ...; count only the .txt entries so that
# unrelated directory entries do not inflate the range below.
desc_files = sum(name.endswith('.txt') for name in os.listdir('../descriptions_test'))
all_desc_test = []

for i in range(desc_files):
    # `with` closes every handle promptly instead of leaving thousands of
    # open files for the garbage collector.
    with open(f'../descriptions_test/{i}.txt') as f:
        # Flatten the multi-line description into a single line of text.
        all_desc_test.append(f.read().replace('\n', ' '))

In [5]:
### Tags for test data
# Files are read as 0.txt, 1.txt, ...; count only the .txt entries so that
# unrelated directory entries do not inflate the range below.
tag_files = sum(name.endswith('.txt') for name in os.listdir('../tags_test'))
all_tags_test = []

for i in range(tag_files):
    # `with` closes every handle promptly instead of leaking it.
    with open(f'../tags_test/{i}.txt') as f:
        # Turn the ':' separators and the line breaks into spaces so that each
        # tag file becomes one whitespace-separated string.
        all_tags_test.append(f.read().replace(':', ' ').replace('\n', ' '))

In [6]:
# One flat corpus holding every text document, in this order:
# train descriptions, test descriptions, train tags, test tags.
all_docs = [*all_desc_train, *all_desc_test, *all_tags_train, *all_tags_test]

In [7]:
# Read the four header-less ResNet feature tables (final-layer and
# intermediate-layer features, for train and test) in one pass.
train_1000, train_2048, test_1000, test_2048 = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        '../features_train/features_resnet1000_train.csv',
        '../features_train/features_resnet1000intermediate_train.csv',
        '../features_test/features_resnet1000_test.csv',
        '../features_test/features_resnet1000intermediate_test.csv',
    )
)

In [8]:
def get_num(string):
    """Return the first all-digit token found in ``string`` as an int.

    ``.`` and ``/`` are treated as separators in addition to whitespace, so
    ``'images_train/123.jpg'`` yields ``123``.

    Raises:
        IndexError: if ``string`` contains no all-digit token.
    """
    spaced = string.replace('.', ' ')
    spaced = spaced.replace('/', ' ')
    numbers = []
    for token in spaced.split():
        if token.isdigit():
            numbers.append(int(token))
    return numbers[0]

def parse_to_numpy(pd):
    """Return the feature rows of a DataFrame as an array ordered by image index.

    Column 0 is expected to hold an image name/path from which ``get_num``
    extracts the numeric image index. Rows are sorted by that index, and the
    name column is dropped from the result.

    The input frame is left untouched: the previous implementation inserted a
    helper column into the caller's DataFrame, so calling it twice on the same
    frame raised because the column already existed.

    Args:
        pd: a pandas DataFrame (the parameter name shadows the ``pandas``
            alias inside this function; it is kept for interface stability).

    Returns:
        A NumPy array of the remaining columns, one row per image, sorted by
        ascending image index.
    """
    frame = pd  # local alias so the body does not read like a module call
    image_idx = [get_num(name) for name in frame[0]]
    ordered = (
        frame.assign(Image_Index=image_idx)   # returns a copy; input unchanged
        .sort_values(by=['Image_Index'])
        .drop(columns=['Image_Index', 0])
    )
    return ordered.to_numpy()

In [9]:
# Replace each raw feature DataFrame with its index-sorted NumPy array.
# NOTE: this rebinds the names, so the cell cannot be re-run without first
# re-running the CSV-loading cell.
train_1000, train_2048, test_1000, test_2048 = map(
    parse_to_numpy, (train_1000, train_2048, test_1000, test_2048)
)

In [10]:
### Google embedding
# Load the Universal Sentence Encoder from TF Hub and embed each of the four
# text collections, keeping the results as NumPy arrays.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
train_desc_tf, test_desc_tf, train_tags_tf, test_tags_tf = (
    embed(texts).numpy()
    for texts in (all_desc_train, all_desc_test, all_tags_train, all_tags_test)
)

In [13]:
# `stop_words` stays a set because process_doc uses it for membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))
# scikit-learn validates `stop_words` and accepts only 'english', a list or
# None, so pass a (sorted, hence deterministic) list rather than the set.
vectorizer = TfidfVectorizer(stop_words=sorted(stop_words), min_df=2)
# The vocabulary and IDF weights are learned from the training text only.
vectorizer.fit(all_desc_train+all_tags_train)

# Dense TF-IDF matrices; `.toarray()` yields an ndarray directly instead of
# going through the np.matrix returned by `.todense()`.
train_desc_bow = vectorizer.transform(all_desc_train).toarray()
test_desc_bow = vectorizer.transform(all_desc_test).toarray()
train_tags_bow = vectorizer.transform(all_tags_train).toarray()
test_tags_bow = vectorizer.transform(all_tags_test).toarray()

In [43]:
## pca bow
# Reduce the TF-IDF vectors to 700 components. The projection is fitted on all
# four matrices (train and test, descriptions and tags) so they share one basis.
# random_state is fixed because, for inputs of this size, scikit-learn's default
# solver selection may use a randomized SVD; without a seed the components (and
# every downstream score) can change from run to run.
pca = PCA(n_components=700, random_state=0)
pca.fit(np.vstack([train_desc_bow,test_desc_bow,train_tags_bow,test_tags_bow]))
train_desc_bow_pca = pca.transform(train_desc_bow)
test_desc_bow_pca = pca.transform(test_desc_bow)
train_tags_bow_pca = pca.transform(train_tags_bow)
test_tags_bow_pca = pca.transform(test_tags_bow)

In [16]:
# word2vec
# Load the Wiki-words-500 word-embedding module from TF Hub.
# NOTE: this rebinds `embed`, which until now held the sentence-encoder model;
# every later call to `embed` uses this word-level module instead.
embed = hub.load("https://tfhub.dev/google/Wiki-words-500/2")

In [17]:
punct = set(string.punctuation)
def process_doc(doc):
    """Tokenise a document for the word-vector pipeline.

    The text is lower-cased, every character found in ``punct`` is removed,
    the remainder is split on whitespace, and tokens present in the global
    ``stop_words`` collection are discarded.

    Returns:
        The list of remaining tokens, in their original order.
    """
    lowered = doc.lower()
    without_punct = ''.join(filter(lambda ch: ch not in punct, lowered))
    return [token for token in without_punct.split() if token not in stop_words]

# Tokenised version of every document (descriptions and tags, train and test).
gensim_docs = [process_doc(doc) for doc in all_docs]
# This vectorizer is used only for its vocabulary and IDF weights, which
# weight the word vectors further down. scikit-learn validates `stop_words`
# and accepts only 'english', a list or None, so pass a sorted list rather
# than the set.
w2v_vectorizer = TfidfVectorizer(stop_words=sorted(stop_words), min_df=1)
w2v_vectorizer.fit([' '.join(doc) for doc in gensim_docs]);

In [20]:
# TF-IDF-weighted average of word vectors for every description.
# `embed` must be the 500-d word-level module loaded above.
word_vectors = {}  # word -> vector, so each distinct word is embedded only once

word2vec_train_desc = np.zeros((10000,500))
word2vec_test_desc = np.zeros((2000,500))

# The train and test loops were identical apart from the output array and the
# source directory, so run the same body over both.
for doc_vectors, doc_dir in ((word2vec_train_desc, '../descriptions_train'),
                             (word2vec_test_desc, '../descriptions_test')):
    for i in range(len(doc_vectors)):
        with open(f'{doc_dir}/{i}.txt') as f:
            words = process_doc(f.read())
        word_counter = Counter(words)
        total = 0
        for word, count in word_counter.items():
            if word not in w2v_vectorizer.vocabulary_:
                continue
            index = w2v_vectorizer.vocabulary_[word]
            weight = count*w2v_vectorizer.idf_[index] ## tfidf weight
            if word not in word_vectors:
                word_vectors[word] = np.ravel(embed([word]))
            doc_vectors[i] += weight*word_vectors[word]
            total += weight
        # A document with no in-vocabulary word keeps its all-zero row; dividing
        # by a zero total would fill the row with NaN. The tag cells already
        # guard against this.
        if total != 0:
            doc_vectors[i] /= total

In [23]:
# TF-IDF-weighted average of word vectors for every tag file.
# `embed` must be the 500-d word-level module loaded above.
word_vectors = {}  # word -> vector, so each distinct word is embedded only once

word2vec_train_tags = np.zeros((10000,500))
word2vec_test_tags = np.zeros((2000,500))

# The train and test loops were identical apart from the output array and the
# source directory, so run the same body over both.
for doc_vectors, doc_dir in ((word2vec_train_tags, '../tags_train'),
                             (word2vec_test_tags, '../tags_test')):
    for i in range(len(doc_vectors)):
        with open(f'{doc_dir}/{i}.txt') as f:
            words = process_doc(f.read())
        word_counter = Counter(words)
        total = 0
        for word, count in word_counter.items():
            if word not in w2v_vectorizer.vocabulary_:
                continue
            index = w2v_vectorizer.vocabulary_[word]
            weight = count*w2v_vectorizer.idf_[index] ## tfidf weight
            if word not in word_vectors:
                word_vectors[word] = np.ravel(embed([word]))
            doc_vectors[i] += weight*word_vectors[word]
            total += weight
        # A tag file with no in-vocabulary word keeps its all-zero row.
        if total != 0:
            doc_vectors[i] /= total

In [24]:
def get_prediction_cos(vecs,pics):
    """Rank the rows of ``vecs`` by cosine distance to each row of ``pics``.

    Returns an integer array with one row per row of ``pics``; row ``j`` lists
    the row indices of ``vecs`` from the smallest to the largest cosine
    distance to ``pics[j]``.
    """
    cosine_dists = pairwise_distances(vecs, pics, metric='cosine')
    return np.argsort(cosine_dists.T, axis=1)
def get_prediction_euc(vecs,pics):
    """Rank the rows of ``vecs`` by distance to each row of ``pics``.

    Uses the default metric of ``pairwise_distances``. Returns an integer
    array with one row per row of ``pics``; row ``j`` lists the row indices of
    ``vecs`` from the smallest to the largest distance to ``pics[j]``.
    """
    default_dists = pairwise_distances(vecs, pics)
    return np.argsort(default_dists.T, axis=1)
def map_20(ranks):
    """Mean linear rank score over ``ranks``.

    A rank below 20 scores ``(20 - rank) / 20`` (so rank 0 scores 1.0); any
    other rank scores 0.
    """
    def _linear_score(rank):
        if rank < 20:
            return (20-rank)/20
        return 0

    return np.mean([_linear_score(rank) for rank in ranks])
def map_20_2(ranks):
    """Mean reciprocal rank score over ``ranks``.

    A rank below 20 scores ``1 / (1 + rank)`` (so rank 0 scores 1.0); any
    other rank scores 0.
    """
    def _reciprocal_score(rank):
        if rank < 20:
            return 1/(1+rank)
        return 0

    return np.mean([_reciprocal_score(rank) for rank in ranks])
def evaluate(vectors,label_vectors):
    """Score how well ``vectors`` retrieve their matching ``label_vectors``.

    Row ``i`` of ``vectors`` is treated as the correct match for row ``i`` of
    ``label_vectors``. For both the cosine and the default-metric ranking, the
    position of the correct match in each ranking is collected.

    Returns:
        A 6-tuple: (mean rank cosine, mean rank default metric,
        map_20 cosine, map_20 default metric,
        map_20_2 cosine, map_20_2 default metric).
    """
    def _match_positions(order):
        # Position at which index i appears within row i of the ranking.
        return [np.flatnonzero(row == i)[0] for i, row in enumerate(order)]

    cos_ranks = _match_positions(get_prediction_cos(vectors, label_vectors))
    euc_ranks = _match_positions(get_prediction_euc(vectors, label_vectors))
    return (
        np.mean(cos_ranks), np.mean(euc_ranks),
        map_20(cos_ranks), map_20(euc_ranks),
        map_20_2(cos_ranks), map_20_2(euc_ranks),
    )
def get_top_20(descr_id, predictions=None):
    """Return the first 20 ranked entries for one description.

    Args:
        descr_id: row index into the ranking table.
        predictions: ranking table indexed as ``predictions[descr_id]``.
            When omitted, the notebook-level ``preds`` variable is used, which
            keeps existing ``get_top_20(i)`` calls working; passing it
            explicitly removes the dependency on a global that is only
            defined in a later cell.

    Returns:
        ``predictions[descr_id][:20]``.
    """
    if predictions is None:
        predictions = preds
    return predictions[descr_id][:20]
def save_submission():
    """Write the top-20 image ranking of descriptions 0..1999 to submission.csv.

    Each row pairs ``<i>.txt`` with a space-separated list of ``<id>.jpg``
    names taken from ``get_top_20(i)``.
    """
    # NOTE(review): the 'Descritpion_ID' spelling is reproduced as-is; it
    # presumably matches the required submission header — confirm before
    # changing it.
    rows = [
        ['%d.txt' % (i,), ' '.join('%d.jpg' % (pic_id,) for pic_id in get_top_20(i))]
        for i in range(2000)
    ]
    submission = pd.DataFrame(rows, columns=['Descritpion_ID','Top_20_Image_IDs'])
    submission.to_csv('submission.csv', index=False)

TF Embeddings + TFIDF

In [25]:
# Description side: sentence embedding + raw TF-IDF, joined column-wise.
train_desc = np.concatenate([train_desc_tf, train_desc_bow], axis=1)
test_desc = np.concatenate([test_desc_tf, test_desc_bow], axis=1)

# Image side: ResNet features + tag sentence embedding + tag TF-IDF.
train_pic = np.concatenate([train_1000, train_tags_tf, train_tags_bow], axis=1)
test_pic = np.concatenate([test_1000, test_tags_tf, test_tags_bow], axis=1)

In [26]:
# Display the shapes of the description-side and image-side training matrices.
train_desc.shape, train_pic.shape

((10000, 5978), (10000, 6978))

In [27]:
# 5-fold cross-validation: fit a ridge regression from image features to
# description features on each training split and print the retrieval scores
# on the held-out split. RidgeCV chooses alpha from 20 evenly spaced values
# between 1 and 40.
kf = KFold(n_splits=5)
rcv = RidgeCV(alphas=np.linspace(1,40,20))
for fit_rows, held_out_rows in kf.split(train_pic):
    rcv.fit(train_pic[fit_rows], train_desc[fit_rows])
    held_out_pred = rcv.predict(train_pic[held_out_rows])
    print(evaluate(held_out_pred, train_desc[held_out_rows]))

(4.958, 5.003, 0.8067000000000001, 0.81235, 0.5458061875624376, 0.5554347617080202)
(4.63, 4.3325, 0.8069, 0.81825, 0.5504180882348645, 0.556622007876884)
(5.12, 5.0345, 0.799225, 0.8049, 0.555200701010376, 0.5608859265476526)
(4.3825, 4.229, 0.8147249999999999, 0.8221, 0.551081020881853, 0.5552122167113073)
(4.858, 4.764, 0.803575, 0.80675, 0.554460956346715, 0.5553660265624468)


TF Embeddings + TF-IDF reduced with PCA

In [28]:
# Description side: sentence embedding + PCA-reduced TF-IDF, joined column-wise.
train_desc = np.concatenate([train_desc_tf, train_desc_bow_pca], axis=1)
test_desc = np.concatenate([test_desc_tf, test_desc_bow_pca], axis=1)

# Image side: ResNet features + tag sentence embedding + PCA-reduced tag TF-IDF.
train_pic = np.concatenate([train_1000, train_tags_tf, train_tags_bow_pca], axis=1)
test_pic = np.concatenate([test_1000, test_tags_tf, test_tags_bow_pca], axis=1)

In [30]:
# Display the shapes of the description-side and image-side training matrices.
train_desc.shape, train_pic.shape

((10000, 1212), (10000, 2212))

In [29]:
# 5-fold cross-validation: fit a ridge regression from image features to
# description features on each training split and print the retrieval scores
# on the held-out split. RidgeCV chooses alpha from 20 evenly spaced values
# between 1 and 40.
kf = KFold(n_splits=5)
rcv = RidgeCV(alphas=np.linspace(1,40,20))
for fit_rows, held_out_rows in kf.split(train_pic):
    rcv.fit(train_pic[fit_rows], train_desc[fit_rows])
    held_out_pred = rcv.predict(train_pic[held_out_rows])
    print(evaluate(held_out_pred, train_desc[held_out_rows]))

(4.794, 4.85, 0.8108249999999999, 0.81535, 0.5513848795640204, 0.5568980977871456)
(4.482, 4.2205, 0.8118249999999999, 0.821175, 0.5575023814206089, 0.5641014727038528)
(5.005, 4.939, 0.8029749999999999, 0.807825, 0.5587217228999931, 0.5648855124579583)
(4.285, 4.171, 0.8172999999999999, 0.8240250000000001, 0.5522328608912587, 0.5577769350102941)
(4.7755, 4.7175, 0.8072, 0.809, 0.558855467859454, 0.5576404582990109)


TF Embeddings + Word2Vec

In [31]:
# Description side: sentence embedding + TF-IDF-weighted word vectors.
train_desc = np.concatenate([train_desc_tf, word2vec_train_desc], axis=1)
test_desc = np.concatenate([test_desc_tf, word2vec_test_desc], axis=1)

# Image side: ResNet features + tag sentence embedding + tag word vectors.
train_pic = np.concatenate([train_1000, train_tags_tf, word2vec_train_tags], axis=1)
test_pic = np.concatenate([test_1000, test_tags_tf, word2vec_test_tags], axis=1)

In [32]:
# Display the shapes of the description-side and image-side training matrices.
train_desc.shape, train_pic.shape

((10000, 1012), (10000, 2012))

In [33]:
# 5-fold cross-validation: fit a ridge regression from image features to
# description features on each training split and print the retrieval scores
# on the held-out split. RidgeCV chooses alpha from 20 evenly spaced values
# between 1 and 40.
kf = KFold(n_splits=5)
rcv = RidgeCV(alphas=np.linspace(1,40,20))
for fit_rows, held_out_rows in kf.split(train_pic):
    rcv.fit(train_pic[fit_rows], train_desc[fit_rows])
    held_out_pred = rcv.predict(train_pic[held_out_rows])
    print(evaluate(held_out_pred, train_desc[held_out_rows]))

(5.1355, 5.0555, 0.79675, 0.7998999999999998, 0.5430441106859257, 0.5435238922197514)
(4.7865, 4.5125, 0.8018, 0.8072, 0.5528397467320262, 0.5594502346037175)
(5.1275, 4.9315, 0.8004, 0.803825, 0.5506896367972413, 0.5569001756542391)
(4.464, 4.235, 0.8119, 0.8175249999999998, 0.5522892924842615, 0.5581437879887571)
(4.948, 4.8145, 0.80415, 0.80745, 0.5489755291427698, 0.5480194601408224)


Everything Bagel with 700 PCA

In [34]:
# Description side: sentence embedding + PCA-reduced TF-IDF + word vectors.
train_desc = np.concatenate([train_desc_tf, train_desc_bow_pca, word2vec_train_desc], axis=1)
test_desc = np.concatenate([test_desc_tf, test_desc_bow_pca, word2vec_test_desc], axis=1)

# Image side: ResNet features + the same three text representations of the tags.
train_pic = np.concatenate([train_1000, train_tags_tf, train_tags_bow_pca, word2vec_train_tags], axis=1)
test_pic = np.concatenate([test_1000, test_tags_tf, test_tags_bow_pca, word2vec_test_tags], axis=1)

In [35]:
# Display the shapes of the description-side and image-side training matrices.
train_desc.shape, train_pic.shape

((10000, 1712), (10000, 2712))

In [36]:
# 5-fold cross-validation: fit a ridge regression from image features to
# description features on each training split and print the retrieval scores
# on the held-out split. RidgeCV chooses alpha from 20 evenly spaced values
# between 1 and 40.
kf = KFold(n_splits=5)
rcv = RidgeCV(alphas=np.linspace(1,40,20))
for fit_rows, held_out_rows in kf.split(train_pic):
    rcv.fit(train_pic[fit_rows], train_desc[fit_rows])
    held_out_pred = rcv.predict(train_pic[held_out_rows])
    print(evaluate(held_out_pred, train_desc[held_out_rows]))

(4.671, 4.6505, 0.8122750000000001, 0.81655, 0.5555129951189163, 0.5615943896810963)
(4.3545, 4.1025, 0.8169500000000001, 0.8251499999999999, 0.5648822333024732, 0.5688281261072089)
(4.784, 4.7145, 0.8095249999999998, 0.814175, 0.5675986218159206, 0.5728940486435649)
(4.1455, 4.0895, 0.822075, 0.827275, 0.5590299921226004, 0.5624668474155703)
(4.598, 4.551, 0.8126499999999999, 0.814575, 0.5640659474877547, 0.564204467582641)


In [37]:
# Regularisation strength RidgeCV selected on its most recent fit (the last fold).
rcv.alpha_

15.36842105263158

Everything Bagel with 500 PCA

In [40]:
# Fit a dedicated 500-component PCA for this experiment. The shared
# `*_bow_pca` arrays are produced by the PCA cell with a different
# n_components, so reusing them here would only give 500 components if that
# cell had been edited and re-run out of order.
pca_500 = PCA(n_components=500, random_state=0)
pca_500.fit(np.vstack([train_desc_bow,test_desc_bow,train_tags_bow,test_tags_bow]))

# Description side: sentence embedding + 500-d TF-IDF projection + word vectors.
train_desc = np.hstack((train_desc_tf, pca_500.transform(train_desc_bow), word2vec_train_desc))
test_desc = np.hstack((test_desc_tf, pca_500.transform(test_desc_bow), word2vec_test_desc))

# Image side: ResNet features + the same three text representations of the tags.
train_pic = np.hstack((train_1000, train_tags_tf, pca_500.transform(train_tags_bow), word2vec_train_tags))
test_pic = np.hstack((test_1000, test_tags_tf, pca_500.transform(test_tags_bow), word2vec_test_tags))

In [41]:
# Display the shapes of the description-side and image-side training matrices.
train_desc.shape, train_pic.shape

((10000, 1512), (10000, 2512))

In [42]:
# 5-fold cross-validation: fit a ridge regression from image features to
# description features on each training split and print the retrieval scores
# on the held-out split. RidgeCV chooses alpha from 20 evenly spaced values
# between 1 and 40.
kf = KFold(n_splits=5)
rcv = RidgeCV(alphas=np.linspace(1,40,20))
for fit_rows, held_out_rows in kf.split(train_pic):
    rcv.fit(train_pic[fit_rows], train_desc[fit_rows])
    held_out_pred = rcv.predict(train_pic[held_out_rows])
    print(evaluate(held_out_pred, train_desc[held_out_rows]))

(4.6755, 4.654, 0.81235, 0.81615, 0.5548479792202122, 0.5618349306674577)
(4.381, 4.157, 0.8161499999999999, 0.824275, 0.5614261666180396, 0.5675478446239863)
(4.8125, 4.7415, 0.8095, 0.8138, 0.5660363064137445, 0.5724699297950071)
(4.152, 4.1055, 0.82155, 0.826825, 0.5588742560372204, 0.562250288361879)
(4.616, 4.5915, 0.8121499999999999, 0.813025, 0.5618326458113609, 0.5600093713905633)


Back to 700 PCA components for the submission

In [44]:
# Description side: sentence embedding + PCA-reduced TF-IDF + word vectors.
train_desc = np.concatenate([train_desc_tf, train_desc_bow_pca, word2vec_train_desc], axis=1)
test_desc = np.concatenate([test_desc_tf, test_desc_bow_pca, word2vec_test_desc], axis=1)

# Image side: ResNet features + the same three text representations of the tags.
train_pic = np.concatenate([train_1000, train_tags_tf, train_tags_bow_pca, word2vec_train_tags], axis=1)
test_pic = np.concatenate([test_1000, test_tags_tf, test_tags_bow_pca, word2vec_test_tags], axis=1)

In [45]:
# Display the shapes of all four feature matrices used for the submission.
train_desc.shape, test_desc.shape, train_pic.shape, test_pic.shape

((10000, 1712), (2000, 1712), (10000, 2712), (2000, 2712))

In [47]:
# Final model: fit on the full training set, map the test images into the
# description feature space, rank them against the test descriptions and write
# the submission file.
alpha_grid = np.linspace(1,40,20)
reg = RidgeCV(alphas=alpha_grid).fit(train_pic, train_desc)
print("best reg:",reg.alpha_)
prediction = reg.predict(test_pic)
# save_submission() reads the notebook-level `preds` through get_top_20, so
# this assignment has to happen before the call.
preds = get_prediction_euc(prediction, test_desc)
save_submission()

best reg: 15.36842105263158
