In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import pairwise_distances

import nltk
import string
import os
from collections import Counter

import tensorflow_hub as hub
import gensim

In [2]:
### Description for train data
desc_files = len(os.listdir('../descriptions_train'))
all_desc_train = []

for i in range(desc_files):
    empty_str = ''
    for line in open(f'../descriptions_train/{i}.txt'):
        empty_str += line.replace('\n',' ')
    all_desc_train.append(empty_str)

In [3]:
### Tags for train data
tag_files = len(os.listdir('../tags_train'))
all_tags_train = []

for i in range(tag_files):
    nouns = ''
    for line in open(f'../tags_train/{i}.txt'):
        nouns += line.replace(':',' ')
    all_tags_train.append(nouns.replace('\n', ' '))

In [4]:
### Description for test data
desc_files = len(os.listdir('../descriptions_test'))
all_desc_test = []

for i in range(desc_files):
    empty_str = ''
    for line in open(f'../descriptions_test/{i}.txt'):
        empty_str += line.replace('\n',' ')
    all_desc_test.append(empty_str)

In [5]:
### Tags for test data
tag_files = len(os.listdir('../tags_test'))
all_tags_test = []

for i in range(tag_files):
    nouns = ''
    for line in open(f'../tags_test/{i}.txt'):
        nouns += line.replace(':',' ')
    all_tags_test.append(nouns.replace('\n', ' '))

In [6]:
all_docs = []
all_docs.extend(all_desc_train)
all_docs.extend(all_desc_test)
all_docs.extend(all_tags_train)
all_docs.extend(all_tags_test)

In [7]:
train_1000 = pd.read_csv('../features_train/features_resnet1000_train.csv', header=None)
train_2048 = pd.read_csv('../features_train/features_resnet1000intermediate_train.csv', header=None)
test_1000 = pd.read_csv('../features_test/features_resnet1000_test.csv', header=None)
test_2048 = pd.read_csv('../features_test/features_resnet1000intermediate_test.csv', header=None)

In [8]:
def get_num(string):
    string = string.replace('.', ' ').replace('/', ' ')
    num = [int(s) for s in string.split() if s.isdigit()]
    return num[0]

def parse_to_numpy(pd):
    images_idx = []
    for string in pd[0]:
        images_idx.append(get_num(string))

    pd.insert(1, "Image_Index", images_idx, True)
    pd = pd.sort_values(by=['Image_Index'])
    pd = pd.reset_index(drop=True)
    del pd['Image_Index']
    del pd[0]
    np = pd.to_numpy()
    return np

In [9]:
train_1000 = parse_to_numpy(train_1000)
train_2048 = parse_to_numpy(train_2048)
test_1000 = parse_to_numpy(test_1000)
test_2048 = parse_to_numpy(test_2048)

In [10]:
### Google embedding
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
train_desc_tf = embed(all_desc_train).numpy()
test_desc_tf = embed(all_desc_test).numpy()
train_tags_tf = embed(all_tags_train).numpy()
test_tags_tf = embed(all_tags_test).numpy()

In [11]:
stop_words = set(nltk.corpus.stopwords.words('english'))
vectorizer = TfidfVectorizer(stop_words=stop_words, min_df=2)
vectorizer.fit(all_desc_train+all_tags_train)

train_desc_bow = np.array(vectorizer.transform(all_desc_train).todense())
test_desc_bow = np.array(vectorizer.transform(all_desc_test).todense())
train_tags_bow = np.array(vectorizer.transform(all_tags_train).todense())
test_tags_bow = np.array(vectorizer.transform(all_tags_test).todense())

In [12]:
# word2vec
model = gensim.models.KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', binary=True)

In [13]:
model.most_similar(positive=['business'])

[('businesses', 0.6623775362968445),
 ('busines', 0.6080313324928284),
 ('busi_ness', 0.5612965226173401),
 ('PETER_PASSI_covers', 0.5530025959014893),
 ('Business', 0.5466139316558838),
 ('businesss', 0.5441080331802368),
 ('Sopris_supplemental_solutions', 0.5252544283866882),
 ('company', 0.5192004442214966),
 ('entrepreneurial', 0.5077816247940063),
 ('buiness', 0.5039401650428772)]

In [14]:
model.most_similar(positive=['dogs'])

[('dog', 0.8680489659309387),
 ('canines', 0.8181710839271545),
 ('cats', 0.76517653465271),
 ('pit_bulls', 0.7548302412033081),
 ('pets', 0.7424418330192566),
 ('puppies', 0.7385991811752319),
 ('pooches', 0.7162366509437561),
 ('German_shepherds', 0.7071062922477722),
 ('animals', 0.6985694169998169),
 ('pit_bull', 0.6983613967895508)]

In [15]:
punct = set(string.punctuation)
def process_doc(doc):
    doc = doc.lower()
    doc = ''.join(c for c in doc if c not in punct)
    doc = doc.split()
    doc = [word for word in doc if word not in stop_words]
    return doc

gensim_docs = [process_doc(doc) for doc in all_docs]
w2v_vectorizer = TfidfVectorizer(stop_words=stop_words, min_df=1)
w2v_vectorizer.fit([' '.join(doc) for doc in gensim_docs]);

In [16]:
word2vec_train_desc = np.zeros((10000,300))
for i in range(10000):
    with open(f'../descriptions_train/{i}.txt') as f:
        text = f.read()
        words = process_doc(text)
        num_words = len(words)
        word_counter = Counter(words)
        total = 0
        for word in set(words):
            if word not in model:
                continue
            if word not in w2v_vectorizer.vocabulary_:
                continue
            index = w2v_vectorizer.vocabulary_[word]
            weight = word_counter[word]*w2v_vectorizer.idf_[index] ## tfidf weight
            word2vec_train_desc[i] += weight*model[word]
            total += weight
        word2vec_train_desc[i] /= total

word2vec_test_desc = np.zeros((2000,300))
for i in range(2000):
    with open(f'../descriptions_test/{i}.txt') as f:
        text = f.read()
        words = process_doc(text)
        num_words = len(words)
        word_counter = Counter(words)
        total = 0
        for word in set(words):
            if word not in model:
                continue
            if word not in w2v_vectorizer.vocabulary_:
                continue
            index = w2v_vectorizer.vocabulary_[word]
            weight = word_counter[word]*w2v_vectorizer.idf_[index] ## tfidf weight
            word2vec_test_desc[i] += weight*model[word]
            total += weight
        word2vec_test_desc[i] /= total

In [17]:
word2vec_train_tags = np.zeros((10000,300))
for i in range(10000):
    with open(f'../tags_train/{i}.txt') as f:
        text = f.read()
        words = process_doc(text)
        num_words = len(words)
        word_counter = Counter(words)
        total = 0
        for word in set(words):
            if word not in model:
                continue
            if word not in w2v_vectorizer.vocabulary_:
                continue
            index = w2v_vectorizer.vocabulary_[word]
            weight = word_counter[word]*w2v_vectorizer.idf_[index] ## tfidf weight
            word2vec_train_tags[i] += weight*model[word]
            total += weight
        if total!=0:
            word2vec_train_tags[i] /= total

word2vec_test_tags = np.zeros((2000,300))
for i in range(2000):
    with open(f'../tags_test/{i}.txt') as f:
        text = f.read()
        words = process_doc(text)
        num_words = len(words)
        word_counter = Counter(words)
        total = 0
        for word in set(words):
            if word not in model:
                continue
            if word not in w2v_vectorizer.vocabulary_:
                continue
            index = w2v_vectorizer.vocabulary_[word]
            weight = word_counter[word]*w2v_vectorizer.idf_[index] ## tfidf weight
            word2vec_test_tags[i] += weight*model[word]
            total += weight
        if total!=0:
            word2vec_test_tags[i] /= total

In [18]:
def get_prediction(vecs,pics):
    dists = pairwise_distances(vecs,pics,metric='cosine')
    return dists.T.argsort(1)
def map_20(ranks):
    return np.mean([(20-rank)/20 if rank<20 else 0 for rank in ranks])
def map_20_2(ranks):
    return np.mean([1/(1+rank) if rank<20 else 0 for rank in ranks])
def evaluate(vectors,label_vectors):
    preds = get_prediction(vectors,label_vectors)
    ranks = [np.argwhere(vec==i)[0][0] for i,vec in enumerate(preds)]
    return np.mean(ranks),map_20(ranks),map_20_2(ranks)
def get_top_20(descr_id):
    return preds[descr_id][:20]
def save_submission():
    data = []
    for i in range(2000):
        data.append(['%d.txt' % (i,),' '.join('%d.jpg' % (pic_id,) for pic_id in get_top_20(i))])
    pd.DataFrame(data,columns=['Descritpion_ID','Top_20_Image_IDs']).to_csv('submission.csv',index=False)

TF Embeddings + TFIDF

In [19]:
train_desc = np.hstack((train_desc_tf, train_desc_bow))
test_desc = np.hstack((test_desc_tf, test_desc_bow))

train_pic = np.hstack((train_1000, train_tags_tf, train_tags_bow))
test_pic = np.hstack((test_1000, test_tags_tf, test_tags_bow))

In [20]:
train_desc.shape, train_pic.shape

((10000, 5978), (10000, 6978))

In [21]:
kf = KFold(n_splits=5)
rcv = RidgeCV(alphas=np.linspace(1,40,20))
for train_index, test_index in kf.split(train_pic):
    rcv.fit(train_pic[train_index], train_desc[train_index])
    pred = rcv.predict(train_pic[test_index])
    output = evaluate(pred, train_desc[test_index])
    print(output)

(4.958, 0.8067000000000001, 0.5458061875624376)
(4.63, 0.8069, 0.5504180882348645)
(5.12, 0.799225, 0.555200701010376)
(4.3825, 0.8147249999999999, 0.551081020881853)
(4.858, 0.803575, 0.554460956346715)


TF Embeddings + Word2Vec

In [22]:
train_desc = np.hstack((train_desc_tf, word2vec_train_desc))
test_desc = np.hstack((test_desc_tf, word2vec_test_desc))

train_pic = np.hstack((train_1000, train_tags_tf, word2vec_train_tags))
test_pic = np.hstack((test_1000, test_tags_tf, word2vec_test_tags))

In [23]:
train_desc.shape, train_pic.shape

((10000, 812), (10000, 1812))

In [24]:
kf = KFold(n_splits=5)
rcv = RidgeCV(alphas=np.linspace(1,40,20))
for train_index, test_index in kf.split(train_pic):
    rcv.fit(train_pic[train_index], train_desc[train_index])
    pred = rcv.predict(train_pic[test_index])
    output = evaluate(pred, train_desc[test_index])
    print(output)

(5.0055, 0.80035, 0.5467757744942536)
(4.765, 0.8067000000000001, 0.5548446938961451)
(4.9585, 0.80355, 0.5544075930669776)
(4.4525, 0.814, 0.5525422264590415)
(4.8345, 0.806575, 0.5593533226212212)


Take TF Embeddings for Tags out

In [28]:
train_desc = np.hstack((train_desc_tf, word2vec_train_desc))
test_desc = np.hstack((test_desc_tf, word2vec_test_desc))

train_pic = np.hstack((train_1000, word2vec_train_tags))
test_pic = np.hstack((test_1000, word2vec_test_tags))

In [29]:
train_desc.shape, train_pic.shape

((10000, 812), (10000, 1300))

In [30]:
kf = KFold(n_splits=5)
rcv = RidgeCV(alphas=np.linspace(1,40,20))
for train_index, test_index in kf.split(train_pic):
    rcv.fit(train_pic[train_index], train_desc[train_index])
    pred = rcv.predict(train_pic[test_index])
    output = evaluate(pred, train_desc[test_index])
    print(output)

(11.4715, 0.6714, 0.4094239396933476)
(11.0405, 0.684125, 0.4217868812968078)
(10.7035, 0.6812999999999999, 0.42777061354967705)
(10.64, 0.68385, 0.4121403403807235)
(11.044, 0.67745, 0.41653187822454457)


Everything Bagel

In [34]:
train_desc = np.hstack((train_desc_tf, train_desc_bow, word2vec_train_desc))
test_desc = np.hstack((test_desc_tf, test_desc_bow, word2vec_test_desc))

train_pic = np.hstack((train_1000, train_tags_tf, train_tags_bow, word2vec_train_tags))
test_pic = np.hstack((test_1000, test_tags_tf, test_tags_bow, word2vec_test_tags))

In [35]:
train_desc.shape, train_pic.shape

((10000, 6278), (10000, 7278))

In [27]:
kf = KFold(n_splits=5)
rcv = RidgeCV(alphas=np.linspace(1,40,20))
for train_index, test_index in kf.split(train_pic):
    rcv.fit(train_pic[train_index], train_desc[train_index])
    pred = rcv.predict(train_pic[test_index])
    output = evaluate(pred, train_desc[test_index])
    print(output)

(4.7705, 0.808175, 0.5553373424541574)
(4.483, 0.816, 0.5670722440463732)
(4.764, 0.8096749999999999, 0.5668929112124544)
(4.34, 0.81875, 0.5615508883230632)
(4.6065, 0.8129, 0.5650779923658212)


In [36]:
train_desc.shape, test_desc.shape, train_pic.shape, test_pic.shape

((10000, 6278), (2000, 6278), (10000, 7278), (2000, 7278))

In [38]:
rcv.fit(train_desc, train_pic)
print('best reg:',rcv.alpha_)
prediction = rcv.predict(test_desc)
preds = get_prediction(prediction, test_pic)
save_submission()

best reg: 7.157894736842106
