In [1]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import sklearn
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
import nltk
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.metrics import pairwise_distances
import tensorflow_hub as hub
import tensorflow as tf
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

In [None]:
### Descriptions for the training data
# Count only the ``.txt`` entries so stray directory content (e.g. notebook
# checkpoint folders) does not inflate the loop bound below.
desc_files = sum(name.endswith('.txt') for name in os.listdir('../descriptions_train'))
all_desc_train = []

for i in range(desc_files):
    # ``with`` closes every handle right away instead of leaking one per file.
    with open(f'../descriptions_train/{i}.txt') as desc_file:
        # Flatten the file into one string: every newline becomes a space.
        all_desc_train.append(desc_file.read().replace('\n', ' '))

In [None]:
### Tags for the training data
# Count only the ``.txt`` entries so stray directory content (e.g. notebook
# checkpoint folders) does not inflate the loop bound below.
tag_files = sum(name.endswith('.txt') for name in os.listdir('../tags_train'))
all_tags_train = []

for i in range(tag_files):
    # ``with`` closes every handle right away instead of leaking one per file.
    with open(f'../tags_train/{i}.txt') as tags_file:
        # Turn both ':' separators and newlines into spaces so the whole file
        # becomes one whitespace-delimited string.
        all_tags_train.append(tags_file.read().replace(':', ' ').replace('\n', ' '))

In [None]:
### Descriptions for the test data
# Count only the ``.txt`` entries so stray directory content (e.g. notebook
# checkpoint folders) does not inflate the loop bound below.
desc_files = sum(name.endswith('.txt') for name in os.listdir('../descriptions_test'))
all_desc_test = []

for i in range(desc_files):
    # ``with`` closes every handle right away instead of leaking one per file.
    with open(f'../descriptions_test/{i}.txt') as desc_file:
        # Flatten the file into one string: every newline becomes a space.
        all_desc_test.append(desc_file.read().replace('\n', ' '))

In [None]:
### Tags for the test data
# Count only the ``.txt`` entries so stray directory content (e.g. notebook
# checkpoint folders) does not inflate the loop bound below.
tag_files = sum(name.endswith('.txt') for name in os.listdir('../tags_test'))
all_tags_test = []

for i in range(tag_files):
    # ``with`` closes every handle right away instead of leaking one per file.
    with open(f'../tags_test/{i}.txt') as tags_file:
        # Turn both ':' separators and newlines into spaces so the whole file
        # becomes one whitespace-delimited string.
        all_tags_test.append(tags_file.read().replace(':', ' ').replace('\n', ' '))

In [None]:
# One flat corpus: train/test descriptions followed by train/test tags.
all_docs = [
    *all_desc_train,
    *all_desc_test,
    *all_tags_train,
    *all_tags_test,
]

In [None]:
# English stop words from NLTK, kept as a set for cheap membership tests.
stop_words = set(nltk.corpus.stopwords.words("english"))
# scikit-learn validates ``stop_words`` and accepts 'english', a list or None,
# so the set is handed over as a list (sorted only to make the order stable).
# min_df=2 drops terms that occur in fewer than two documents.
vectorizer = TfidfVectorizer(stop_words=sorted(stop_words), min_df=2)
# Vocabulary and IDF weights are learned from descriptions and tags of both
# splits (see ``all_docs``).
vectorizer.fit(all_docs);

In [None]:
# Project every text collection onto the fitted TF-IDF vocabulary.
train_desc, test_desc, train_tags, test_tags = (
    vectorizer.transform(corpus)
    for corpus in (all_desc_train, all_desc_test, all_tags_train, all_tags_test)
)

In [None]:
# Replace each TF-IDF result by a plain dense ndarray.
# NOTE(review): the names are rebound in place, so this cell cannot be run
# twice in a row (an ndarray has no ``todense``).
train_desc, test_desc, train_tags, test_tags = (
    np.array(tfidf_matrix.todense())
    for tfidf_matrix in (train_desc, test_desc, train_tags, test_tags)
)

In [None]:
# Header-less CSV feature tables, listed in the order they are unpacked below.
_feature_csvs = (
    '../features_train/features_resnet1000_train.csv',
    '../features_train/features_resnet1000intermediate_train.csv',
    '../features_test/features_resnet1000_test.csv',
    '../features_test/features_resnet1000intermediate_test.csv',
)
train_1000, train_2048, test_1000, test_2048 = (
    pd.read_csv(csv_path, header=None) for csv_path in _feature_csvs
)

In [None]:
def get_num(string):
    """Return the first purely numeric token found in ``string``.

    '.' and '/' are treated as separators in addition to whitespace, so a
    value such as ``'images_train/42.jpg'`` yields ``42``.

    Raises:
        IndexError: if ``string`` contains no all-digit token.
    """
    tokens = string.replace('.', ' ').replace('/', ' ').split()
    numbers = [int(token) for token in tokens if token.isdigit()]
    return numbers[0]

def parse_to_numpy(pd):
    """Return the feature columns of a table as an array ordered by image number.

    Args:
        pd: DataFrame whose column ``0`` holds strings containing the image
            number (parsed with ``get_num``); all other columns are returned.
            The parameter name shadows the usual pandas alias inside this
            function and is kept only for backward compatibility.

    Returns:
        ``numpy.ndarray`` with one row per input row, sorted by ascending image
        number, without column ``0``.

    The input frame is left untouched: the previous implementation inserted a
    helper column into the caller's DataFrame before sorting.
    """
    image_ids = [get_num(name) for name in pd[0]]
    # ``sorted`` is stable, so rows with equal ids keep their original order.
    row_order = sorted(range(len(image_ids)), key=image_ids.__getitem__)
    return pd.iloc[row_order].drop(columns=[0]).to_numpy()

In [None]:
# Convert every feature table to an array ordered by image number.
train_1000, train_2048, test_1000, test_2048 = map(
    parse_to_numpy,
    (train_1000, train_2048, test_1000, test_2048),
)

In [None]:
# Image representation: both feature tables of a split, side by side.
# The tag vectors (train_tags / test_tags) are not appended here.
train_pic, test_pic = (
    np.hstack(feature_pair)
    for feature_pair in ((train_1000, train_2048), (test_1000, test_2048))
)

In [14]:
### Universal Sentence Encoder (v4) embeddings from TF Hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# Stored under separate *_tf names; the TF-IDF matrices train_desc / test_desc
# are left as they are.
train_desc_tf, test_desc_tf = (
    embed(sentences).numpy()
    for sentences in (all_desc_train, all_desc_test)
)

In [15]:
# word2vec
# path = get_tmpfile("word2vec.model")
# model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
# model.save("word2vec.model")
# model = Word2Vec.load("word2vec.model")
# model.train([all_desc_train], total_examples=1, epochs=1)

In [16]:
# Shapes of the description and image matrices for both splits.
tuple(matrix.shape for matrix in (train_desc, train_pic, test_desc, test_pic))

((10000, 5931), (10000, 3048), (2000, 5931), (2000, 3048))

In [None]:
### Reduce the image features with PCA
# n_components: dimensionality of the reduced image representation.
# random_state: with the default svd_solver='auto' scikit-learn may pick a
# randomized SVD for a matrix of this size; pinning the seed makes the
# projection (and every score computed from it) reproducible across runs.
pca = PCA(n_components=100, random_state=0)
# The projection is learned on the training images only and then applied to
# both splits.
pca.fit(train_pic)
# NOTE(review): train_pic / test_pic are overwritten, so re-running this cell
# would fit a second PCA on already-reduced data.
train_pic = pca.transform(train_pic)
test_pic = pca.transform(test_pic)

In [None]:
# Shapes of the description and image matrices for both splits.
tuple(matrix.shape for matrix in (train_desc, train_pic, test_desc, test_pic))

In [None]:
def get_prediction(vecs,pics):
    """Rank all pictures for every query by mean cosine distance.

    Args:
        vecs: sequence of 2-D prediction matrices, one per model, each with one
            row per query. A single 2-D ndarray is also accepted and treated as
            a one-element sequence.
        pics: 2-D matrix of picture vectors; it must have as many rows as each
            prediction matrix (query ``i`` is compared against every picture).

    Returns:
        Integer ndarray of shape ``(n, n)``; row ``i`` lists picture indices
        from closest to farthest for query ``i``.

    Raises:
        ValueError: if ``vecs`` is empty.
        AssertionError: if any prediction matrix has a different number of rows
            than ``pics``.
    """
    # A bare 2-D array would otherwise be iterated row by row.
    if isinstance(vecs, np.ndarray) and vecs.ndim == 2:
        vecs = [vecs]
    if len(vecs) == 0:
        raise ValueError("get_prediction needs at least one prediction matrix")

    n_pics = pics.shape[0]
    # Check every matrix up front, not only the first one.
    for position, vec in enumerate(vecs):
        assert vec.shape[0] == n_pics, (
            "prediction matrix %d has %d rows, expected %d"
            % (position, vec.shape[0], n_pics)
        )

    dists = np.zeros((n_pics, n_pics))
    for vec in vecs:
        dists += pairwise_distances(vec, pics, metric='cosine')
    # Average over the models so every one contributes equally.
    dists = dists / len(vecs)
    return dists.argsort(1)
def map_20(ranks):
    """Mean top-20 score of a collection of ranks.

    A rank ``r`` below 20 scores ``(20 - r) / 20``; any other rank scores 0.
    The result is the mean of these scores.
    """
    positions = np.asarray(list(ranks))
    scores = (20 - positions) / 20
    # Everything that is not strictly inside the top 20 earns nothing.
    scores[~(positions < 20)] = 0
    return np.mean(scores)
def evaluate(vectors,label_vectors):
    """Score predictions against the pictures they should retrieve.

    ``vectors`` and ``label_vectors`` are forwarded to ``get_prediction``.
    For query ``i`` the rank is the position of picture ``i`` in its ranking.

    Returns:
        Tuple ``(mean rank, map_20 score)``.
    """
    rankings = get_prediction(vectors, label_vectors)
    ranks = []
    for query_idx, ranking in enumerate(rankings):
        # Position at which the matching picture appears in this ranking.
        ranks.append(np.argwhere(ranking == query_idx)[0][0])
    return np.mean(ranks), map_20(ranks)
def get_top_20(descr_id, predictions=None):
    """Return the first 20 ranked picture ids for one description.

    Args:
        descr_id: row index of the description in the ranking matrix.
        predictions: ranking matrix to read from (one row per description).
            When omitted, the notebook-level variable ``preds`` is used, which
            keeps existing one-argument calls working.

    Raises:
        NameError: if ``predictions`` is omitted and no global ``preds`` exists.
    """
    if predictions is None:
        try:
            predictions = preds
        except NameError:
            raise NameError(
                "get_top_20 needs a ranking: pass `predictions` or define the "
                "global `preds` first"
            ) from None
    return predictions[descr_id][:20]
def save_submission():
    """Write ``submission.csv`` with the top-20 pictures of descriptions 0..1999.

    Each row pairs ``<id>.txt`` with a space-separated list of ``<pic>.jpg``
    names taken from ``get_top_20``.
    """
    rows = [
        [
            '%d.txt' % (desc_id,),
            ' '.join('%d.jpg' % (pic_id,) for pic_id in get_top_20(desc_id)),
        ]
        for desc_id in range(2000)
    ]
    # NOTE(review): the header spelling 'Descritpion_ID' is reproduced as is;
    # confirm against the required submission format before changing it.
    submission = pd.DataFrame(rows, columns=['Descritpion_ID','Top_20_Image_IDs'])
    submission.to_csv('submission.csv', index=False)

In [22]:
# 5-fold cross-validation: Ridge regression from description features to the
# image features, scored with evaluate() on the held-out fold.
kf = KFold(n_splits=5)
regr = Ridge()

for fit_idx, val_idx in kf.split(train_pic):
    pred = regr.fit(train_desc[fit_idx], train_pic[fit_idx]).predict(train_desc[val_idx])
    output = evaluate([pred], train_pic[val_idx])
    print(output)

(34.9365, 0.506475)
(36.5815, 0.5003749999999999)
(35.4395, 0.495025)
(33.8835, 0.48664999999999997)
(36.488, 0.4963)


In [23]:
# 5-fold cross-validation with two Ridge fits per fold (train_desc and
# train_desc_tf inputs); evaluate() averages the distances of both predictions.
kf = KFold(n_splits=5)
regr = Ridge()

for fit_idx, val_idx in kf.split(train_pic):
    # The same estimator object is refit; the first prediction is taken
    # before the second fit replaces the learned coefficients.
    pred = regr.fit(train_desc[fit_idx], train_pic[fit_idx]).predict(train_desc[val_idx])
    pred2 = regr.fit(train_desc_tf[fit_idx], train_pic[fit_idx]).predict(train_desc_tf[val_idx])
    output = evaluate([pred, pred2], train_pic[val_idx])
    print(output)

(36.097, 0.48264999999999997)
(36.131, 0.48297500000000004)
(35.9415, 0.480125)
(34.4185, 0.4703)
(37.097, 0.47382500000000005)


In [24]:
# 5-fold cross-validation on the TF-IDF features stacked with the sentence
# embeddings.
kf = KFold(n_splits=5)
regr = Ridge()
# Keep only the TF-IDF columns before stacking so that re-running this cell
# does not append the embedding columns a second time. On the first run the
# slice covers the whole matrix and changes nothing.
n_tfidf = len(vectorizer.vocabulary_)
train_desc = np.hstack([train_desc[:, :n_tfidf], train_desc_tf])
# NOTE(review): test_desc is not extended here; it would need test_desc_tf
# appended the same way before predicting with a model fit on train_desc.
for train_index, test_index in kf.split(train_pic):
    regr.fit(train_desc[train_index], train_pic[train_index])
    pred = regr.predict(train_desc[test_index])
    output = evaluate([pred], train_pic[test_index])
    print(output)

(33.367, 0.51175)
(33.469, 0.51195)
(33.8685, 0.5110750000000001)
(31.6135, 0.49965)
(34.629, 0.506875)


In [25]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Degree-2 interaction terms (products of distinct columns, no bias column).
# NOTE(review): with d input columns the output has d + d*(d-1)/2 columns.
# train_desc has at least 5931 columns (see the shapes printed earlier), i.e.
# more than 17 million output columns per row as a dense array - confirm this
# fits in memory before running.
poly = PolynomialFeatures(include_bias=False, interaction_only=True)
poly.fit(train_desc)
train_desc_b = poly.transform(train_desc)

In [None]:
# (rows, columns) of the description feature matrix fed to PolynomialFeatures.
train_desc.shape

In [None]:
# (rows, columns) of the matrix returned by poly.fit_transform.
train_desc_b.shape

In [21]:
# regr.fit(train_desc, train_pic)
# prediction = regr.predict(test_desc)
# preds = get_prediction(prediction, test_pic)
# save_submission()

In [20]:
from sklearn.pipeline import Pipeline

In [21]:
# TF-IDF + Ridge in one estimator, so the vocabulary is learned from whatever
# texts are passed to fit().
# scikit-learn validates ``stop_words`` and accepts 'english', a list or None,
# so the stop-word collection is passed as a (sorted) list.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words), min_df=2)),
    ('ridge', Ridge()),
])

In [26]:
# Shuffled 5-fold cross-validation of the TF-IDF + Ridge pipeline.
# random_state pins the shuffle so fold membership and scores are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
# ndarray form allows selecting the raw texts with the fold index arrays.
all_desc_train = np.array(all_desc_train)
all_desc_test = np.array(all_desc_test)

for train_index, test_index in kf.split(train_pic):
    pipeline.fit(all_desc_train[train_index], train_pic[train_index])
    pred = pipeline.predict(all_desc_train[test_index])
    # evaluate()/get_prediction() take a list of prediction matrices; passing
    # the bare array trips the row-count assertion in get_prediction.
    output = evaluate([pred], train_pic[test_index])
    print(output)

(37.6075, 0.48457500000000003)
(36.19, 0.50375)
(33.9805, 0.48605)
(35.2325, 0.509925)
(33.248, 0.49900000000000005)
