In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import PCA

import nltk
from nltk.stem.lancaster import LancasterStemmer
from textblob import TextBlob
import string
import os
import re
import gc
import itertools
from copy import deepcopy as copy
from collections import Counter

import tensorflow_hub as hub
import gensim

In [2]:
### Descriptions for train data
# The loop opens 0.txt .. (n-1).txt, so it relies on the directory holding
# exactly those consecutively numbered files.
desc_files = len(os.listdir('../descriptions_train'))
all_desc_train = []

for i in range(desc_files):
    # `with` closes each handle as soon as the file is read instead of leaving
    # it open until the garbage collector gets to it.
    with open(f'../descriptions_train/{i}.txt') as desc_file:
        # Flatten the file's lines into one space-separated document.
        all_desc_train.append(desc_file.read().replace('\n', ' '))

In [3]:
### Tags for train data
# The loop opens 0.txt .. (n-1).txt, so it relies on the directory holding
# exactly those consecutively numbered files.
tag_files = len(os.listdir('../tags_train'))
all_tags_train = []

for i in range(tag_files):
    # `with` closes each handle as soon as the file is read.
    with open(f'../tags_train/{i}.txt') as tag_file:
        # Turn both the ':' separators and the line breaks into spaces so each
        # file becomes one whitespace-separated document.
        all_tags_train.append(tag_file.read().replace(':', ' ').replace('\n', ' '))

In [4]:
### Descriptions for test data
# The loop opens 0.txt .. (n-1).txt, so it relies on the directory holding
# exactly those consecutively numbered files.
desc_files = len(os.listdir('../descriptions_test'))
all_desc_test = []

for i in range(desc_files):
    # `with` closes each handle as soon as the file is read instead of leaving
    # it open until the garbage collector gets to it.
    with open(f'../descriptions_test/{i}.txt') as desc_file:
        # Flatten the file's lines into one space-separated document.
        all_desc_test.append(desc_file.read().replace('\n', ' '))

In [5]:
### Tags for test data
# The loop opens 0.txt .. (n-1).txt, so it relies on the directory holding
# exactly those consecutively numbered files.
tag_files = len(os.listdir('../tags_test'))
all_tags_test = []

for i in range(tag_files):
    # `with` closes each handle as soon as the file is read.
    with open(f'../tags_test/{i}.txt') as tag_file:
        # Turn both the ':' separators and the line breaks into spaces so each
        # file becomes one whitespace-separated document.
        all_tags_test.append(tag_file.read().replace(':', ' ').replace('\n', ' '))

In [6]:
# One flat corpus of every text document: train/test descriptions first,
# then train/test tags.
all_docs = [
    *all_desc_train,
    *all_desc_test,
    *all_tags_train,
    *all_tags_test,
]

In [7]:
# Load the four header-less feature tables (the *_1000 and *_2048 variants for
# the train and test splits) in one pass.
train_1000, train_2048, test_1000, test_2048 = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        '../features_train/features_resnet1000_train.csv',
        '../features_train/features_resnet1000intermediate_train.csv',
        '../features_test/features_resnet1000_test.csv',
        '../features_test/features_resnet1000intermediate_test.csv',
    )
)

In [8]:
def get_num(string):
    """Return the first all-digit token found in ``string`` as an int.

    ``.`` and ``/`` are treated as separators, so a path-like value such as
    ``'images_train/12.jpg'`` yields ``12``.

    Raises:
        IndexError: if ``string`` contains no all-digit token.
    """
    tokens = string.replace('.', ' ').replace('/', ' ').split()
    numbers = [int(token) for token in tokens if token.isdigit()]
    return numbers[0]


def parse_to_numpy(pd):
    """Return the feature columns of a frame as an array ordered by image index.

    Args:
        pd: DataFrame whose column ``0`` holds a name containing the image
            number (parsed with ``get_num``) and whose remaining columns hold
            the feature values. The parameter name is kept for backward
            compatibility; note that it shadows the pandas alias inside this
            function.

    Returns:
        2-D numpy array with column ``0`` removed and rows sorted by ascending
        image number.

    The input frame is left untouched. The previous implementation inserted a
    helper column into the caller's frame, so a second call on the same frame
    raised because the column already existed.
    """
    images_idx = [get_num(name) for name in pd[0]]
    # A stable argsort gives the row order without adding a temporary column.
    order = np.argsort(images_idx, kind='stable')
    return pd.drop(columns=[0]).iloc[order].to_numpy()

In [9]:
# Replace each raw feature frame with its index-sorted numpy matrix.
# NOTE: this rebinds the same names, so the cell is not re-runnable without
# reloading the CSVs first.
train_1000, train_2048, test_1000, test_2048 = map(
    parse_to_numpy,
    (train_1000, train_2048, test_1000, test_2048),
)

In [10]:
### Google embedding
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
train_desc_tf = embed(all_desc_train).numpy()
test_desc_tf = embed(all_desc_test).numpy()
train_tags_tf = embed(all_tags_train).numpy()
test_tags_tf = embed(all_tags_test).numpy()

In [11]:
# Fetch the NLTK resources this notebook relies on. The stopword list is read
# through nltk.corpus.stopwords in the preprocessing cell; the tokenizer and
# POS-tagger models are presumably what TextBlob's `.tags` needs in
# filter_non_nouns -- TODO confirm.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [12]:
# Use the first training description as a sample for the preprocessing previews.
doc = all_desc_train[0]

In [13]:
# Display the raw sample document.
doc

'The skateboarder is putting on a show using the picnic table as his stage. A skateboarder pulling tricks on top of a picnic table. A man riding on a skateboard on top of a table. A skate boarder doing a trick on a picnic table. A person is riding a skateboard on a picnic table with a crowd watching. '

In [16]:
# Preview punctuation stripping on the sample document.
# NOTE(review): no_punctuation is defined in cell In [15], which appears later
# in this file; that cell must run first.
no_punctuation(doc)

'The skateboarder is putting on a show using the picnic table as his stage A skateboarder pulling tricks on top of a picnic table A man riding on a skateboard on top of a table A skate boarder doing a trick on a picnic table A person is riding a skateboard on a picnic table with a crowd watching '

In [17]:
# Preview stop-word removal on the sample document.
# NOTE(review): remove_stops is defined in cell In [15], which appears later
# in this file; that cell must run first.
remove_stops(doc)

'skateboarder putting show using picnic table stage. skateboarder pulling tricks top picnic table. man riding skateboard top table. skate boarder trick picnic table. person riding skateboard picnic table crowd watching.'

In [18]:
# Preview noun-only filtering on the sample document.
# NOTE(review): filter_non_nouns is defined in cell In [15], which appears
# later in this file; that cell must run first.
filter_non_nouns(doc)

'skateboarder show picnic table stage skateboarder tricks top picnic table man skateboard top table skate boarder trick picnic table person skateboard picnic table crowd watching'

In [19]:
# Sanity check on a tiny phrase: only tokens tagged NN/NNS should survive.
filter_non_nouns('we woman went')

'woman'

In [15]:
# Shared text-preprocessing resources read by the helper functions below.
stop_words = set(nltk.corpus.stopwords.words('english'))
stemmer = LancasterStemmer()

punct = set(string.punctuation)
def no_punctuation(word):
    """Return ``word`` with every character from ``string.punctuation`` removed."""
    kept_chars = [char for char in word if char not in punct]
    return ''.join(kept_chars)

def remove_stops(doc):
    """Return ``doc`` with stop words dropped; remaining tokens are joined by single spaces.

    A token is dropped when its punctuation-free, lower-cased form is in
    ``stop_words``. Kept tokens retain their original case and punctuation.
    """
    kept_tokens = []
    for token in doc.split():
        if no_punctuation(token).lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def stem_doc(doc):
    """Return ``doc`` with every whitespace-separated token replaced by its stem."""
    stemmed_tokens = [stemmer.stem(token) for token in doc.split()]
    return ' '.join(stemmed_tokens)

def filter_non_nouns(doc):
    """Return only the tokens of ``doc`` that TextBlob tags as ``NN`` or ``NNS``, space-joined."""
    noun_tags = ('NN', 'NNS')
    tagged_tokens = TextBlob(doc).tags
    return ' '.join(entry[0] for entry in tagged_tokens if entry[1] in noun_tags)

def _preprocess_for_tfidf(docs, stem, filter_nn):
    """Remove stop words, then optionally stem and keep nouns only, for each document."""
    docs = [remove_stops(doc) for doc in docs]
    if stem:
        docs = [stem_doc(doc) for doc in docs]
    if filter_nn:
        docs = [filter_non_nouns(doc) for doc in docs]
    return docs


def get_tfidf_vectors(stem,filter_nn,sublin_tf,ngram_tfidf,pca_comps):
    """Build PCA-reduced TF-IDF vectors for the description and tag corpora.

    Args:
        stem: stem every token before vectorising.
        filter_nn: keep only tokens tagged as nouns before vectorising.
        sublin_tf: forwarded to ``TfidfVectorizer(sublinear_tf=...)``.
        ngram_tfidf: use unigrams and bigrams when true, unigrams only otherwise.
        pca_comps: number of PCA components kept.

    Returns:
        Tuple ``(train_desc, test_desc, train_tags, test_tags)`` of PCA-projected
        TF-IDF matrices, one row per document.

    The vocabulary is fitted on the train descriptions and train tags only,
    whereas the PCA is fitted on all four matrices stacked together (train and
    test), exactly as before.
    """
    # The list comprehensions in the helper build new lists and strings are
    # immutable, so the module-level corpora are never modified and no deep
    # copy is needed.
    desc_train_, desc_test_, tags_train_, tags_test_ = (
        _preprocess_for_tfidf(docs, stem, filter_nn)
        for docs in (all_desc_train, all_desc_test, all_tags_train, all_tags_test)
    )

    # scikit-learn documents `stop_words` as 'english', a list, or None; recent
    # releases validate the type and reject a set, so pass a list.
    vectorizer = TfidfVectorizer(stop_words=sorted(stop_words), min_df=2,sublinear_tf=sublin_tf,
                                 ngram_range=(1,2) if ngram_tfidf else (1,1))
    vectorizer.fit(desc_train_ + tags_train_)

    train_desc_bow = vectorizer.transform(desc_train_).toarray()
    test_desc_bow = vectorizer.transform(desc_test_).toarray()
    train_tags_bow = vectorizer.transform(tags_train_).toarray()
    test_tags_bow = vectorizer.transform(tags_test_).toarray()

    pca = PCA(n_components=pca_comps)
    pca.fit(np.vstack([train_desc_bow,test_desc_bow,train_tags_bow,test_tags_bow]))
    train_desc_bow_pca = pca.transform(train_desc_bow)
    test_desc_bow_pca = pca.transform(test_desc_bow)
    train_tags_bow_pca = pca.transform(train_tags_bow)
    test_tags_bow_pca = pca.transform(test_tags_bow)

    return train_desc_bow_pca, test_desc_bow_pca, train_tags_bow_pca, test_tags_bow_pca

In [20]:
# word2vec
# TF-Hub word-embedding module; get_word2vec_vectors calls it with a one-word
# list and stores the flattened result in a 500-wide row.
word2vec_embed = hub.load("https://tfhub.dev/google/Wiki-words-500/2")

def _tfidf_weighted_word_vectors(docs, vectorizer, sublin_tf, embedding_cache, dim=500):
    """Return one TF-IDF-weighted average word embedding per document.

    Words missing from ``vectorizer.vocabulary_`` are skipped. A document with
    no known word keeps an all-zero row instead of becoming NaN through 0/0.
    ``embedding_cache`` maps word -> embedding and is filled lazily so each
    distinct word is embedded only once.
    """
    doc_vectors = np.zeros((len(docs), dim))
    for row, doc in enumerate(docs):
        total = 0
        for word, count in Counter(doc.split()).items():
            index = vectorizer.vocabulary_.get(word)
            if index is None:
                continue
            if word not in embedding_cache:
                embedding_cache[word] = np.ravel(word2vec_embed([word]))
            term_freq = 1 + np.log(count) if sublin_tf else count
            weight = term_freq * vectorizer.idf_[index]  ## tfidf weight
            doc_vectors[row] += weight * embedding_cache[word]
            total += weight
        if total != 0:
            doc_vectors[row] /= total
    return doc_vectors


def get_word2vec_vectors(filter_nn,sublin_tf):
    """Build TF-IDF-weighted average word2vec vectors for all four corpora.

    Args:
        filter_nn: keep only tokens tagged as nouns before embedding.
        sublin_tf: use ``1 + log(count)`` instead of the raw count as the term
            frequency in the weight.

    Returns:
        Tuple ``(train_desc, test_desc, train_tags, test_tags)`` of arrays with
        one 500-wide row per document.

    Changes from the earlier version: row counts follow the corpus sizes
    instead of the hard-coded 10000/2000, the description branches now get the
    same zero-weight guard the tag branches already had, and each distinct
    word is embedded once rather than once per occurrence.
    """
    def _prepare(docs):
        # Same order as before: strip punctuation, lower-case, drop stop words,
        # then optionally keep nouns only.
        docs = [remove_stops(no_punctuation(doc).lower()) for doc in docs]
        if filter_nn:
            docs = [filter_non_nouns(doc) for doc in docs]
        return docs

    desc_train_, desc_test_, tags_train_, tags_test_ = (
        _prepare(docs)
        for docs in (all_desc_train, all_desc_test, all_tags_train, all_tags_test)
    )

    # The IDF statistics come from all four corpora together. `stop_words` is
    # passed as a list because recent scikit-learn releases reject a set.
    w2v_vectorizer = TfidfVectorizer(stop_words=sorted(stop_words), min_df=1)
    w2v_vectorizer.fit(desc_train_ + desc_test_ + tags_train_ + tags_test_)

    embedding_cache = {}
    word2vec_train_desc = _tfidf_weighted_word_vectors(desc_train_, w2v_vectorizer, sublin_tf, embedding_cache)
    word2vec_test_desc = _tfidf_weighted_word_vectors(desc_test_, w2v_vectorizer, sublin_tf, embedding_cache)
    word2vec_train_tags = _tfidf_weighted_word_vectors(tags_train_, w2v_vectorizer, sublin_tf, embedding_cache)
    word2vec_test_tags = _tfidf_weighted_word_vectors(tags_test_, w2v_vectorizer, sublin_tf, embedding_cache)

    return word2vec_train_desc, word2vec_test_desc, word2vec_train_tags, word2vec_test_tags

In [46]:
def get_prediction_cos(vecs,pics):
    """Rank ``vecs`` for every row of ``pics`` by ascending cosine distance.

    Returns an integer array with one row per ``pics`` row; each row lists the
    indices of ``vecs`` from nearest to farthest.
    """
    cosine_dists = pairwise_distances(vecs, pics, metric='cosine')
    return np.argsort(cosine_dists.T, axis=1)
def get_prediction_euc(vecs,pics):
    """Rank ``vecs`` for every row of ``pics`` by ascending distance under the
    default metric of ``pairwise_distances``.

    Returns an integer array with one row per ``pics`` row; each row lists the
    indices of ``vecs`` from nearest to farthest.
    """
    default_dists = pairwise_distances(vecs, pics)
    return np.argsort(default_dists.T, axis=1)
def map_20(ranks):
    """Mean linear top-20 score: a rank ``r < 20`` scores ``(20 - r) / 20``, anything else 0."""
    scores = []
    for rank in ranks:
        scores.append((20 - rank) / 20 if rank < 20 else 0)
    return np.mean(scores)
def map_20_2(ranks):
    """Mean reciprocal top-20 score: a rank ``r < 20`` scores ``1 / (1 + r)``, anything else 0."""
    scores = []
    for rank in ranks:
        scores.append(1 / (1 + rank) if rank < 20 else 0)
    return np.mean(scores)
def evaluate(vectors,label_vectors,w1=1,w2=1,w3=1,splits=(512,1212)):
    """Score how well each row of ``vectors`` retrieves its own row of ``label_vectors``.

    Both matrices are split column-wise into three groups at ``splits``; the
    groups are scaled by ``w1``, ``w2`` and ``w3`` and the result is divided by
    ``w1 + w2 + w3``. Row ``i`` of ``vectors`` is assumed to correspond to row
    ``i`` of ``label_vectors``. For each label row the ``vectors`` rows are
    ranked by distance, and the rank is the position of the matching index.

    Args:
        vectors: 2-D array of predicted vectors.
        label_vectors: 2-D array of target vectors with the same column layout.
        w1, w2, w3: weights of the three column groups. They default to 1 so the
            existing two-argument calls work; equal weights only rescale both
            matrices uniformly, which does not change the rankings.
        splits: ``(end_of_first_group, end_of_second_group)`` column indices.
            The default ``(512, 1212)`` reproduces the former hard-coded
            slices; pass other boundaries when the middle group is not 700
            columns wide.

    Returns:
        Tuple ``(mean_rank_cos, mean_rank_euc, map_20_cos, map_20_euc,
        map_20_2_cos, map_20_2_euc)``.
    """
    first_end, second_end = splits
    weight_sum = w1 + w2 + w3

    def _reweight(matrix):
        # Work on a copy so the caller's arrays are left untouched.
        matrix = np.copy(matrix)
        matrix[:, :first_end] = w1 * matrix[:, :first_end]
        matrix[:, first_end:second_end] = w2 * matrix[:, first_end:second_end]
        matrix[:, second_end:] = w3 * matrix[:, second_end:]
        return matrix / weight_sum

    vectors = _reweight(vectors)
    label_vectors = _reweight(label_vectors)

    preds1 = get_prediction_cos(vectors,label_vectors)
    ranks1 = [np.argwhere(vec==i)[0][0] for i,vec in enumerate(preds1)]
    preds2 = get_prediction_euc(vectors,label_vectors)
    ranks2 = [np.argwhere(vec==i)[0][0] for i,vec in enumerate(preds2)]
    return np.mean(ranks1),np.mean(ranks2),map_20(ranks1),map_20(ranks2),map_20_2(ranks1),map_20_2(ranks2)
def get_top_20(descr_id):
    """Return the first 20 entries of row ``descr_id`` of the module-level ``preds`` ranking."""
    ranking = preds[descr_id]
    return ranking[:20]
def save_submission():
    """Write ``submission.csv`` with the top-20 image file names for descriptions 0..1999.

    Each row pairs ``<id>.txt`` with a space-separated list of ``<pic>.jpg``
    names taken from ``get_top_20``.
    """
    rows = []
    for descr_id in range(2000):
        image_names = ['%d.jpg' % (pic_id,) for pic_id in get_top_20(descr_id)]
        rows.append(['%d.txt' % (descr_id,), ' '.join(image_names)])
    # The header spelling is reproduced exactly as it was -- presumably the
    # required submission header; confirm before "fixing" it.
    submission = pd.DataFrame(rows, columns=['Descritpion_ID','Top_20_Image_IDs'])
    submission.to_csv('submission.csv', index=False)

In [23]:
# Hyper-parameter grid for the TF-IDF and word2vec feature builders.
stem_tfidf = [True,False]
filter_nn_tfidf = [False]
sublin_tf_tfidf = [True]
ngram_tfidf = [True,False]
filter_nn_w2v = [False]
sublin_tf_w2v = [True]
pca_comps = [700,900,1100]

# Create the result/cache dicts only when they do not exist yet: a fresh kernel
# no longer hits a NameError, and re-running the cell keeps what was computed.
for _name in ('results', 'tfidf_cache', 'w2v_cache'):
    globals().setdefault(_name, {})

for st_ti,fn_ti,sl_ti,ng_ti,n_c,fn_wv,sl_wv in itertools.product(stem_tfidf,filter_nn_tfidf,
                                                                 sublin_tf_tfidf,ngram_tfidf,pca_comps,
                                                                 filter_nn_w2v,sublin_tf_w2v):
    gc.collect()

    # NOTE: the results key omits fn_ti, fn_wv and sl_wv. That is unambiguous
    # only while each of those lists holds a single value.
    if (st_ti,sl_ti,ng_ti,n_c) in results:
        continue

    if (st_ti,fn_ti,sl_ti,ng_ti,n_c) in tfidf_cache:
        train_desc_bow_pca, train_tags_bow_pca = tfidf_cache[(st_ti,fn_ti,sl_ti,ng_ti,n_c)]
    else:
        train_desc_bow_pca, _, train_tags_bow_pca, _ = get_tfidf_vectors(st_ti,fn_ti,sl_ti,ng_ti,n_c)
        tfidf_cache[(st_ti,fn_ti,sl_ti,ng_ti,n_c)] = (train_desc_bow_pca, train_tags_bow_pca)

    if (fn_wv,sl_wv) in w2v_cache:
        word2vec_train_desc, word2vec_train_tags = w2v_cache[(fn_wv,sl_wv)]
    else:
        word2vec_train_desc, _, word2vec_train_tags, _ = get_word2vec_vectors(fn_wv,sl_wv)
        w2v_cache[(fn_wv,sl_wv)] = (word2vec_train_desc, word2vec_train_tags)

    train_desc = np.hstack((train_desc_tf, train_desc_bow_pca, word2vec_train_desc))
    train_pic = np.hstack((train_1000, train_tags_tf, train_tags_bow_pca, word2vec_train_tags))

    kf = KFold(n_splits=5)
    rcv = RidgeCV(alphas=np.linspace(1,40,20))
    res_ = []
    for train_index, test_index in kf.split(train_pic):
        rcv.fit(train_pic[train_index], train_desc[train_index])
        pred = rcv.predict(train_pic[test_index])
        # evaluate() requires the three group weights; equal weights only
        # rescale both matrices uniformly, so this is the unweighted ranking.
        output = evaluate(pred, train_desc[test_index], 1, 1, 1)
        print(output)
        res_.append(output[-1])

    results[(st_ti,sl_ti,ng_ti,n_c)] = np.mean(res_)
    print(sorted(results.items(),key=lambda x: x[1],reverse=True))

(4.2965, 4.2435, 0.8232, 0.8284, 0.5730184995517038, 0.5768876451936437)
(4.0505, 3.7445, 0.8275249999999998, 0.836825, 0.5833570839527689, 0.5892947439235172)
(4.4555, 4.343, 0.8230999999999999, 0.82685, 0.5830236257765282, 0.5891563196542879)
(3.8695, 3.7345, 0.831225, 0.836475, 0.579873676654443, 0.5809441326904948)
(4.2235, 4.112, 0.8278, 0.8299249999999999, 0.5828499868230324, 0.5819055849744511)
[((True, True, False, 900), 0.5883695643739644), ((True, True, False, 700), 0.5880004074464408), ((True, True, False, 1100), 0.5874003503892049), ((True, True, True, 1100), 0.5843608096143621), ((True, True, True, 900), 0.5836769117333475), ((False, True, True, 700), 0.5836376852872789), ((True, True, True, 700), 0.5814386591646228), ((True, False, True, 700), 0.5693471030337052)]
(4.293, 4.2345, 0.82325, 0.82845, 0.57414600576582, 0.5782496102903804)
(4.0195, 3.726, 0.82905, 0.837475, 0.5852342213879171, 0.5898927006730799)
(4.424, 4.3015, 0.823875, 0.827975, 0.5842359075522001, 0.589901

In [24]:
# Cross-validate with sentence-encoder + TF-IDF features only (no word2vec).
# Relies on the grid-search cell having cached this TF-IDF configuration.
train_desc_bow_pca, train_tags_bow_pca = tfidf_cache[(False,False,True,False,700)]
train_desc = np.hstack((train_desc_tf, train_desc_bow_pca))
train_pic = np.hstack((train_1000, train_tags_tf, train_tags_bow_pca))
kf = KFold(n_splits=5)
rcv = RidgeCV(alphas=np.linspace(1,40,20))
res_ = []
for train_index, test_index in kf.split(train_pic):
    rcv.fit(train_pic[train_index], train_desc[train_index])
    pred = rcv.predict(train_pic[test_index])
    # evaluate() requires the three group weights; equal weights only rescale
    # both matrices uniformly, so this is the unweighted ranking.
    output = evaluate(pred, train_desc[test_index], 1, 1, 1)
    print(output)

(4.4195, 4.396, 0.821425, 0.8272, 0.5748273957702944, 0.5778663969243689)
(4.099, 3.7885, 0.8257000000000001, 0.835875, 0.5815175083903025, 0.5841478713258706)
(4.5585, 4.44, 0.818425, 0.8235750000000001, 0.5836825363383608, 0.5863966236034347)
(3.9135, 3.7205, 0.8317749999999999, 0.8389, 0.5822963375397392, 0.5840442069948456)
(4.3575, 4.1735, 0.82325, 0.8274, 0.579281077558922, 0.579444107584452)


In [25]:
# Cross-validate the full feature stack with a 1300-component TF-IDF PCA.
# Relies on the grid-search cell having cached the word2vec vectors.
train_desc_bow_pca, _, train_tags_bow_pca, _ = get_tfidf_vectors(False,False,True,False,1300)
word2vec_train_desc, word2vec_train_tags = w2v_cache[(False,True)]
train_desc = np.hstack((train_desc_tf, train_desc_bow_pca, word2vec_train_desc))
train_pic = np.hstack((train_1000, train_tags_tf, train_tags_bow_pca, word2vec_train_tags))
kf = KFold(n_splits=5)
rcv = RidgeCV(alphas=np.linspace(1,40,20))
res_ = []
for train_index, test_index in kf.split(train_pic):
    rcv.fit(train_pic[train_index], train_desc[train_index])
    pred = rcv.predict(train_pic[test_index])
    # evaluate() requires the three group weights; equal weights only rescale
    # both matrices uniformly, so the hard-coded column boundaries are
    # irrelevant here and this is the unweighted ranking.
    output = evaluate(pred, train_desc[test_index], 1, 1, 1)
    print(output)
    res_.append(output[-1])
np.mean(res_)

(4.301, 4.249, 0.82425, 0.8305, 0.5784054270505896, 0.5842499941707758)
(4.0135, 3.698, 0.830025, 0.838275, 0.5866917884338743, 0.590748007410546)
(4.458, 4.341, 0.822625, 0.828, 0.5893651392553095, 0.5947456455760441)
(3.8415, 3.6515, 0.834125, 0.8412000000000001, 0.5888657914045877, 0.5911152591947096)
(4.17, 4.0355, 0.828625, 0.83175, 0.5882588127837934, 0.5894257836590654)


0.5900569380022282

In [43]:
from collections import defaultdict

In [47]:
# Reuse the cached 700-component TF-IDF and word2vec features, then search
# every (w1, w2, w3) combination drawn from weights_ with 5-fold CV.
train_desc_bow_pca, train_tags_bow_pca = tfidf_cache[(False,False,True,False,700)]
word2vec_train_desc, word2vec_train_tags = w2v_cache[(False,True)]
train_desc = np.hstack([train_desc_tf, train_desc_bow_pca, word2vec_train_desc])
train_pic = np.hstack([train_1000, train_tags_tf, train_tags_bow_pca, word2vec_train_tags])

weights_ = [1,2,3]
results_weightings = defaultdict(list)
kf = KFold(n_splits=5)
rcv = RidgeCV(alphas=np.linspace(1,40,20))
for train_index, test_index in kf.split(train_pic):
    print('print')
    # Fit once per fold; only the evaluation depends on the weights.
    rcv.fit(train_pic[train_index], train_desc[train_index])
    pred = rcv.predict(train_pic[test_index])
    for w1,w2,w3 in itertools.product(weights_, repeat=3):
        output = evaluate(pred, train_desc[test_index],w1,w2,w3)
        results_weightings[(w1,w2,w3)].append(output[-1])

# Collapse each per-fold score list into its mean.
for w1,w2,w3 in itertools.product(weights_, repeat=3):
    fold_scores = results_weightings[(w1,w2,w3)]
    results_weightings[(w1,w2,w3)] = np.mean(fold_scores)

print
print
print
print
print


In [48]:
# Weight combinations ordered from best to worst mean score.
sorted(results_weightings.items(), key=lambda item: item[1], reverse=True)

[((2, 2, 3), 0.5919601457211519),
 ((2, 3, 3), 0.5911941395249057),
 ((1, 1, 1), 0.5903301488900675),
 ((2, 2, 2), 0.5903301488900675),
 ((3, 3, 3), 0.5903301488900675),
 ((1, 1, 2), 0.5896408128438468),
 ((2, 3, 2), 0.587138431424097),
 ((3, 3, 2), 0.586284240767832),
 ((1, 2, 2), 0.5860891286628748),
 ((1, 2, 3), 0.5848957465492024),
 ((2, 2, 1), 0.584596070082738),
 ((2, 3, 1), 0.5832316105544783),
 ((3, 3, 1), 0.5829515530582249),
 ((3, 2, 3), 0.5829358850531994),
 ((1, 2, 1), 0.5787123629170108),
 ((3, 2, 2), 0.5781396779433157),
 ((2, 1, 3), 0.5778691071883912),
 ((1, 3, 3), 0.5770755709740036),
 ((2, 1, 2), 0.5745975562724169),
 ((3, 2, 1), 0.5738383507703168),
 ((1, 3, 2), 0.5715706832645339),
 ((3, 1, 3), 0.5675047832800155),
 ((2, 1, 1), 0.5672605879560756),
 ((1, 1, 3), 0.566640583877337),
 ((1, 3, 1), 0.5649502234117791),
 ((3, 1, 2), 0.5621398572355576),
 ((3, 1, 1), 0.5560987050213289)]

In [49]:
# Rebuild the chosen feature set, this time keeping the test-side matrices too.
(train_desc_bow_pca, test_desc_bow_pca,
 train_tags_bow_pca, test_tags_bow_pca) = get_tfidf_vectors(False,False,True,False,700)
(word2vec_train_desc, word2vec_test_desc,
 word2vec_train_tags, word2vec_test_tags) = get_word2vec_vectors(False,True)

# Description side: sentence embedding | TF-IDF PCA | word2vec.
train_desc = np.hstack([train_desc_tf, train_desc_bow_pca, word2vec_train_desc])
test_desc = np.hstack([test_desc_tf, test_desc_bow_pca, word2vec_test_desc])
# Picture side: the *_1000 feature matrix followed by the same three blocks
# computed on the tags.
train_pic = np.hstack([train_1000, train_tags_tf, train_tags_bow_pca, word2vec_train_tags])
test_pic = np.hstack([test_1000, test_tags_tf, test_tags_bow_pca, word2vec_test_tags])

In [50]:
# Sanity check: the description matrices must share a width, and so must the picture matrices.
train_desc.shape, test_desc.shape, train_pic.shape, test_pic.shape

((10000, 1712), (2000, 1712), (10000, 2712), (2000, 2712))

In [52]:
def weight_wrapper(vectors, label_vectors, w1, w2, w3):
    """Return re-weighted copies of ``vectors`` and ``label_vectors``.

    Columns ``[0, 512)`` are scaled by ``w1``, ``[512, 1212)`` by ``w2`` and
    ``[1212, end)`` by ``w3``; the result is then divided by ``w1 + w2 + w3``.
    The inputs themselves are not modified.
    """
    column_groups = (
        (slice(None, 512), w1),
        (slice(512, 1212), w2),
        (slice(1212, None), w3),
    )
    weight_sum = w1 + w2 + w3

    def _reweight(matrix):
        scaled = np.copy(matrix)
        for columns, weight in column_groups:
            scaled[:, columns] = weight * scaled[:, columns]
        return scaled / weight_sum

    return _reweight(vectors), _reweight(label_vectors)

In [None]:
# Fit on the full training set, project the test pictures into description
# space, apply the (2, 2, 3) group weights and write the submission file.
reg = RidgeCV(alphas=np.linspace(1,40,20)).fit(train_pic, train_desc)
print("best reg:",reg.alpha_)
prediction = reg.predict(test_pic)
prediction_, test_desc_ = weight_wrapper(prediction, test_desc, 2, 2, 3)
# save_submission() reads the module-level `preds` through get_top_20.
preds = get_prediction_euc(prediction_, test_desc_)
save_submission()

Things to try:
* Filtering out non-nouns before TFIDF
* Filtering out non-nouns for word2vec
* Stemming before TFIDF
* Sublinear TF in tfidf
* Sublinear TF in word2vec
* Number of PCA dims for BOW