In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import pairwise_distances
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
import torch
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from sklearn.decomposition import PCA
import itertools

In [2]:
import nltk
import string

def lower(word):
    return word.lower()

lemm = nltk.stem.WordNetLemmatizer()
def lemmatize(word):
    return lemm.lemmatize(word)

punct = set(string.punctuation)
def no_punctuation(word):
    return ''.join(c for c in word if c not in punct)

stop_words = set(nltk.corpus.stopwords.words("english"))
def no_stop_words(word):
    return word if word not in stop_words else ''

strategy_map = {'lo':lower,'lem':lemmatize,
                'punct':no_punctuation,'stop':no_stop_words}

def preprocess(docs,strategies):
    for strategy in strategies:
        new_docs = []
        for doc in docs:
            new_doc = []
            for word in doc:
                transformed = strategy_map[strategy](word)
                if transformed:
                    new_doc.append(transformed)
            new_docs.append(new_doc)
        docs = new_docs
    return docs

In [24]:
documents_train = []
for i in range(10000):
    with open('../descriptions_train/%d.txt' % (i,)) as f:
        documents_train.append(f.read().split())
documents_train = preprocess(documents_train,['lo','punct'])

documents_test = []
for i in range(2000):
    with open('../descriptions_test/%d.txt' % (i,)) as f:
        documents_test.append(f.read().split())
documents_test = preprocess(documents_test,['lo','punct'])

tag_docs_train = []
for i in range(10000):
    with open('../tags_train/%d.txt' % (i,),'r') as f:
        tag_docs_train.append([word for line in f.read().split('\n') for word in line.split(':') if word])
tag_docs_train = preprocess(tag_docs_train,['lo','punct'])

tag_docs_test = []
for i in range(2000):
    with open('../tags_test/%d.txt' % (i,),'r') as f:
        tag_docs_test.append([word for line in f.read().split('\n') for word in line.split(':') if word])
tag_docs_test = preprocess(tag_docs_test,['lo','punct'])

In [36]:
tfidf = TfidfVectorizer(stop_words=stop_words,min_df=2)

In [39]:
tfidf.fit([' '.join(doc) for copus in [documents_train, tag_docs_train] for doc in copus]);

In [49]:
text_train = np.array(tfidf.transform([' '.join(doc) for doc in documents_train]).todense())
text_test = np.array(tfidf.transform([' '.join(doc) for doc in documents_test]).todense())

In [50]:
tags_train = np.array(tfidf.transform([' '.join(doc) for doc in tag_docs_train]).todense())
tags_test = np.array(tfidf.transform([' '.join(doc) for doc in tag_docs_test]).todense())

In [51]:
text_train.shape,text_test.shape

((10000, 5510), (2000, 5510))

In [52]:
tags_train.shape,tags_test.shape

((10000, 5510), (2000, 5510))

In [53]:
pd.Series(np.abs(tags_test.mean(0))).describe()

count    5510.000000
mean        0.000374
std         0.004740
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         0.233749
dtype: float64

In [10]:
feats_train = pd.read_csv('../features_train/features_resnet1000intermediate_train.csv',header=None)
feats_train_b = pd.read_csv('../features_train/features_resnet1000_train.csv',header=None)
feats_test = pd.read_csv('../features_test/features_resnet1000intermediate_test.csv',header=None)
feats_test_b = pd.read_csv('../features_test/features_resnet1000_test.csv',header=None)

In [11]:
feats_train_b.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,1000
0,images_train/5373.jpg,-0.89945,-0.93047,-2.503365,-3.172499,-2.819133,0.992159,-3.698863,0.619991,0.956148,...,-3.021916,2.214253,-1.382491,1.672911,1.014233,2.599949,2.773284,-2.066632,0.385754,-3.24132
1,images_train/984.jpg,-1.346954,-3.119461,-0.765971,-1.38255,-1.104675,-3.656271,-4.815436,-0.556942,-1.402286,...,0.011003,-3.968805,-2.694711,-4.19648,-2.880234,-1.210742,-1.605143,-4.859987,-0.83767,-0.967604
2,images_train/7127.jpg,-3.445498,-1.524573,-1.001654,-3.668335,-1.805517,-1.633496,-7.127826,-1.147802,-1.055816,...,-2.991777,-2.628053,-2.971074,-2.537039,-1.707429,1.013672,0.60846,-3.714998,-0.484735,0.138767


In [12]:
feats_train.shape,feats_test.shape

((10000, 2049), (2000, 2049))

In [13]:
pics_train = np.zeros((10000,3048))
for _,row in feats_train.iterrows():
    try:
        i = int(row[0].split('/')[1].split('.jpg')[0])
    except:
        i = int(row[0].split('/')[1].split('..jpg')[0])
    pics_train[i,:2048] = row.values[1:]
for _,row in feats_train_b.iterrows():
    try:
        i = int(row[0].split('/')[1].split('.jpg')[0])
    except:
        i = int(row[0].split('/')[1].split('..jpg')[0])
    pics_train[i,2048:] = row.values[1:]

In [14]:
pd.Series(pics_train.mean(0)).describe()

count    3048.000000
mean        0.286300
std         0.726504
min        -3.890733
25%         0.259784
50%         0.384236
75%         0.526699
max         4.441978
dtype: float64

In [15]:
pics_test = np.zeros((2000,3048))
for _,row in feats_test.iterrows():
    try:
        i = int(row[0].split('/')[1].split('.jpg')[0])
    except:
        i = int(row[0].split('/')[1].split('..jpg')[0])
    pics_test[i,:2048] = row.values[1:]
for _,row in feats_test_b.iterrows():
    try:
        i = int(row[0].split('/')[1].split('.jpg')[0])
    except:
        i = int(row[0].split('/')[1].split('..jpg')[0])
    pics_test[i,2048:] = row.values[1:]

In [16]:
pd.Series(pics_train.mean(0)).describe()

count    3048.000000
mean        0.286300
std         0.726504
min        -3.890733
25%         0.259784
50%         0.384236
75%         0.526699
max         4.441978
dtype: float64

In [54]:
pics_train = np.hstack([pics_train,tags_train])
pics_test = np.hstack([pics_test,tags_test])

In [21]:
# np.save('text_train_full',text_train)
# np.save('pics_train_full',pics_train)
# np.save('text_test_full',text_test)
# np.save('pics_test_full',pics_test)
text_train = np.load('text_train_full.npy')
pics_train = np.load('pics_train_full.npy')
text_test = np.load('text_test_full.npy')
pics_test = np.load('pics_test_full.npy')

In [22]:
pca = PCA(n_components=100)
pca.fit(pics_train)
pics_train = pca.transform(pics_train)
pics_test = pca.transform(pics_test)

In [23]:
text_train.shape,pics_train.shape

((10000, 5510), (10000, 100))

In [24]:
class PrepareData(Dataset):
    def __init__(self, X, y):
        if not torch.is_tensor(X):
            self.X = torch.tensor(X, requires_grad=True)
        if not torch.is_tensor(y):
            self.y = torch.tensor(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [59]:
def fit(X,y,X_val,y_val,early_stop_window,H_1):
    losses, train_ave, train_map, val_ave, val_map, n_epochs = {}, {}, {}, {}, {}, 0
    
    ds = PrepareData(X=X, y=y)
    dl = DataLoader(ds, batch_size=32, shuffle=True)
    
    device = torch.device('cpu')
    
    D_in, D_out = X.shape[1], y.shape[1]
    
    model = torch.nn.Sequential(
          torch.nn.Linear(D_in, H_1),
#           torch.nn.Dropout(0.2),
#           torch.nn.BatchNorm1d(H_1),
          torch.nn.ReLU(),
          torch.nn.Linear(H_1, D_out),
        ).to(device)
    
    def loss_fn(y_pred, y):
        cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
        return 1 - cos(y_pred, y).mean()
    #     return torch.norm((y_pred-y), p=2, dim=1).mean()
    
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.00001, weight_decay=.3)
    for t in range(100000):
        for ix, (_x, _y) in enumerate(dl):
            _x = Variable(_x).float()
            _y = Variable(_y).float()

            y_pred = model(_x)

            loss = loss_fn(y_pred, _y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        y_pred = model(Variable(ds.X).float())
        train_loss = loss_fn(y_pred, Variable(ds.y).float()).data.numpy()
        losses[t] = train_loss
        print(t,train_loss)

        ave, map_ = evaluate(y_pred.data.numpy(),ds.y.data.numpy())
        train_ave[t] = ave * 2000/X.shape[0]
        train_map[t] = map_ * X.shape[0]/2000
        y_pred_val = model(Variable(torch.from_numpy(X_val)).float())
        ave_val, map_val = evaluate(y_pred_val.data.numpy(),y_val)
        val_ave[t] = ave_val * 2000/X_val.shape[0]
        val_map[t] = map_val * X_val.shape[0]/2000
        n_epochs = t
        print("""Iter %d: Train Ave Rank: %g, Train MAP@20: %g,
         Val Ave Rank: %g, Val MAP@20: %g""" % (t,train_ave[t],train_map[t],val_ave[t],val_map[t]))
        
        if t>=early_stop_window:
            if val_map[t]<val_map[t-early_stop_window]:
                print("EARLY STOP TRIGGERED BECAUSE NO IMPROVEMENT FOR %d ITERATIONS" % (early_stop_window,))
                return model, losses, train_ave, train_map, val_ave, val_map, n_epochs-early_stop_window

#         if (t+1)%15==0:
        for param_group in optimizer.param_groups:
            param_group['lr'] /= 1.05
    
    return model, losses, train_ave, train_map, val_ave, val_map, n_epochs

def predict(model,X):
    return model(Variable(torch.from_numpy(X)).float()).data.numpy()

def get_prediction(vecs,pics):
    dists = pairwise_distances(vecs,pics,metric='cosine')
    return dists.argsort(1)

def map_20(ranks):
    return np.mean([(20-rank)/20 if rank<20 else 0 for rank in ranks])

def evaluate(vectors,label_vectors):
    preds = get_prediction(vectors,label_vectors)
    ranks = [np.argwhere(vec==i)[0][0] for i,vec in enumerate(preds)]
    return np.mean(ranks),map_20(ranks)

In [60]:
def get_pic_mats(n_components):
    pics_train = np.load('pics_train_full.npy')
    pics_test = np.load('pics_test_full.npy')

    pca = PCA(n_components=n_components)
    pca.fit(pics_train)
    pics_train = pca.transform(pics_train)
    pics_test = pca.transform(pics_test)
    return pics_train, pics_test

In [61]:
n_components = [100]#[,300]#[30,100,300]
n_hidden_units = [1024]#[2048,4096]
# results = {}

for n_c, n_h in itertools.product(n_components,n_hidden_units):
    print("************ n_c=%d, n_h=%d ************" % (n_c,n_h))
    pics_train, _ = get_pic_mats(n_c)
    cv = KFold(n_splits=5,shuffle=True,random_state=55)
    n_epochs_results = []
    val_map_results = []
    for train_index,test_index in cv.split(text_train,pics_train):
        rets = fit(text_train[train_index],pics_train[train_index],
                   text_train[test_index],pics_train[test_index],5,n_h)
        model, losses, train_ave, train_map, val_ave, val_map, n_epochs = rets
        n_epochs_results.append(n_epochs)
        val_map_results.append(val_map)
    print(n_epochs_results,np.mean(n_epochs_results))
    best_epoch = int(np.mean(n_epochs_results))
    print([d[best_epoch] for d in val_map_results],np.mean([d[best_epoch] for d in val_map_results]))
    results[(n_c, n_h)] = np.mean([d[best_epoch] for d in val_map_results])

************ n_c=100, n_h=1024 ************
0 0.98762804
Iter 0: Train Ave Rank: 951.187, Train MAP@20: 0.00555,
         Val Ave Rank: 954.666, Val MAP@20: 0.006075
1 0.9780992
Iter 1: Train Ave Rank: 920.949, Train MAP@20: 0.0062,
         Val Ave Rank: 925.435, Val MAP@20: 0.00675
2 0.9693056
Iter 2: Train Ave Rank: 901.446, Train MAP@20: 0.00675,
         Val Ave Rank: 905.915, Val MAP@20: 0.007225
3 0.9601917
Iter 3: Train Ave Rank: 886.559, Train MAP@20: 0.0075,
         Val Ave Rank: 891.121, Val MAP@20: 0.007575
4 0.9492116
Iter 4: Train Ave Rank: 872.596, Train MAP@20: 0.008125,
         Val Ave Rank: 876.976, Val MAP@20: 0.00835
5 0.9337236
Iter 5: Train Ave Rank: 853.795, Train MAP@20: 0.0099,
         Val Ave Rank: 858.316, Val MAP@20: 0.00905
6 0.9016691
Iter 6: Train Ave Rank: 819.796, Train MAP@20: 0.01305,
         Val Ave Rank: 825.316, Val MAP@20: 0.010525
7 0.4695031
Iter 7: Train Ave Rank: 140.881, Train MAP@20: 0.2092,
         Val Ave Rank: 149.142, Val MAP@20: 0.

2 0.96127844
Iter 2: Train Ave Rank: 914.399, Train MAP@20: 0.006525,
         Val Ave Rank: 915.773, Val MAP@20: 0.00665
3 0.9522704
Iter 3: Train Ave Rank: 901.448, Train MAP@20: 0.00685,
         Val Ave Rank: 903.578, Val MAP@20: 0.007175
4 0.94184256
Iter 4: Train Ave Rank: 889.186, Train MAP@20: 0.006825,
         Val Ave Rank: 892.119, Val MAP@20: 0.00755
5 0.9272547
Iter 5: Train Ave Rank: 874.87, Train MAP@20: 0.007175,
         Val Ave Rank: 878.59, Val MAP@20: 0.008075
6 0.8987426
Iter 6: Train Ave Rank: 844.585, Train MAP@20: 0.008925,
         Val Ave Rank: 850.041, Val MAP@20: 0.00945
7 0.71477306
Iter 7: Train Ave Rank: 587.44, Train MAP@20: 0.035,
         Val Ave Rank: 602.401, Val MAP@20: 0.0346
8 0.28031492
Iter 8: Train Ave Rank: 51.4485, Train MAP@20: 0.735975,
         Val Ave Rank: 68.319, Val MAP@20: 0.306375
9 0.2372706
Iter 9: Train Ave Rank: 29.7291, Train MAP@20: 1.20625,
         Val Ave Rank: 49.3485, Val MAP@20: 0.3879
10 0.21855235
Iter 10: Train Ave Ran

In [55]:
np.mean([d[len(d)-1] for d in val_map_results])

0.41942500000000005

In [72]:
results

{(30, 1024): 0.43669,
 (30, 2048): 0.436705,
 (30, 4096): 0.436365,
 (100, 1024): 0.46622,
 (100, 2048): 0.4646450000000001,
 (100, 4096): 0.45801499999999995}

In [62]:
def fit(X,y,n_epochs,H_1):
    ds = PrepareData(X=X, y=y)
    dl = DataLoader(ds, batch_size=32, shuffle=True)
    
    device = torch.device('cpu')
    
    D_in, D_out = X.shape[1], y.shape[1]
    
    model = torch.nn.Sequential(
          torch.nn.Linear(D_in, H_1),
#           torch.nn.Dropout(0.2),
#           torch.nn.BatchNorm1d(H_1),
          torch.nn.ReLU(),
          torch.nn.Linear(H_1, D_out),
        ).to(device)
    
    def loss_fn(y_pred, y):
        cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
        return 1 - cos(y_pred, y).mean()
    #     return torch.norm((y_pred-y), p=2, dim=1).mean()
    
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.00001*5/6, weight_decay=.3)
    for t in range(n_epochs):
        for ix, (_x, _y) in enumerate(dl):
            _x = Variable(_x).float()
            _y = Variable(_y).float()

            y_pred = model(_x)

            loss = loss_fn(y_pred, _y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        y_pred = model(Variable(ds.X).float())
        train_loss = loss_fn(y_pred, Variable(ds.y).float()).data.numpy()
        losses[t] = train_loss
        print(t,train_loss)

        ave, map_ = evaluate(y_pred.data.numpy(),ds.y.data.numpy())
        ave = ave * 2000/X.shape[0]
        map_ = map_ * X.shape[0]/2000
        print("""Iter %d: Train Ave Rank: %g, Train MAP@20: %g""" % (t,train_ave[t],train_map[t]))

#         if (t+1)%15==0:
        for param_group in optimizer.param_groups:
            param_group['lr'] /= 1.05
    
    return model

def predict(model,X):
    return model(Variable(torch.from_numpy(X)).float()).data.numpy()

def get_prediction(vecs,pics):
    dists = pairwise_distances(vecs,pics,metric='cosine')
    return dists.argsort(1)

def map_20(ranks):
    return np.mean([(20-rank)/20 if rank<20 else 0 for rank in ranks])

def evaluate(vectors,label_vectors):
    preds = get_prediction(vectors,label_vectors)
    ranks = [np.argwhere(vec==i)[0][0] for i,vec in enumerate(preds)]
    return np.mean(ranks),map_20(ranks)

In [63]:
pics_train, pics_test = get_pic_mats(100)

In [65]:
text_train.shape,pics_train.shape

((10000, 5510), (10000, 100))

In [66]:
model = fit(text_train,pics_train,16,1024)

0 0.9829637
Iter 0: Train Ave Rank: 955.643, Train MAP@20: 0.005675
1 0.97357994
Iter 1: Train Ave Rank: 928.7, Train MAP@20: 0.006125
2 0.9654285
Iter 2: Train Ave Rank: 910.014, Train MAP@20: 0.0068
3 0.9573029
Iter 3: Train Ave Rank: 895.333, Train MAP@20: 0.0074
4 0.9484356
Iter 4: Train Ave Rank: 882.816, Train MAP@20: 0.00775
5 0.9372075
Iter 5: Train Ave Rank: 868.229, Train MAP@20: 0.00875
6 0.9200603
Iter 6: Train Ave Rank: 846.321, Train MAP@20: 0.01085
7 0.8745588
Iter 7: Train Ave Rank: 773.941, Train MAP@20: 0.01375
8 0.3563701
Iter 8: Train Ave Rank: 83.6556, Train MAP@20: 0.479525
9 0.26080847
Iter 9: Train Ave Rank: 36.9147, Train MAP@20: 1.02687
10 0.23464012
Iter 10: Train Ave Rank: 23.9927, Train MAP@20: 1.4091
11 0.22113079
Iter 11: Train Ave Rank: 19.2188, Train MAP@20: 1.63575
12 0.20989472
Iter 12: Train Ave Rank: 15.4017, Train MAP@20: 1.85567
13 0.20302242
Iter 13: Train Ave Rank: 12.998, Train MAP@20: 2.04625
14 0.19477022
Iter 14: Train Ave Rank: 11.6335, Tra

In [67]:
def get_prediction(vecs,pics):
    dists = pairwise_distances(vecs,pics,metric='cosine')
    return dists.argsort(1)

def map_20(ranks):
    return np.mean([(20-rank)/20 if rank<20 else 0 for rank in ranks])

def evaluate(vectors,label_vectors):
    preds = get_prediction(vectors,label_vectors)
    ranks = [np.argwhere(vec==i)[0][0] for i,vec in enumerate(preds)]
    return np.mean(ranks),map_20(ranks)

def get_top_20(descr_id):
    return preds[descr_id][:20]

def save_submission():
    data = []
    for i in range(2000):
        data.append(['%d.txt' % (i,),' '.join('%d.jpg' % (pic_id,) for pic_id in get_top_20(i))])
    pd.DataFrame(data,columns=['Descritpion_ID','Top_20_Image_IDs']).to_csv('submission.csv',index=False)

In [68]:
vecs = predict(model,text_test)
preds = get_prediction(vecs,pics_test)
save_submission()

In [70]:
vecs.shape

(2000, 100)

In [71]:
preds[0]

array([ 714, 1011, 1380, ...,  644, 1313, 1114])