In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import pairwise_distances
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
import torch
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from sklearn.decomposition import PCA

In [2]:
import nltk
import string

def lower(word):
    """Return ``word`` converted to lower case."""
    lowered = word.lower()
    return lowered

# Single WordNet lemmatizer instance shared by every lemmatize() call.
lemm = nltk.stem.WordNetLemmatizer()
def lemmatize(word):
    """Return ``lemm.lemmatize(word)``."""
    lemma = lemm.lemmatize(word)
    return lemma

# Characters treated as punctuation: everything in string.punctuation.
punct = set(string.punctuation)
def no_punctuation(word):
    """Return ``word`` with every character that is in ``punct`` removed."""
    kept_chars = filter(lambda ch: ch not in punct, word)
    return ''.join(kept_chars)

# English stop-word list from the NLTK corpus, as a set for fast membership tests.
stop_words = set(nltk.corpus.stopwords.words("english"))
def no_stop_words(word):
    """Return '' when ``word`` is in ``stop_words``; otherwise return ``word`` unchanged."""
    if word in stop_words:
        return ''
    return word

# Short strategy codes accepted by preprocess(), mapped to the word-level
# transform each one applies.
strategy_map = {
    'lo': lower,
    'lem': lemmatize,
    'punct': no_punctuation,
    'stop': no_stop_words,
}

def preprocess(docs,strategies):
    """Apply word-level transforms to tokenised documents, one strategy at a time.

    Args:
        docs: iterable of documents, each an iterable of words.
        strategies: iterable of ``strategy_map`` keys, applied in the given order.

    Returns:
        A list with one list of transformed words per document. Words whose
        transform is falsy (for example the empty string) are dropped. When
        ``strategies`` is empty, ``docs`` is returned unchanged.
    """
    for name in strategies:
        # Each pass rebuilds every document from the output of the previous pass.
        docs = [
            [out for out in (strategy_map[name](token) for token in doc) if out]
            for doc in docs
        ]
    return docs

In [3]:
def _read_descriptions(folder, count):
    """Read ``<folder>/<i>.txt`` for i in range(count); return one whitespace-split token list per file."""
    docs = []
    for i in range(count):
        with open('%s/%d.txt' % (folder, i)) as f:
            docs.append(f.read().split())
    return docs


def _read_tags(folder, count):
    """Read ``<folder>/<i>.txt`` for i in range(count); return one token list per file.

    Each line is split on ':' and empty pieces are dropped, so every
    non-empty ':'-separated field of every line becomes its own token.
    """
    docs = []
    for i in range(count):
        with open('%s/%d.txt' % (folder, i),'r') as f:
            docs.append([word for line in f.read().split('\n') for word in line.split(':') if word])
    return docs


# The same two readers replace four copy-pasted loops; every corpus is then
# lower-cased and stripped of punctuation.
documents_train = preprocess(_read_descriptions('../descriptions_train', 10000),['lo','punct'])
documents_test = preprocess(_read_descriptions('../descriptions_test', 2000),['lo','punct'])
tag_docs_train = preprocess(_read_tags('../tags_train', 10000),['lo','punct'])
tag_docs_test = preprocess(_read_tags('../tags_test', 2000),['lo','punct'])

In [4]:
# TfidfVectorizer documents `stop_words` as 'english', a list, or None, and
# recent scikit-learn releases reject other types. The module-level
# `stop_words` is a set, so pass it as a (sorted, hence deterministic) list.
# min_df=2 drops terms that occur in fewer than two documents.
tfidf = TfidfVectorizer(stop_words=sorted(stop_words),min_df=2)

In [5]:
# Fit the TF-IDF model on all four corpora (train and test, descriptions and
# tags). Each tokenised document is re-joined into one space-separated string.
# The trailing ';' suppresses the cell's output.
tfidf.fit([
    ' '.join(tokens)
    for corpus in (documents_train, tag_docs_train, documents_test, tag_docs_test)
    for tokens in corpus
]);

In [6]:
# Dense TF-IDF matrices for the description documents (one row per document).
text_train = tfidf.transform([' '.join(tokens) for tokens in documents_train]).toarray()
text_test = tfidf.transform([' '.join(tokens) for tokens in documents_test]).toarray()

In [7]:
# Dense TF-IDF matrices for the tag documents, using the same fitted vectorizer.
tags_train = tfidf.transform([' '.join(tokens) for tokens in tag_docs_train]).toarray()
tags_test = tfidf.transform([' '.join(tokens) for tokens in tag_docs_test]).toarray()

In [8]:
# Sanity check: shapes of the description TF-IDF matrices (train, test).
text_train.shape,text_test.shape

((10000, 5980), (2000, 5980))

In [9]:
# Sanity check: shapes of the tag TF-IDF matrices (train, test).
tags_train.shape,tags_test.shape

((10000, 5980), (2000, 5980))

In [10]:
# Summary statistics of the absolute per-column (axis 0) means of tags_test.
pd.Series(np.abs(tags_test.mean(0))).describe()

count    5980.000000
mean        0.000345
std         0.004552
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         0.233815
dtype: float64

In [11]:
# Load the four image-feature CSVs. header=None: the files are read without
# a header row, so columns are labelled by integer position (column 0 holds
# the image path, as the head() output below shows).
feats_train = pd.read_csv('../features_train/features_resnet1000intermediate_train.csv',header=None)
feats_train_b = pd.read_csv('../features_train/features_resnet1000_train.csv',header=None)
feats_test = pd.read_csv('../features_test/features_resnet1000intermediate_test.csv',header=None)
feats_test_b = pd.read_csv('../features_test/features_resnet1000_test.csv',header=None)

In [12]:
# Peek at the first three rows: column 0 is the image path, the rest are features.
feats_train_b.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,1000
0,images_train/5373.jpg,-0.89945,-0.93047,-2.503365,-3.172499,-2.819133,0.992159,-3.698863,0.619991,0.956148,...,-3.021916,2.214253,-1.382491,1.672911,1.014233,2.599949,2.773284,-2.066632,0.385754,-3.24132
1,images_train/984.jpg,-1.346954,-3.119461,-0.765971,-1.38255,-1.104675,-3.656271,-4.815436,-0.556942,-1.402286,...,0.011003,-3.968805,-2.694711,-4.19648,-2.880234,-1.210742,-1.605143,-4.859987,-0.83767,-0.967604
2,images_train/7127.jpg,-3.445498,-1.524573,-1.001654,-3.668335,-1.805517,-1.633496,-7.127826,-1.147802,-1.055816,...,-2.991777,-2.628053,-2.971074,-2.537039,-1.707429,1.013672,0.60846,-3.714998,-0.484735,0.138767


In [13]:
# Shapes of the intermediate-feature frames (train, test), path column included.
feats_train.shape,feats_test.shape

((10000, 2049), (2000, 2049))

In [14]:
# Assemble one feature row per training image, indexed by image id.
# Column 0 of each frame holds a path such as 'images_train/5373.jpg'; the
# text between the '/' and the first '.' is the image id and selects the
# destination row. Splitting on the first '.' also covers names with a
# doubled dot ('12..jpg'), which previously relied on a bare `except:`
# fallback that would have swallowed any other error as well.
# Columns 0..2047 receive the intermediate features, columns 2048.. the
# features from the second file.
pics_train = np.zeros((10000,3048))
for feats, cols in ((feats_train, slice(None, 2048)), (feats_train_b, slice(2048, None))):
    ids = feats[0].str.split('/').str[1].str.split('.').str[0].astype(int).to_numpy()
    pics_train[ids, cols] = feats.iloc[:, 1:].to_numpy()

In [15]:
# Summary statistics of the per-column means of the assembled training features.
pd.Series(pics_train.mean(0)).describe()

count    3048.000000
mean        0.286300
std         0.726504
min        -3.890733
25%         0.259784
50%         0.384236
75%         0.526699
max         4.441978
dtype: float64

In [16]:
# Assemble one feature row per test image, indexed by image id.
# Column 0 of each frame holds the image path; the text between the '/' and
# the first '.' of the file name is the image id and selects the destination
# row. Splitting on the first '.' also covers names with a doubled dot
# ('12..jpg'), which previously relied on a bare `except:` fallback that
# would have swallowed any other error as well.
# Columns 0..2047 receive the intermediate features, columns 2048.. the
# features from the second file.
pics_test = np.zeros((2000,3048))
for feats, cols in ((feats_test, slice(None, 2048)), (feats_test_b, slice(2048, None))):
    ids = feats[0].str.split('/').str[1].str.split('.').str[0].astype(int).to_numpy()
    pics_test[ids, cols] = feats.iloc[:, 1:].to_numpy()

In [17]:
# Summary statistics of the per-column means of pics_train.
# NOTE(review): this cell directly follows the construction of pics_test but
# describes pics_train again (same expression as the earlier check) -
# presumably pics_test was intended; confirm.
pd.Series(pics_train.mean(0)).describe()

count    3048.000000
mean        0.286300
std         0.726504
min        -3.890733
25%         0.259784
50%         0.384236
75%         0.526699
max         4.441978
dtype: float64

In [18]:
# Append the tag TF-IDF columns to the image features (column-wise).
# Not idempotent: re-running this cell appends the tag columns again.
pics_train = np.concatenate((pics_train, tags_train), axis=1)
pics_test = np.concatenate((pics_test, tags_test), axis=1)

In [23]:
# Persist the current text/picture feature matrices to disk.
# NOTE(review): this cell's execution count (23) is later than that of the
# two BERT cells that follow it (21, 22), so it was presumably run after the
# BERT vectors were stacked - consistent with the '_wbert' file names. Run
# top to bottom, it would save the arrays before BERT is appended; confirm
# the intended position of this cell.
np.save('text_train_full_wbert',text_train)
np.save('pics_train_full_wbert',pics_train)
np.save('text_test_full_wbert',text_test)
np.save('pics_test_full_wbert',pics_test)
# Reload of previously saved arrays, left disabled:
# text_train = np.load('text_train_full.npy')
# pics_train = np.load('pics_train_full.npy')
# text_test = np.load('text_test_full.npy')
# pics_test = np.load('pics_test_full.npy')

In [21]:
# Load four saved arrays named for BERT vectors of the descriptions and tags
# (train and test). How these files were produced is not shown in this notebook.
bert_vectors_desc_train = np.load('bert_desc_train.npy')
bert_vectors_tags_train = np.load('bert_tags_train.npy')
bert_vectors_desc_test = np.load('bert_desc_test.npy')
bert_vectors_tags_test = np.load('bert_tags_test.npy')

In [22]:
# Append the BERT vectors column-wise: description vectors go to the text
# side, tag vectors to the picture side.
# Not idempotent: re-running this cell appends the BERT columns again.
text_train = np.concatenate((text_train, bert_vectors_desc_train), axis=1)
text_test = np.concatenate((text_test, bert_vectors_desc_test), axis=1)
pics_train = np.concatenate((pics_train, bert_vectors_tags_train), axis=1)
pics_test = np.concatenate((pics_test, bert_vectors_tags_test), axis=1)

In [4]:
# Reduce the picture-side features to 100 principal components. The
# projection is fitted on the training matrix only and then applied to both
# splits. random_state pins the result when scikit-learn selects a
# randomized SVD solver, so re-running the cell reproduces the same targets.
pca = PCA(n_components=100, random_state=0)
pca.fit(pics_train)
pics_train = pca.transform(pics_train)
pics_test = pca.transform(pics_test)

In [5]:
# Sanity check: shapes of the model inputs (text) and targets (pictures).
text_train.shape,pics_train.shape

((10000, 5510), (10000, 100))

In [6]:
class PrepareData(Dataset):
    """Dataset pairing each row of ``X`` with the matching row of ``y``.

    Args:
        X: input rows. A tensor is stored as-is; anything else is converted
            with ``torch.tensor``.
        y: target rows, handled the same way as ``X``.
    """

    def __init__(self, X, y):
        # Always bind both attributes. Previously they were only assigned
        # inside the "not a tensor" branches, so passing tensors left the
        # dataset without X / y and every later access failed.
        # The inputs are data, not parameters, so they are created without
        # requires_grad; otherwise every backward pass would also accumulate
        # a gradient buffer the size of the whole input matrix.
        self.X = X if torch.is_tensor(X) else torch.tensor(X)
        self.y = y if torch.is_tensor(y) else torch.tensor(y)

    def __len__(self):
        """Number of samples (rows of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(X[idx], y[idx])`` pair."""
        return self.X[idx], self.y[idx]

In [7]:
def fit(X,y,X_val,y_val,early_stop_window):
    """Train a one-hidden-layer network that maps rows of ``X`` to rows of ``y``.

    The network (Linear -> ReLU -> Linear, 2048 hidden units) is trained with
    Adam on a cosine-distance loss for at most 100 epochs, in shuffled batches
    of 32. After every epoch the loss and the ``evaluate`` metrics are
    computed on the full training and validation sets and printed, and the
    learning rate is divided by 1.05.

    Args:
        X: 2-D array of training inputs.
        y: 2-D array of training targets, one row per row of ``X``.
        X_val: 2-D numpy array of validation inputs.
        y_val: validation targets, passed to ``evaluate`` unchanged.
        early_stop_window: training stops at epoch ``t`` as soon as the
            validation score at ``t`` is lower than it was
            ``early_stop_window`` epochs earlier.

    Returns:
        ``(model, losses, train_ave, train_map, val_ave, val_map, n_epochs)``.
        The five dicts are keyed by epoch index. ``n_epochs`` is the index of
        the last epoch run, or, on early stop, that index minus
        ``early_stop_window``. The returned model is always the one from the
        last epoch run.
    """
    losses, train_ave, train_map, val_ave, val_map, n_epochs = {}, {}, {}, {}, {}, 0

    ds = PrepareData(X=X, y=y)
    dl = DataLoader(ds, batch_size=32, shuffle=True)

    device = torch.device('cpu')

    D_in, H_1, D_out = X.shape[1], 2048, y.shape[1]

    model = torch.nn.Sequential(
          torch.nn.Linear(D_in, H_1),
#           torch.nn.Dropout(0.2),
#           torch.nn.BatchNorm1d(H_1),
          torch.nn.ReLU(),
          torch.nn.Linear(H_1, D_out),
        ).to(device)

    # Built once instead of on every loss evaluation.
    cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

    def loss_fn(y_pred, target):
        # Mean cosine distance between predicted and target rows.
        return 1 - cos(y_pred, target).mean()
    #     return torch.norm((y_pred-y), p=2, dim=1).mean()

    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001, weight_decay=.2)
    for t in range(100):
        model.train()
        for _x, _y in dl:
            # detach(): batches are plain inputs. Without it, a dataset tensor
            # created with requires_grad would receive (and keep accumulating)
            # a gradient as large as the whole training matrix.
            _x = _x.detach().float()
            _y = _y.detach().float()

            y_pred = model(_x)

            loss = loss_fn(y_pred, _y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Per-epoch evaluation needs no gradients; no_grad avoids building an
        # autograd graph over the full training and validation sets.
        model.eval()
        with torch.no_grad():
            y_pred = model(ds.X.detach().float())
            train_loss = loss_fn(y_pred, ds.y.detach().float()).numpy()
            y_pred_val = model(torch.from_numpy(X_val).float())
        losses[t] = train_loss
        print(t,train_loss)

        ave, map_ = evaluate(y_pred.numpy(),ds.y.detach().numpy())
        # The metrics are rescaled by the ratio between the set size and 2000
        # - presumably to make them comparable with a 2000-item pool; TODO confirm.
        train_ave[t] = ave * 2000/X.shape[0]
        train_map[t] = map_ * X.shape[0]/2000
        ave_val, map_val = evaluate(y_pred_val.numpy(),y_val)
        val_ave[t] = ave_val * 2000/X_val.shape[0]
        val_map[t] = map_val * X_val.shape[0]/2000
        n_epochs = t
        print("""Iter %d: Train Ave Rank: %g, Train MAP@20: %g,
        Val Ave Rank: %g, Val MAP@20: %g""" % (t,train_ave[t],train_map[t],val_ave[t],val_map[t]))

        if t>=early_stop_window:
            if val_map[t]<val_map[t-early_stop_window]:
                print("EARLY STOP TRIGGERED BECAUSE NO IMPROVEMENT FOR %d ITERATIONS" % (early_stop_window,))
                return model, losses, train_ave, train_map, val_ave, val_map, n_epochs-early_stop_window

        # Exponential learning-rate decay, applied after every epoch.
        for param_group in optimizer.param_groups:
            param_group['lr'] /= 1.05

    return model, losses, train_ave, train_map, val_ave, val_map, n_epochs

def predict(model,X):
    """Run ``model`` on the numpy array ``X`` and return the output as a numpy array.

    ``X`` is converted to a float32 tensor first. The forward pass runs under
    ``torch.no_grad()`` so that no autograd graph is built for inference.
    """
    with torch.no_grad():
        return model(torch.from_numpy(X).float()).numpy()

def get_prediction(vecs,pics):
    """Rank the rows of ``pics`` for every row of ``vecs`` by cosine distance.

    Returns an index array with one row per row of ``vecs``; row ``i`` lists
    the row indices of ``pics`` ordered by increasing cosine distance to
    ``vecs[i]``.
    """
    cosine_dists = pairwise_distances(vecs, pics, metric='cosine')
    return np.argsort(cosine_dists, axis=1)

def map_20(ranks):
    """Mean top-20 score: a rank ``r`` below 20 scores ``(20 - r) / 20``, any other rank scores 0."""
    rank_arr = np.asarray(ranks)
    scores = np.where(rank_arr < 20, (20 - rank_arr) / 20, 0)
    return np.mean(scores)

def evaluate(vectors,label_vectors):
    """Score how highly each vector ranks its own label vector.

    Row ``i`` of ``vectors`` is matched against all rows of ``label_vectors``
    with ``get_prediction``; its rank is the position of index ``i`` in the
    resulting ordering (0 = closest).

    Returns:
        ``(mean rank, map_20(ranks))``.
    """
    ranking = get_prediction(vectors, label_vectors)
    ranks = []
    for row_idx, ordered in enumerate(ranking):
        # Position at which this row's own index appears in its ordering.
        hits = np.nonzero(ordered == row_idx)[0]
        ranks.append(hits[0])
    return np.mean(ranks), map_20(ranks)

In [9]:
# 5-fold cross-validation of fit() with an early-stop window of 5 epochs.
# random_state makes the shuffled fold assignment reproducible.
cv = KFold(n_splits=5,shuffle=True,random_state=0)
n_epochs_results = []
val_map_results = []
for train_index,test_index in cv.split(text_train,pics_train):
    rets = fit(text_train[train_index],pics_train[train_index],
               text_train[test_index],pics_train[test_index],5)
    model, losses, train_ave, train_map, val_ave, val_map, n_epochs = rets
    n_epochs_results.append(n_epochs)
    # val_map is a dict keyed by epoch. Keep only the score at the epoch
    # fit() reports, so that val_map_results is a list of numbers and
    # np.mean(val_map_results) in the summary cell is well-defined.
    val_map_results.append(val_map[n_epochs])
    vectors = predict(model,text_train[test_index])
    print('************Fold eval*************')
    print(evaluate(vectors,pics_train[test_index]))

0 0.30330652
Iter 0: Train Ave Rank: 51.4673, Train MAP@20: 0.83945,
        Val Ave Rank: 65.483, Val MAP@20: 0.3203
1 0.29690957
Iter 1: Train Ave Rank: 48.6177, Train MAP@20: 0.822325,
        Val Ave Rank: 63.887, Val MAP@20: 0.324125
2 0.28876805
Iter 2: Train Ave Rank: 44.3552, Train MAP@20: 0.871475,
        Val Ave Rank: 60.292, Val MAP@20: 0.341375
3 0.28191233
Iter 3: Train Ave Rank: 45.6456, Train MAP@20: 0.869425,
        Val Ave Rank: 63.6815, Val MAP@20: 0.338325
4 0.26997995
Iter 4: Train Ave Rank: 40.9449, Train MAP@20: 0.901575,
        Val Ave Rank: 59.429, Val MAP@20: 0.33415
5 0.26830435
Iter 5: Train Ave Rank: 37.767, Train MAP@20: 0.9653,
        Val Ave Rank: 55.6195, Val MAP@20: 0.35925
6 0.26053423
Iter 6: Train Ave Rank: 34.3361, Train MAP@20: 1.06693,
        Val Ave Rank: 55.5045, Val MAP@20: 0.360075
7 0.26160973
Iter 7: Train Ave Rank: 33.3827, Train MAP@20: 1.10075,
        Val Ave Rank: 52.7645, Val MAP@20: 0.366925
8 0.24565911
Iter 8: Train Ave Rank: 2

KeyboardInterrupt: 

In [None]:
# Per-fold epoch counts and validation results, each followed by its mean.
# NOTE(review): val_map_results holds whatever the loop above appended;
# np.mean needs those entries to be numeric.
n_epochs_results,np.mean(n_epochs_results),val_map_results,np.mean(val_map_results)

In [12]:
# Reload previously saved feature matrices, replacing the in-memory ones.
text_train = np.load('text_train_full.npy')
pics_train = np.load('pics_train_full.npy')
text_test = np.load('text_test_full.npy')
pics_test = np.load('pics_test_full.npy')

# PCA on the picture-side columns from index 2048 onwards only (the first
# 2048 columns are dropped). The projection is fitted on the training matrix
# and applied to both splits. random_state pins the result when scikit-learn
# selects a randomized SVD solver, so the cell is reproducible.
pca = PCA(n_components=100, random_state=0)
pca.fit(pics_train[:,2048:])
pics_train = pca.transform(pics_train[:,2048:])
pics_test = pca.transform(pics_test[:,2048:])

In [13]:
# Sanity check: shapes of the reloaded inputs (text) and reduced targets (pictures).
text_train.shape,pics_train.shape

((10000, 5510), (10000, 100))

In [14]:
# 5-fold cross-validation of fit() with an early-stop window of 5 epochs.
# random_state makes the shuffled fold assignment reproducible.
cv = KFold(n_splits=5,shuffle=True,random_state=0)
n_epochs_results = []
val_map_results = []
for train_index,test_index in cv.split(text_train,pics_train):
    rets = fit(text_train[train_index],pics_train[train_index],
               text_train[test_index],pics_train[test_index],5)
    model, losses, train_ave, train_map, val_ave, val_map, n_epochs = rets
    n_epochs_results.append(n_epochs)
    # val_map is a dict keyed by epoch. Keep only the score at the epoch
    # fit() reports, so that val_map_results is a list of numbers and
    # np.mean(val_map_results) in the summary cell is well-defined.
    val_map_results.append(val_map[n_epochs])
    vectors = predict(model,text_train[test_index])
    print('************Fold eval*************')
    print(evaluate(vectors,pics_train[test_index]))

0 0.29487443
Iter 0: Train Ave Rank: 55.3416, Train MAP@20: 0.81165,
        Val Ave Rank: 68.8935, Val MAP@20: 0.315675
1 0.29591763
Iter 1: Train Ave Rank: 54.9973, Train MAP@20: 0.7379,
        Val Ave Rank: 71.6695, Val MAP@20: 0.29415
2 0.28796297
Iter 2: Train Ave Rank: 49.1473, Train MAP@20: 0.793925,
        Val Ave Rank: 65.9995, Val MAP@20: 0.311475
3 0.2841556
Iter 3: Train Ave Rank: 46.5553, Train MAP@20: 0.806475,
        Val Ave Rank: 65.3375, Val MAP@20: 0.3127
4 0.2745517
Iter 4: Train Ave Rank: 46.2772, Train MAP@20: 0.865425,
        Val Ave Rank: 63.9725, Val MAP@20: 0.33015
5 0.26491535
Iter 5: Train Ave Rank: 37.3458, Train MAP@20: 1.00233,
        Val Ave Rank: 57.8535, Val MAP@20: 0.3528
6 0.2597258
Iter 6: Train Ave Rank: 38.3665, Train MAP@20: 0.973325,
        Val Ave Rank: 59.308, Val MAP@20: 0.34655
7 0.25363666


KeyboardInterrupt: 

In [None]:
# Per-fold epoch counts and validation results, each followed by its mean.
# NOTE(review): val_map_results holds whatever the loop above appended;
# np.mean needs those entries to be numeric.
n_epochs_results,np.mean(n_epochs_results),val_map_results,np.mean(val_map_results)

In [None]:
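# model = fit(text_train,pics_train)
# NOTE: the call above is stale - fit() also requires X_val, y_val and
# early_stop_window. While it stays commented out, the `model` used for the
# submission below is the one left over from the last cross-validation fold.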

In [72]:
def get_prediction(vecs,pics):
    """Rank the rows of ``pics`` for every row of ``vecs`` by cosine distance.

    Re-definition with the same behaviour as the ``get_prediction`` defined
    earlier in this notebook. Row ``i`` of the result lists the row indices
    of ``pics`` ordered by increasing cosine distance to ``vecs[i]``.
    """
    cosine_dists = pairwise_distances(vecs, pics, metric='cosine')
    return np.argsort(cosine_dists, axis=1)

def get_top_20(descr_id):
    """Return the first 20 entries of row ``descr_id`` of the global ``preds``."""
    ranked = preds[descr_id]
    return ranked[:20]

def save_submission():
    """Write ``submission.csv`` with one row for each description id 0..1999.

    Each row holds ``'<id>.txt'`` and a space-separated list of
    ``'<pic_id>.jpg'`` names taken from ``get_top_20(id)``.
    """
    rows = [
        ['%d.txt' % (i,), ' '.join('%d.jpg' % (pic_id,) for pic_id in get_top_20(i))]
        for i in range(2000)
    ]
    # NOTE(review): the 'Descritpion_ID' spelling is kept verbatim - presumably
    # it matches the header the submission format expects; confirm before changing.
    frame = pd.DataFrame(rows, columns=['Descritpion_ID','Top_20_Image_IDs'])
    frame.to_csv('submission.csv', index=False)

In [73]:
# Embed the test descriptions, rank all test pictures for each of them and
# write the submission file.
# NOTE(review): `model` is whatever was last assigned in the kernel (the
# cross-validation loops above assign it once per fold); no cell in this
# notebook trains a final model on the full training set - confirm.
vecs = predict(model,text_test)
preds = get_prediction(vecs,pics_test)
save_submission()