In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import pairwise_distances
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import os
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold

In [2]:
import nltk
import string

def lower(word):
    return word.lower()

lemm = nltk.stem.WordNetLemmatizer()
def lemmatize(word):
    return lemm.lemmatize(word)

punct = set(string.punctuation)
def no_punctuation(word):
    return ''.join(c for c in word if c not in punct)

stop_words = set(nltk.corpus.stopwords.words("english"))
def no_stop_words(word):
    return word if word not in stop_words else ''

strategy_map = {'lo':lower,'lem':lemmatize,
                'punct':no_punctuation,'stop':no_stop_words}

def preprocess(docs,strategies):
    for strategy in strategies:
        new_docs = []
        for doc in docs:
            new_doc = []
            for word in doc:
                transformed = strategy_map[strategy](word)
                if transformed:
                    new_doc.append(transformed)
            new_docs.append(new_doc)
        docs = new_docs
    return docs

In [3]:
documents_train = []
for i in range(10000):
    with open('../descriptions_train/%d.txt' % (i,)) as f:
        documents_train.append(f.read().split())
documents_train = preprocess(documents_train,['lo','punct'])
documents_train_new = []
for i in range(10000):
    documents_train_new.append(TaggedDocument(documents_train[i],[i]))
documents_train = documents_train_new

In [4]:
model = Doc2Vec(documents_train, vector_size=100, window=10, 
                min_count=2, workers=4)

In [5]:
model.train(documents_train,total_examples=model.corpus_count,epochs=100)

In [6]:
documents_train[44]

TaggedDocument(words=['two', 'giraffes', 'in', 'a', 'room', 'with', 'people', 'looking', 'at', 'them', 'two', 'giraffe', 'standing', 'next', 'to', 'each', 'other', 'in', 'a', 'room', 'the', 'giraffe', 'is', 'being', 'kept', 'by', 'itself', 'indoors', 'a', 'man', 'and', 'woman', 'staring', 'at', 'two', 'giraffes', 'through', 'a', 'window', 'a', 'giraffe', 'in', 'a', 'enclosed', 'area', 'is', 'watched', 'by', 'some', 'people'], tags=[44])

In [7]:
model.wv.most_similar("dog",topn=12)

[('cat', 0.5827369689941406),
 ('man', 0.5552079081535339),
 ('child', 0.5540258288383484),
 ('person', 0.5419774055480957),
 ('dogs', 0.5106233358383179),
 ('sheep', 0.5050454139709473),
 ('house', 0.5029844045639038),
 ('boy', 0.49989622831344604),
 ('doughnut', 0.49748361110687256),
 ('hotdog', 0.4972448945045471),
 ('snowboard', 0.4694925546646118),
 ('surfboard', 0.4642629027366638)]

In [8]:
text_train = np.array([model[i] for i in range(10000)])

In [9]:
text_test = np.zeros((2000,text_train.shape[1]))
documents_test = []
for i in range(2000):
    with open('../descriptions_test/%d.txt' % (i,)) as f:
        documents_test.append(f.read().split())
documents_test = preprocess(documents_test,['lo','punct'])
for i in range(2000):
    text_test[i] = model.infer_vector(documents_test[i])

In [10]:
text_train = np.hstack([text_train,np.ones((10000,1))])
text_test = np.hstack([text_test,np.ones((2000,1))])

In [11]:
text_train.shape,text_test.shape

((10000, 101), (2000, 101))

In [12]:
feats_train = pd.read_csv('../features_train/features_resnet1000intermediate_train.csv',header=None)
feats_train_b = pd.read_csv('../features_train/features_resnet1000_train.csv',header=None)
feats_test = pd.read_csv('../features_test/features_resnet1000intermediate_test.csv',header=None)
feats_test_b = pd.read_csv('../features_test/features_resnet1000_test.csv',header=None)

In [13]:
feats_train_b.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,1000
0,images_train/5373.jpg,-0.89945,-0.93047,-2.503365,-3.172499,-2.819133,0.992159,-3.698863,0.619991,0.956148,...,-3.021916,2.214253,-1.382491,1.672911,1.014233,2.599949,2.773284,-2.066632,0.385754,-3.24132
1,images_train/984.jpg,-1.346954,-3.119461,-0.765971,-1.38255,-1.104675,-3.656271,-4.815436,-0.556942,-1.402286,...,0.011003,-3.968805,-2.694711,-4.19648,-2.880234,-1.210742,-1.605143,-4.859987,-0.83767,-0.967604
2,images_train/7127.jpg,-3.445498,-1.524573,-1.001654,-3.668335,-1.805517,-1.633496,-7.127826,-1.147802,-1.055816,...,-2.991777,-2.628053,-2.971074,-2.537039,-1.707429,1.013672,0.60846,-3.714998,-0.484735,0.138767
3,images_train/9609.jpg,1.11465,-2.167102,0.097881,-1.336255,0.853483,-0.374885,-2.36909,-2.273191,-1.143788,...,-1.248134,-0.633126,-1.723514,-2.638832,0.097149,4.647974,1.030138,-2.193836,1.044024,0.176043
4,images_train/5293.jpg,1.60265,-1.505817,3.029409,4.092412,1.711755,6.271253,4.173686,-2.177313,0.747789,...,-1.285806,-2.266481,-3.898053,2.295787,-1.749552,0.974188,1.258117,-1.975622,-1.278643,-1.941441


In [14]:
feats_train.shape,feats_test.shape

((10000, 2049), (2000, 2049))

In [15]:
pics_train = np.zeros((10000,3048))
for _,row in feats_train.iterrows():
    try:
        i = int(row[0].split('/')[1].split('.jpg')[0])
    except:
        i = int(row[0].split('/')[1].split('..jpg')[0])
    pics_train[i,:2048] = row.values[1:]
for _,row in feats_train_b.iterrows():
    try:
        i = int(row[0].split('/')[1].split('.jpg')[0])
    except:
        i = int(row[0].split('/')[1].split('..jpg')[0])
    pics_train[i,2048:] = row.values[1:]

In [16]:
pd.Series(pics_train.mean(0)).describe()

count    3048.000000
mean        0.286300
std         0.726504
min        -3.890733
25%         0.259784
50%         0.384236
75%         0.526699
max         4.441978
dtype: float64

In [17]:
pics_test = np.zeros((2000,3048))
for _,row in feats_test.iterrows():
    try:
        i = int(row[0].split('/')[1].split('.jpg')[0])
    except:
        i = int(row[0].split('/')[1].split('..jpg')[0])
    pics_test[i,:2048] = row.values[1:]
for _,row in feats_test_b.iterrows():
    try:
        i = int(row[0].split('/')[1].split('.jpg')[0])
    except:
        i = int(row[0].split('/')[1].split('..jpg')[0])
    pics_test[i,2048:] = row.values[1:]

In [18]:
pd.Series(pics_train.mean(0)).describe()

count    3048.000000
mean        0.286300
std         0.726504
min        -3.890733
25%         0.259784
50%         0.384236
75%         0.526699
max         4.441978
dtype: float64

In [19]:
tags_train = np.zeros((10000,100))
tags_test = np.zeros((2000,100))
tag_docs_train, tag_docs_test = [], []
for i in range(10000):
    with open('../tags_train/%d.txt' % (i,),'r') as f:
        tag_docs_train.append([word for line in f.read().split('\n') for word in line.split(':') if word])
for i in range(2000):
    with open('../tags_test/%d.txt' % (i,),'r') as f:
        tag_docs_test.append([word for line in f.read().split('\n') for word in line.split(':') if word])
tag_docs_train = preprocess(tag_docs_train,['lo','punct'])
tag_docs_test = preprocess(tag_docs_test,['lo','punct'])
for i in range(10000):
    tags_train[i] = model.infer_vector(tag_docs_train[i])
for i in range(2000):
    tags_test[i] = model.infer_vector(tag_docs_test[i])

In [20]:
pd.Series(np.abs(tags_test.mean(1))).describe()

count    2000.000000
mean        0.072021
std         0.067333
min         0.000013
25%         0.022011
50%         0.053870
75%         0.101180
max         0.450649
dtype: float64

In [21]:
pics_train = np.hstack([pics_train,tags_train])
pics_test = np.hstack([pics_test,tags_test])

In [23]:
# pca = PCA(n_components=101)
# pca.fit(pics_train)
# pics_train = pca.transform(pics_train)
# pics_test = pca.transform(pics_test)

In [22]:
def get_prediction(vecs,pics):
    dists = pairwise_distances(vecs,pics,metric='cosine')
    return dists.argsort(1)#[:,:20]

def get_top_20(descr_id):
    return preds[descr_id][:20]

def save_submission():
    data = []
    for i in range(2000):
        data.append(['%d.txt' % (i,),' '.join('%d.jpg' % (pic_id,) for pic_id in get_top_20(i))])
    pd.DataFrame(data,columns=['Descritpion_ID','Top_20_Image_IDs']).to_csv('submission.csv',index=False)
    
def map_20(preds):
    ranks = [np.argwhere(vec==i)[0][0] for i,vec in enumerate(preds)]
    return np.mean([(20-rank)/20 if rank<20 else 0 for rank in ranks])

In [3]:
# np.save('text_train_full',text_train)
# np.save('pics_train_full',pics_train)
text_train = np.load('text_train_full.npy')
pics_train = np.load('pics_train_full.npy')

In [6]:
text_train.shape,pics_train.shape

((10000, 101), (10000, 3148))

In [4]:
from sklearn.ensemble import RandomForestClassifier

In [39]:
rfc = RandomForestClassifier(n_estimators=50,max_depth=15,n_jobs=4,max_features=None)

In [9]:
i

12609

In [10]:
a,b

(837, 3900)

In [40]:
X = np.zeros((15000,3249))
y = np.zeros((15000,1))
for i in range(10000):
    X[i] = np.concatenate([text_train[i],pics_train[i]])
    y[i] = 1

seen = set([])
for i in range(10000,15000):
    a,b = np.random.randint(0,10000,size=2)
    while a==b or (a,b) in seen:
        a,b = np.random.randint(0,10000,size=2)
    X[i] = np.concatenate([text_train[a],pics_train[b]])
    y[i] = 0
    seen.add((a,b))

In [41]:
X.shape, y.shape

((15000, 3249), (15000, 1))

In [42]:
y = np.ravel(y)
cv = StratifiedKFold(n_splits=6,shuffle=True)
probs_negative, probs_positive = [], []
for train_index,test_index in cv.split(X,y):
    rfc.fit(X[train_index],y[train_index])
    preds = rfc.predict_proba(X[test_index])
    probs_negative.extend(np.ravel(preds[y[test_index]==0][:,1]).tolist())
    probs_positive.extend(np.ravel(preds[y[test_index]==1][:,1]).tolist())

KeyboardInterrupt: 

In [43]:
np.mean(probs_negative),np.mean(probs_positive)

(0.6741185637375107, 0.6661652977058632)

In [28]:
rfc.classes_

array([0., 1.])

In [29]:
tree = rfc.estimators_[0].tree_

In [31]:
tree.max_depth

15

In [24]:
preds

array([[0.82695159, 0.17304841],
       [0.83706945, 0.16293055],
       [0.80336708, 0.19663292],
       ...,
       [0.78551618, 0.21448382],
       [0.85234898, 0.14765102],
       [0.77499846, 0.22500154]])

In [23]:
y[test_index]

array([1., 1., 1., ..., 0., 0., 0.])

In [22]:
probs_negative

[0.8266150193809242,
 0.17338498061907592,
 0.7990812083380462,
 0.20091879166195412,
 0.8108351458682472,
 0.18916485413175285,
 0.8015126550092682,
 0.19848734499073178,
 0.7587363641019789,
 0.2412636358980208,
 0.8033884127314,
 0.19661158726859976,
 0.818692046025957,
 0.18130795397404312,
 0.7896820307005659,
 0.2103179692994345,
 0.7897180571209003,
 0.21028194287909965,
 0.7583644895754235,
 0.24163551042457632,
 0.7898278920214216,
 0.21017210797857863,
 0.776627601426133,
 0.22337239857386704,
 0.8243466874543186,
 0.17565331254568137,
 0.7758914737967808,
 0.22410852620321922,
 0.8069450651834695,
 0.19305493481653088,
 0.7685030854518716,
 0.23149691454812843,
 0.794979370592475,
 0.2050206294075253,
 0.8010626104301322,
 0.198937389569868,
 0.8247106524302183,
 0.17528934756978162,
 0.8229274792245951,
 0.177072520775405,
 0.8420014904647739,
 0.15799850953522562,
 0.7892829738034313,
 0.21071702619656907,
 0.7437816160185665,
 0.25621838398143343,
 0.7797751855990542,
 0.