In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import KFold

In [2]:
### Description for train data
# Files are read as 0.txt .. (N-1).txt, where N is the number of entries in the
# directory listing, so the folder must contain exactly those files.
desc_files = len(os.listdir('../descriptions_train'))
all_desc_train = []

for i in range(desc_files):
    # The context manager closes each handle as soon as the file is read;
    # a bare open() inside the loop leaves one open handle per file.
    with open(f'../descriptions_train/{i}.txt') as desc_file:
        # Collapse the multi-line file into a single space-separated string.
        all_desc_train.append(desc_file.read().replace('\n', ' '))

In [3]:
### Tags for train data
# Files are read as 0.txt .. (N-1).txt, where N is the number of entries in the
# directory listing, so the folder must contain exactly those files.
tag_files = len(os.listdir('../tags_train'))
all_tags_train = []

for i in range(tag_files):
    # The context manager closes each handle as soon as the file is read;
    # a bare open() inside the loop leaves one open handle per file.
    with open(f'../tags_train/{i}.txt') as tag_file:
        # ':' and newlines both become spaces, leaving one flat token string.
        all_tags_train.append(tag_file.read().replace(':', ' ').replace('\n', ' '))

In [4]:
### Description for test data
# Files are read as 0.txt .. (N-1).txt, where N is the number of entries in the
# directory listing, so the folder must contain exactly those files.
desc_files = len(os.listdir('../descriptions_test'))
all_desc_test = []

for i in range(desc_files):
    # The context manager closes each handle as soon as the file is read;
    # a bare open() inside the loop leaves one open handle per file.
    with open(f'../descriptions_test/{i}.txt') as desc_file:
        # Collapse the multi-line file into a single space-separated string.
        all_desc_test.append(desc_file.read().replace('\n', ' '))

In [5]:
### Tags for test data
# Files are read as 0.txt .. (N-1).txt, where N is the number of entries in the
# directory listing, so the folder must contain exactly those files.
tag_files = len(os.listdir('../tags_test'))
all_tags_test = []

for i in range(tag_files):
    # The context manager closes each handle as soon as the file is read;
    # a bare open() inside the loop leaves one open handle per file.
    with open(f'../tags_test/{i}.txt') as tag_file:
        # ':' and newlines both become spaces, leaving one flat token string.
        all_tags_test.append(tag_file.read().replace(':', ' ').replace('\n', ' '))

In [6]:
# Corpus used to fit the TF-IDF vocabulary: every description and tag string,
# train and test alike, in the order desc-train, desc-test, tags-train, tags-test.
all_docs = [
    *all_desc_train,
    *all_desc_test,
    *all_tags_train,
    *all_tags_test,
]

In [7]:
# Shared TF-IDF vocabulary over descriptions and tags. English stop words are
# dropped and a term must occur in at least two documents to be kept.
# The stop-word list name needs plain ASCII quotes; typographic quotes are a SyntaxError.
vectorizer = TfidfVectorizer(stop_words='english', min_df=2)
vectorizer.fit(all_docs);

In [8]:
# Project each text collection onto the fitted TF-IDF vocabulary.
train_desc, test_desc, train_tags, test_tags = map(
    vectorizer.transform,
    (all_desc_train, all_desc_test, all_tags_train, all_tags_test),
)

In [9]:
# Densify the TF-IDF matrices. `.toarray()` returns a plain ndarray directly,
# avoiding the np.matrix intermediate (and extra copy) of np.array(x.todense()).
# NOTE: the names are rebound from sparse to dense, so this cell cannot be
# re-run on its own without first re-running the transform cell.
train_desc = train_desc.toarray()
test_desc = test_desc.toarray()
train_tags = train_tags.toarray()
test_tags = test_tags.toarray()

In [7]:
# Load the four ResNet feature tables. header=None means the first CSV row is
# data and the columns get integer labels (column 0 is read later by
# parse_to_numpy to recover the image number).
train_1000, train_2048, test_1000, test_2048 = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        '../features_train/features_resnet1000_train.csv',
        '../features_train/features_resnet1000intermediate_train.csv',
        '../features_test/features_resnet1000_test.csv',
        '../features_test/features_resnet1000intermediate_test.csv',
    )
)

In [8]:
def get_num(string):
    """Return the first all-digit token of ``string`` as an int.

    '.' and '/' are treated as separators (in addition to whitespace), so a
    value such as 'some_dir/12.jpg' yields 12.

    Raises:
        IndexError: if no token consists solely of digits.
    """
    separators_to_space = str.maketrans({'.': ' ', '/': ' '})
    tokens = string.translate(separators_to_space).split()

    numbers = []
    for token in tokens:
        if token.isdigit():
            numbers.append(int(token))
    return numbers[0]

def parse_to_numpy(pd):
    """Return the feature columns of a frame as an array, rows sorted by image number.

    Column 0 is expected to hold a string from which ``get_num`` can extract
    the image number; every other column is returned as feature data.

    Unlike the earlier version, the input frame is NOT modified: no helper
    column is inserted into the caller's object, so the function can safely be
    called more than once on the same frame.

    Args:
        pd: the DataFrame to convert. NOTE: this parameter name shadows the
            pandas module inside the function; it is kept so that existing
            keyword callers keep working.

    Returns:
        2-D numpy array of all columns except column 0, with rows ordered by
        ascending image number.
    """
    frame = pd  # local alias so the body does not read like a module reference
    image_index = [get_num(name) for name in frame[0]]

    # Row permutation that sorts by image number; a stable sort keeps the
    # original relative order should two rows ever share a number.
    order = np.argsort(image_index, kind='stable')

    # drop() returns a new frame, so the caller's frame keeps column 0.
    features = frame.drop(columns=[0]).to_numpy()
    return features[order]

In [9]:
# Replace each feature DataFrame by its image-number-ordered numpy array.
train_1000, train_2048, test_1000, test_2048 = (
    parse_to_numpy(frame)
    for frame in (train_1000, train_2048, test_1000, test_2048)
)

In [10]:
# Image-side representation: both ResNet feature sets and the TF-IDF tag
# vector, concatenated column-wise for every sample.
train = np.hstack([
    train_1000,
    train_2048,
    train_tags,
])
test = np.hstack([
    test_1000,
    test_2048,
    test_tags,
])

NameError: name 'train_tags' is not defined

In [None]:
# Shapes of the description matrices and the stacked image-side matrices.
tuple(matrix.shape for matrix in (train_desc, train, test_desc, test))

In [None]:
### Set PCA Dimensions
# Fit the projection on the training matrix only and apply it to both splits.
# random_state pins the result when scikit-learn picks its randomized SVD
# solver, so a Restart & Run All reproduces the same components.
# NOTE: `train` / `test` are overwritten with their 100-dimensional
# projections; re-run the stacking cell before re-running this one.
pca = PCA(n_components = 100, random_state=0)
pca.fit(train)
train = pca.transform(train)
test = pca.transform(test)

In [None]:
# Shapes again, now that train/test have been projected by the PCA step.
tuple(matrix.shape for matrix in (train_desc, train, test_desc, test))

In [None]:
def get_prediction(vecs,pics):
    """Rank the rows of ``pics`` for every row of ``vecs`` by cosine distance.

    Args:
        vecs: 2-D array of query vectors, one per row.
        pics: 2-D array of candidate vectors, one per row, with the same
            number of columns as ``vecs``.

    Returns:
        Integer array with one row per query; row ``i`` lists the row indices
        of ``pics`` ordered from smallest to largest cosine distance to
        ``vecs[i]``.
    """
    # pairwise_distances is sklearn.metrics.pairwise_distances, imported in the
    # notebook's import cell.
    dists = pairwise_distances(vecs, pics, metric='cosine')
    # Sort along each row so that column 0 holds the closest candidate.
    return dists.argsort(axis=1)
def map_20(ranks):
    """Average the per-item score ``(20 - rank) / 20`` over ``ranks``.

    A rank of 0 scores 1.0, a rank of 19 scores 0.05, and any rank of 20 or
    more scores 0.
    """
    scores = []
    for rank in ranks:
        if rank < 20:
            scores.append((20 - rank) / 20)
        else:
            scores.append(0)
    return np.mean(scores)
def evaluate(vectors,label_vectors):
    """Score predicted vectors against their row-aligned targets.

    Row ``i`` of ``vectors`` is treated as the prediction for row ``i`` of
    ``label_vectors``. For each row, the position of index ``i`` in the ranking
    produced by ``get_prediction`` is that row's rank.

    Returns:
        Tuple ``(mean rank, map_20 score)`` over all rows.
    """
    ranking = get_prediction(vectors, label_vectors)

    ranks = []
    for row, ordered_candidates in enumerate(ranking):
        # Position at which the row's own index appears in its ranking.
        hits = np.argwhere(ordered_candidates == row)
        ranks.append(hits[0][0])

    mean_rank = np.mean(ranks)
    return mean_rank, map_20(ranks)
def get_top_20(descr_id, ranked=None):
    """Return the first 20 entries of the ranking row for ``descr_id``.

    Args:
        descr_id: row index into the ranking.
        ranked: optional 2-D ranking (e.g. the output of ``get_prediction``).
            When omitted, the notebook-level variable ``preds`` is used, as
            before; that variable must then have been defined by an earlier
            cell, otherwise a NameError is raised.

    Returns:
        The first 20 items of the selected row (fewer if the row is shorter).
    """
    if ranked is None:
        # Legacy behaviour: rely on a global ranking set up by the notebook.
        ranked = preds
    return ranked[descr_id][:20]
def save_submission():
    """Write ``submission.csv`` with the top-20 image ids for descriptions 0..1999.

    Each row pairs '<i>.txt' with a space-separated list of '<id>.jpg' names
    taken from ``get_top_20(i)``.
    """
    rows = [
        [
            '%d.txt' % (i,),
            ' '.join('%d.jpg' % (pic_id,) for pic_id in get_top_20(i)),
        ]
        for i in range(2000)
    ]
    # NOTE(review): the header spelling 'Descritpion_ID' is kept exactly as it
    # was; presumably the consumer of the file expects it — confirm before changing.
    submission = pd.DataFrame(rows, columns=['Descritpion_ID','Top_20_Image_IDs'])
    submission.to_csv('submission.csv', index=False)

In [38]:
# 5-fold cross-validation: learn a mapping from description TF-IDF vectors to
# the (PCA-reduced) image-side vectors, then score each held-out fold.
kf = KFold(n_splits=5)
# random_state pins the forest's bootstrap and feature sampling so the fold
# scores are reproducible from one run to the next.
regr = RandomForestRegressor(verbose=3, n_jobs=-1, max_depth=15, random_state=0)

for train_index, test_index in kf.split(train):
    # fit() discards any previous fit, so reusing `regr` across folds is safe.
    regr.fit(train_desc[train_index], train[train_index])
    pred = regr.predict(train_desc[test_index])
    # evaluate() returns (mean rank, map_20 score) for the held-out fold.
    output = evaluate(pred, train[test_index])
    print(output)

[2000 2001 2002 ... 9997 9998 9999] [   0    1    2 ... 1997 1998 1999]
[   0    1    2 ... 9997 9998 9999] [2000 2001 2002 ... 3997 3998 3999]
[   0    1    2 ... 9997 9998 9999] [4000 4001 4002 ... 5997 5998 5999]
[   0    1    2 ... 9997 9998 9999] [6000 6001 6002 ... 7997 7998 7999]
[   0    1    2 ... 7997 7998 7999] [8000 8001 8002 ... 9997 9998 9999]
