## Alternative Methods Explored: ##

In [1]:
import os
import csv
import random
import gensim
import numpy as np
import pandas as pd



In [2]:
def parse_descriptions(data_dir, num_doc):
    docs = []
    for i in range(num_doc):
        path = os.path.join(data_dir, "%d.txt" % i)
        with open(path) as f:
            docs.append(f.read())
    
    return docs

def doc_to_vec(sentence, word2vec):
    # get list of word vectors in sentence
    word_vecs = [word2vec.get_vector(w) for w in sentence.split() if w in word2vec.vocab]
    # return average
    return np.stack(word_vecs).mean(0)

**Code for Experiementing with noun extraction**

In [3]:
#TODO: Real Preprocessing

#Word preprocessing
import nltk

# Downloads if necessary:

# nltk.download('stopwords')
# nltk.download('wordnet')
##nltk.download('punkt')
##nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords
from nltk.stem import *
import string

def preprocess(docs):
    docs = list(map(lambda a: a.lower(), docs))
    docs = list(map(lambda a: a.translate(str.maketrans('','', string.punctuation)), docs))
    
    stop = stopwords.words('english')
    docs = list(map(lambda x: ' '.join([word for word in x.split() if word not in (stop)]), docs))

    #Lemmatization of all the words
    lem = WordNetLemmatizer()
    docs = list(map(lambda x: ' '.join([lem.lemmatize(word) for word in x.split()]), docs))
    # docs = list(set(docs))

    # nouns only ******* this is added
    # docs = list(map(lambda x: ' '.join([word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(x)) if pos[0] == 'N']),docs))
    
    return docs

In [64]:
# Test this without the noun extraction first
train_dev_desc = preprocess(parse_descriptions("descriptions_train", num_doc=(10000)))
test_desc = preprocess(parse_descriptions("descriptions_test", num_doc=2000))

**Code for exploring Tfidf (term frequency–inverse document frequency) Vectorization**

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer

train_vectorizer = TfidfVectorizer()
train_vectorizer.fit(train_dev_desc)
X_train = train_vectorizer.transform(train_dev_desc)
X_test = train_vectorizer.transform(test_desc)

**Helper Functions From Original Code**

In [11]:
def parse_features(features_path):
    vec_map = {}
    with open(features_path) as f:
        for row in csv.reader(f):
            img_id = int(row[0].split("/")[1].split(".")[0])
            vec_map[img_id] = np.array([float(x) for x in row[1:]])
    return np.array([v for k, v in sorted(vec_map.items())])


In [44]:
# define a helper function to generate the matrix needed given a specific rank
def FMatrix_Generator(Matrix, vh, r):
    F_Mat = np.matmul(Matrix, np.transpose(vh[:r,:]))
    
    return F_Mat

y_original = parse_features("features_train/features_resnet1000intermediate_train.csv")
u, s, vh = np.linalg.svd(y_original, full_matrices = True)
y_train_dev = FMatrix_Generator(y_original, vh, 50)
##y_train = y_train_dev[split_idx[:num_train]]

##y_dev = y_train_dev[split_idx[num_train:]]

y_test_original = parse_features("features_test/features_resnet1000intermediate_test.csv")
y_test = FMatrix_Generator(y_test_original, vh, 50)

print("Built all y matrices!")
#print("y_train shape:", y_train_dev.shape)
#print("y_dev shape:", y_dev.shape)
#print("y_test shape:", y_test.shape)

Built all y matrices!


In [71]:
import numpy as np
from sklearn.model_selection import train_test_split

x_train, x_dev, y_train, y_dev = train_test_split(
    X_train, y_train_dev, test_size = 2000)

**Models Trained and Tested**

In [75]:
# Ridge Regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# train OLS model with regression
parameters = {"alpha": [0, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]} #ALPHA 5 best

#parameters = {"alpha": [5]}
reg = GridSearchCV(Ridge(), parameters, cv=10)
# reg = MultiTaskElasticNetCV(cv=5, random_state=0)

reg.fit(x_train, y_train)
reg_best = reg.best_estimator_

print("Trained linear regression model!")
print("Summary of best model:")
print(reg_best)

Trained linear regression model!
Summary of best model:
Ridge(alpha=5.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)


In [58]:
from sklearn.linear_model import Lasso

parameters = {"alpha": [0.5]} #ALPHA
reg = GridSearchCV(Lasso(), parameters, cv=10)

reg.fit(x_train, y_train)
reg_best = reg.best_estimator_

print("Trained linear regression model!")
print("Summary of best model:")
print(reg_best)

Trained linear regression model!
Summary of best model:
Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)


In [53]:
from sklearn.ensemble import RandomForestRegressor

parameters = {"max_depth": [10], "n_estimators": [10]}

reg = GridSearchCV(RandomForestRegressor(), parameters, cv = 10)
reg.fit(x_train, y_train)

#reg = RandomForestRegressor(max_depth = 1000, n_estimators = 3).fit(x_train, y_train)
reg_best = reg


In [76]:
from scipy.spatial import distance

def cdist_matrix(x1, x2):
    return distance.cdist(x1, x2, 'cosine')

def dist_matrix(x1, x2):
    return ((np.expand_dims(x1, 1) - np.expand_dims(x2, 0)) ** 2).sum(2) ** 0.5

# test performance on development set
y_dev_pred = reg.predict(x_dev)
dev_distances = cdist_matrix(y_dev_pred, y_dev)
dev_scores = []
dev_pos_list = []

for i in range(2000):
    pred_dist_idx = list(np.argsort(dev_distances[i]))
    dev_pos = pred_dist_idx.index(i)
    dev_pos_list.append(dev_pos)
    if dev_pos < 20:
        dev_scores.append(1 / (dev_pos + 1))
    else:
        dev_scores.append(0.0)

print("Development MAP@20:", np.mean(dev_scores))
print("Mean index of true image", np.mean(dev_pos_list))
print("Median index of true image", np.median(dev_pos_list))

Development MAP@20: 0.18774034731608258
Mean index of true image 46.252
Median index of true image 14.0


In [46]:
# create test predictions
from scipy.sparse import vstack

x_train_all = vstack([x_train, x_dev])
y_train_all = np.concatenate([y_train, y_dev])
reg_best.fit(x_train_all, y_train_all)
y_test_pred = reg_best.predict(X_test)
test_distances = cdist_matrix(y_test_pred, y_test)
pred_rows = []

for i in range(2000):
    test_dist_idx = list(np.argsort(test_distances[i]))
    top_20 = test_dist_idx[:20]
    row = ["%d.jpg" % i for i in test_dist_idx[:20]]
    pred_rows.append(" ".join(row))

with open("test_submission.csv", "w") as f:
    f.write("Descritpion_ID,Top_20_Image_IDs\n")
    for i, row in enumerate(pred_rows):
        f.write("%d.txt,%s\n" % (i, row))

print("Output written!")

Output written!


**N-Grams Experimentation:**

In [4]:
## Last thing to test... Using N-Grams:

N_gram_train_desc = preprocess(parse_descriptions("descriptions_train", num_doc=(10000)))
N_gram_test_desc = preprocess(parse_descriptions("descriptions_test", num_doc=(2000)))

In [7]:
from nltk import ngrams

n = 2

bag_train_list = []
bag_array = []
for i in N_gram_train_desc:
    bag_array.append(list(ngrams(i.split(),n)))
    
for i in bag_array:
    for j in i:
        bag_train_list.append(j)
        
train_dictionary = list(set(bag_train_list))

train_dict = []
for i in bag_array:
    unique, counts = np.unique(i, return_counts = True)
    train_dict.append(dict(zip(unique,counts)))
    
# pandas data frame to store the unique list of words per training set:
import pandas as pd

train_bag_of_words = pd.DataFrame(0, index=np.arange(len(train_dict)), columns = train_dictionary)

# write the data into each 
for cols in train_bag_of_words:
    for i in range(len(train_dict)):
        if cols in train_dict[i].keys():
            train_bag_of_words[cols][i] = train_dict[i][cols]

In [94]:
train_bag_of_words.shape

(10000, 107193)

In [8]:
# N-grams Bag of Words

bag_test_list = []
bag_test_array = []
for i in N_gram_test_desc:
    bag_test_array.append(list(ngrams(i.split(),n)))
    
for i in bag_test_array:
    for j in i:
        bag_test_list.append(j)
        

test_dict = []
for i in bag_test_array:
    unique, counts = np.unique(i, return_counts = True)
    test_dict.append(dict(zip(unique,counts)))
    
# pandas data frame to store the unique list of words per training set:
import pandas as pd

test_bag_of_words = pd.DataFrame(0, index=np.arange(len(test_dict)), columns = train_dictionary)

# write the data into each 
for cols in test_bag_of_words:
    for i in range(len(test_dict)):
        if cols in test_dict[i].keys():
            test_bag_of_words[cols][i] = test_dict[i][cols]


In [13]:
train_dev_bag = np.array(train_bag_of_words)

x_test = np.array(test_bag_of_words)

from sklearn.model_selection import train_test_split

x_train, x_dev, y_train, y_dev = train_test_split(
    train_dev_bag, y_train_dev, test_size = 2000)


**Models Tested**

In [17]:
# Ridge Regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# train OLS model with regression
parameters = {"alpha": [50]} #ALPHA 5 best

#parameters = {"alpha": [5]}
reg = GridSearchCV(Ridge(), parameters, cv=10)
# reg = MultiTaskElasticNetCV(cv=5, random_state=0)

reg.fit(x_train, y_train)
reg_best = reg.best_estimator_

print("Trained linear regression model!")
print("Summary of best model:")
print(reg_best)

Trained linear regression model!
Summary of best model:
Ridge(alpha=50, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)


In [18]:
from scipy.spatial import distance

def cdist_matrix(x1, x2):
    return distance.cdist(x1, x2, 'cosine')

def dist_matrix(x1, x2):
    return ((np.expand_dims(x1, 1) - np.expand_dims(x2, 0)) ** 2).sum(2) ** 0.5

# test performance on development set
y_dev_pred = reg.predict(x_dev)
dev_distances = cdist_matrix(y_dev_pred, y_dev)
dev_scores = []
dev_pos_list = []

for i in range(2000):
    pred_dist_idx = list(np.argsort(dev_distances[i]))
    dev_pos = pred_dist_idx.index(i)
    dev_pos_list.append(dev_pos)
    if dev_pos < 20:
        dev_scores.append(1 / (dev_pos + 1))
    else:
        dev_scores.append(0.0)

print("Development MAP@20:", np.mean(dev_scores))
print("Mean index of true image", np.mean(dev_pos_list))
print("Median index of true image", np.median(dev_pos_list))

Development MAP@20: 0.0017988698285718407
Mean index of true image 999.5
Median index of true image 999.5


**PCA experimentation**

In [21]:
## PCA ATTEMPT

## Create Bag of words instead of word2vec for training data

train_dev_desc = preprocess(parse_descriptions("descriptions_train", num_doc=(10000)))
test_desc = preprocess(parse_descriptions("descriptions_test", num_doc=2000))

bag_train_list = []
bag_array = []
for i in train_dev_desc:
    bag_array.append(i.split())
    
for i in bag_array:
    for j in i:
        bag_train_list.append(j)
        
train_dictionary = list(set(bag_train_list))

train_dict = []
for i in bag_array:
    unique, counts = np.unique(i, return_counts = True)
    train_dict.append(dict(zip(unique,counts)))
    
# pandas data frame to store the unique list of words per training set:
import pandas as pd

train_bag_of_words = pd.DataFrame(0, index=np.arange(len(train_dict)), columns = train_dictionary)

# write the data into each 
for cols in train_bag_of_words:
    for i in range(len(train_dict)):
        if cols in train_dict[i].keys():
            train_bag_of_words[cols][i] = train_dict[i][cols]


In [22]:
## Create Bag of words instead of word2vec for testing

bag_test_list = []
bag_test_array = []
for i in test_desc:
    bag_test_array.append(i.split())
    
for i in bag_test_array:
    for j in i:
        bag_test_list.append(j)
        

test_dict = []
for i in bag_test_array:
    unique, counts = np.unique(i, return_counts = True)
    test_dict.append(dict(zip(unique,counts)))
    
# pandas data frame to store the unique list of words per training set:
import pandas as pd

test_bag_of_words = pd.DataFrame(0, index=np.arange(len(test_dict)), columns = train_dictionary)

# write the data into each 
for cols in test_bag_of_words:
    for i in range(len(test_dict)):
        if cols in test_dict[i].keys():
            test_bag_of_words[cols][i] = test_dict[i][cols]


In [60]:
train_bag_of_words.shape

(10000, 8338)

In [70]:
from sklearn.decomposition import PCA

train_dev_bag = np.array(train_bag_of_words)
x_test = np.array(test_bag_of_words)

X = np.concatenate((train_dev_bag, x_test))

pca = PCA(n_components = 4000)

X_pca = pca.fit_transform(X)


In [71]:
x_train_dev = X_pca[:10000]
x_test = X_pca[-2000:]


x_train, x_dev, y_train, y_dev = train_test_split(
    x_train_dev, y_train_dev, test_size = 2000)


In [76]:
# Ridge Regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# train OLS model with regression
#parameters = {"alpha": [20, 200, 300, 1000]} #ALPHA 5 best

parameters = {"alpha": [50, 75, 100, 125]}
reg = GridSearchCV(Ridge(), parameters, cv=10)
# reg = MultiTaskElasticNetCV(cv=5, random_state=0)

reg.fit(x_train, y_train)
reg_best = reg.best_estimator_

print("Trained linear regression model!")
print("Summary of best model:")
print(reg_best)

Trained linear regression model!
Summary of best model:
Ridge(alpha=100, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)


In [77]:
from scipy.spatial import distance

def cdist_matrix(x1, x2):
    return distance.cdist(x1, x2, 'cosine')

def dist_matrix(x1, x2):
    return ((np.expand_dims(x1, 1) - np.expand_dims(x2, 0)) ** 2).sum(2) ** 0.5

# test performance on development set
y_dev_pred = reg.predict(x_dev)
dev_distances = cdist_matrix(y_dev_pred, y_dev)
dev_scores = []
dev_pos_list = []

for i in range(2000):
    pred_dist_idx = list(np.argsort(dev_distances[i]))
    dev_pos = pred_dist_idx.index(i)
    dev_pos_list.append(dev_pos)
    if dev_pos < 20:
        dev_scores.append(1 / (dev_pos + 1))
    else:
        dev_scores.append(0.0)

print("Development MAP@20:", np.mean(dev_scores))
print("Mean index of true image", np.mean(dev_pos_list))
print("Median index of true image", np.median(dev_pos_list))

Development MAP@20: 0.28030375196054375
Mean index of true image 25.142
Median index of true image 7.0


In [78]:
# create test predictions
x_train_all = np.concatenate([x_train, x_dev])
y_train_all = np.concatenate([y_train, y_dev])
reg_best.fit(x_train_all, y_train_all)
y_test_pred = reg_best.predict(x_test)
test_distances = cdist_matrix(y_test_pred, y_test)
pred_rows = []

for i in range(2000):
    test_dist_idx = list(np.argsort(test_distances[i]))
    top_20 = test_dist_idx[:20]
    row = ["%d.jpg" % i for i in test_dist_idx[:20]]
    pred_rows.append(" ".join(row))

with open("test_submission.csv", "w") as f:
    f.write("Descritpion_ID,Top_20_Image_IDs\n")
    for i, row in enumerate(pred_rows):
        f.write("%d.txt,%s\n" % (i, row))

print("Output written!")

Output written!
