## AML FINAL PROJECT ##
by: Zeheng Wang, Benjamin Hwang

### 1. Word2Vec models:##

#### a) Preprocessing:



**Import word2vec and libraries needed. Define the training, dev, and test numbers**

In [None]:
import os
import csv
import random
import gensim
import numpy as np
import pandas as pd

# Sizes of the train / dev / test partitions used by the cells below.
num_train = 8000
num_dev = 2000
num_test = 2000
# Shuffled list of indices over train+dev. Later cells take the first
# num_train entries as training rows and the remainder as dev rows.
# NOTE(review): the shuffle is unseeded, so the split differs between runs.
split_idx = list(range(num_train + num_dev))
random.shuffle(split_idx)
# Load pretrained vectors from a local word2vec-format binary file.
word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
print("Loaded word vectors successfully!")

**Helper functions to help process the descriptions**

In [None]:
def parse_descriptions(data_dir, num_doc):
    """Read ``0.txt`` .. ``<num_doc - 1>.txt`` from *data_dir*.

    Returns a list with the full text of each file, ordered by file number.
    """
    def _read(doc_id):
        with open(os.path.join(data_dir, "%d.txt" % doc_id)) as handle:
            return handle.read()

    return [_read(doc_id) for doc_id in range(num_doc)]

def doc_to_vec(sentence, word2vec):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Args:
        sentence: whitespace-separated text.
        word2vec: keyed-vector model supporting ``word in model``,
            ``get_vector(word)`` and ``vector_size``.

    Returns:
        1-D array holding the element-wise mean of the vectors of the known
        words, or an all-zero vector of length ``word2vec.vector_size`` when
        none of the words is known (``np.stack`` would raise on an empty
        list).
    """
    # Membership is tested on the model itself rather than through the
    # ``.vocab`` attribute, which gensim 4 no longer provides.
    word_vecs = [word2vec.get_vector(w) for w in sentence.split() if w in word2vec]
    if not word_vecs:
        return np.zeros(word2vec.vector_size, dtype=np.float32)
    # return average
    return np.stack(word_vecs).mean(0)

**Add additional preprocessing to get rid of stop words from descriptions**

In [None]:
#Word preprocessing libraries

import nltk
from nltk.corpus import stopwords
from nltk.stem import *
import string

# Downloads if necessary:

# nltk.download('stopwords')
# nltk.download('wordnet')

def preprocess(docs):
    """Normalize raw description strings before feature extraction.

    Each document is lower-cased, stripped of ASCII punctuation, filtered of
    NLTK English stop words and lemmatized word by word with WordNet.

    Args:
        docs: iterable of strings.

    Returns:
        list of cleaned strings, one per input document, with the surviving
        words joined by single spaces.
    """
    # Translation table is built once and reused for every document.
    strip_punct = str.maketrans('', '', string.punctuation)
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned for every single word.
    stop = set(stopwords.words('english'))
    lem = WordNetLemmatizer()

    cleaned = []
    for doc in docs:
        words = doc.lower().translate(strip_punct).split()
        # Stop words are dropped before lemmatization, as in the original order.
        cleaned.append(' '.join(lem.lemmatize(word) for word in words if word not in stop))
    return cleaned

**Create the initial description features**

In [None]:
# build x matrices

# Cleaned description texts for train+dev and for test.
train_dev_desc = preprocess(parse_descriptions("descriptions_train", num_doc=(num_train+num_dev)))
test_desc = preprocess(parse_descriptions("descriptions_test", num_doc=num_test))
# One averaged word vector per description; train/dev rows follow split_idx.
x_train_desc = np.array([doc_to_vec(train_dev_desc[i], word2vec) for i in split_idx[:num_train]])
x_dev_desc = np.array([doc_to_vec(train_dev_desc[i], word2vec) for i in split_idx[num_train:]])
x_test_desc = np.array([doc_to_vec(d, word2vec) for d in test_desc])

# The ridge, scoring and submission cells of this section read x_train, x_dev
# and x_test. Bind those names to the word2vec features so the section runs
# top to bottom without a NameError.
x_train, x_dev, x_test = x_train_desc, x_dev_desc, x_test_desc

print("Built all x matrices!")
print("x_train shape:", x_train_desc.shape)
print("x_dev shape:", x_dev_desc.shape)
print("x_test shape:", x_test_desc.shape)

**Helper function to parse the ResNet features to be predicted**

In [None]:
def parse_features(features_path):
    """Load a CSV of per-image feature vectors into one array.

    The first column of every row has the form ``<dir>/<id>.<ext>``; the
    integer ``<id>`` is extracted from it and the remaining columns are parsed
    as floats.

    Returns:
        array whose rows are the feature vectors ordered by ascending image id.
    """
    features_by_id = {}
    with open(features_path) as f:
        for row in csv.reader(f):
            file_name = row[0].split("/")[1]
            image_id = int(file_name.split(".")[0])
            features_by_id[image_id] = np.array(list(map(float, row[1:])))
    ordered_ids = sorted(features_by_id)
    return np.array([features_by_id[image_id] for image_id in ordered_ids])

**Use SVD to compress intermediate ResNet features for an optimal representation of ResNet for prediction**

In [None]:
# Helper: project a matrix onto the first r rows of vh.
def FMatrix_Generator(Matrix, vh, r):
    """Return ``Matrix`` multiplied by the transpose of the first ``r`` rows of ``vh``."""
    leading_rows = vh[:r, :]
    return np.matmul(Matrix, leading_rows.T)

# Number of leading rows of vh kept for the compressed targets; defined once so
# the train and test projections cannot drift apart.
svd_rank = 50

y_original = parse_features("features_train/features_resnet1000intermediate_train.csv")
# Only vh is used below. full_matrices=False skips building the square
# (n_samples x n_samples) u matrix; the first min(n_samples, n_features) rows
# of vh are the same as in the full decomposition.
u, s, vh = np.linalg.svd(y_original, full_matrices = False)
y_train_dev = FMatrix_Generator(y_original, vh, svd_rank)
# Train/dev rows follow the shuffled split_idx.
y_train = y_train_dev[split_idx[:num_train]]

y_dev = y_train_dev[split_idx[num_train:]]

# Test features are projected with the vh computed from the training features.
y_test_original = parse_features("features_test/features_resnet1000intermediate_test.csv")
y_test = FMatrix_Generator(y_test_original, vh, svd_rank)

print("Built all y matrices!")
print("y_train shape:", y_train.shape)
print("y_dev shape:", y_dev.shape)
print("y_test shape:", y_test.shape)

#### b) Models:

**Ridge Regression:**

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Grid-search the ridge penalty `alpha` with 10-fold cross-validation and keep
# the best estimator found.
# NOTE(review): relies on x_train and y_train already existing in the notebook
# namespace when this cell runs.
parameters = {"alpha": [0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]}
reg = GridSearchCV(Ridge(), parameters, cv = 10)
reg.fit(x_train, y_train)
reg_best = reg.best_estimator_

print("Trained linear regression model!")
print("Summary of best model:")
print(reg_best)

**Random Forest:**

In [None]:
## TRAIN A RANDOM FOREST REGRESSOR TO TEST:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Disabled grid search over forest size and depth, kept for reference:
#parameters = {"n_estimators": [5, 10, 20, 50, 100], "max_depth": [2, 5, 10, 20, 50]}
#rf_reg = GridSearchCV(RandomForestRegressor(), parameters, cv = 10)
#rf_reg.fit(x_train, y_train)
#rf_reg_best = rf_reg.best_estimator_
#print(rf_reg_best)

# A single fixed configuration is fitted instead of the search above.
# NOTE(review): x_train_conc is not defined anywhere in the visible notebook —
# presumably a concatenated feature matrix built in a cell not shown; confirm.
rf_reg = RandomForestRegressor(n_estimators = 5, max_depth = 50)
rf_reg.fit(x_train_conc, y_train)
rf_reg_best = rf_reg

**Kernel Ridge Regression:**

In [None]:
# Try Kernel Ridge Regression

from sklearn.kernel_ridge import KernelRidge
# Grid-search kernel ridge regression over the penalty and three kernels with
# 10-fold cross-validation. This rebinds `reg` and `reg_best`, replacing the
# values set by the ridge cell.
# NOTE(review): x_train_conc is not defined anywhere in the visible notebook —
# presumably a concatenated feature matrix built in a cell not shown; confirm.
# NOTE(review): the message printed below still says "linear regression"
# although the model fitted here is KernelRidge.
parameters = {"alpha": [0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0], "kernel": ['linear', 'rbf', 'poly']}
reg = GridSearchCV(KernelRidge(), parameters, cv = 10)
reg.fit(x_train_conc, y_train)
reg_best = reg.best_estimator_

print("Trained linear regression model!")
print("Summary of best model:")
print(reg_best)

#### c) Performance:

**Now score the models and assess performance on dev**

In [None]:
from scipy.spatial import distance

def cdist_matrix(x1, x2):
    """Pairwise cosine distances; entry (i, j) compares row i of ``x1`` with row j of ``x2``."""
    pairwise = distance.cdist(x1, x2, metric='cosine')
    return pairwise

# test performance on development set
# `reg` is whichever search object was fitted most recently in this notebook.
y_dev_pred = reg.predict(x_dev)
# Row i: cosine distances from the i-th predicted vector to every dev image vector.
dev_distances = cdist_matrix(y_dev_pred, y_dev)
dev_scores = []
dev_pos_list = []

for i in range(num_dev):
    # Image indices ordered from nearest to farthest for description i.
    pred_dist_idx = list(np.argsort(dev_distances[i]))
    # 0-based rank of the matching image (image i belongs to description i).
    dev_pos = pred_dist_idx.index(i)
    dev_pos_list.append(dev_pos)
    # Reciprocal rank when the true image is within the top 20, otherwise 0.
    if dev_pos < 20:
        dev_scores.append(1 / (dev_pos + 1))
    else:
        dev_scores.append(0.0)

print("Development MAP@20:", np.mean(dev_scores))
print("Mean index of true image", np.mean(dev_pos_list))
print("Median index of true image", np.median(dev_pos_list))

**Final Output**

In [None]:
# create test predictions
# Refit the selected estimator on train and dev stacked together.
x_train_all = np.concatenate([x_train, x_dev])
y_train_all = np.concatenate([y_train, y_dev])
reg_best.fit(x_train_all, y_train_all)

# testing to make sure this works
#reg.fit(x_train_all, y_train_all)
#y_test_pred = reg.predict(x_test)

# Distance from every predicted vector to every test image vector.
y_test_pred = reg_best.predict(x_test)
test_distances = cdist_matrix(y_test_pred, y_test)
pred_rows = []

for i in range(num_test):
    # Nearest-first ordering of image indices for description i.
    test_dist_idx = list(np.argsort(test_distances[i]))
    top_20 = test_dist_idx[:20]
    row = ["%d.jpg" % image_idx for image_idx in top_20]
    pred_rows.append(" ".join(row))

# One CSV line per description: "<i>.txt,<20 space-separated image names>".
with open("test_submission.csv", "w") as f:
    f.write("Descritpion_ID,Top_20_Image_IDs\n")
    for i, row in enumerate(pred_rows):
        f.write("%d.txt,%s\n" % (i, row))

print("Output written!")

### 2. Bag of Words models:##

**a) Preprocessing:**

**Represent the Training Data as a bag of words:**


In [None]:
## Create Bag of words instead of word2vec for training data

train_dev_desc = preprocess(parse_descriptions("descriptions_train", num_doc=(num_train + num_dev)))
test_desc = preprocess(parse_descriptions("descriptions_test", num_doc = num_test))

# Tokenized documents, and the flat list of every token across train+dev.
bag_array = [doc.split() for doc in train_dev_desc]
bag_train_list = [word for doc_words in bag_array for word in doc_words]

# Vocabulary in sorted order so the column layout is the same on every run
# (iteration order of a set of strings is not stable across interpreter runs).
train_dictionary = sorted(set(bag_train_list))

# Per-document {word: count} mappings.
train_dict = []
for doc_words in bag_array:
    unique, counts = np.unique(doc_words, return_counts = True)
    train_dict.append(dict(zip(unique, counts)))

# pandas data frame to store the word counts of each training document:
import pandas as pd

# Fill a dense count matrix directly, visiting only the words that occur in
# each document. This replaces the per-cell chained assignment
# `df[col][i] = value`, which scanned every (word, document) pair and is not
# guaranteed to write through to the frame under pandas Copy-on-Write.
word_to_col = {word: col for col, word in enumerate(train_dictionary)}
train_counts = np.zeros((len(train_dict), len(train_dictionary)), dtype=np.int64)
for doc_idx, word_counts in enumerate(train_dict):
    for word, count in word_counts.items():
        train_counts[doc_idx, word_to_col[word]] = count

train_bag_of_words = pd.DataFrame(train_counts, index=np.arange(len(train_dict)), columns = train_dictionary)


**Now Convert the Test Data into a bag of words**

In [None]:
## Create Bag of words instead of word2vec for testing

# Tokenized test documents, and the flat list of every test token.
bag_test_array = [doc.split() for doc in test_desc]
bag_test_list = [word for doc_words in bag_test_array for word in doc_words]

# Per-document {word: count} mappings.
test_dict = []
for doc_words in bag_test_array:
    unique, counts = np.unique(doc_words, return_counts = True)
    test_dict.append(dict(zip(unique, counts)))

# pandas data frame to store the word counts of each test document:
import pandas as pd

# Columns follow the *training* vocabulary so train and test matrices line up;
# test words that never occurred in training have no column and are skipped.
# The dense matrix is filled directly instead of using the per-cell chained
# assignment `df[col][i] = value`, which scanned every (word, document) pair
# and is not guaranteed to write through under pandas Copy-on-Write.
train_word_to_col = {word: col for col, word in enumerate(train_dictionary)}
test_counts = np.zeros((len(test_dict), len(train_dictionary)), dtype=np.int64)
for doc_idx, word_counts in enumerate(test_dict):
    for word, count in word_counts.items():
        col = train_word_to_col.get(word)
        if col is not None:
            test_counts[doc_idx, col] = count

test_bag_of_words = pd.DataFrame(test_counts, index=np.arange(len(test_dict)), columns = train_dictionary)


**Now Create the Training and Test X Matrices**

In [None]:
# Convert the count data frames to plain arrays.
train_dev_bag = np.array(train_bag_of_words)
test_bag = np.array(test_bag_of_words)

# Same shuffled split as elsewhere in the notebook: the first num_train entries
# of split_idx select training rows, the rest dev rows.
x_train = train_dev_bag[split_idx[:num_train]]
x_dev = train_dev_bag[split_idx[num_train:]]
x_test = test_bag

print("Built all x matrices!")
print("x_train shape:", x_train.shape)
print("x_dev shape:", x_dev.shape)
print("x_test shape:", x_test.shape)

**Y Features**

In [None]:
# Helper: project a matrix onto the first r rows of vh.
def FMatrix_Generator(Matrix, vh, r):
    """Return ``Matrix`` multiplied by the transpose of the first ``r`` rows of ``vh``."""
    leading_rows = vh[:r, :]
    return np.matmul(Matrix, leading_rows.T)

# Number of leading rows of vh kept for the compressed targets; defined once so
# the train and test projections cannot drift apart.
svd_rank = 50

y_original = parse_features("features_train/features_resnet1000intermediate_train.csv")
# Only vh is used below. full_matrices=False skips building the square
# (n_samples x n_samples) u matrix; the first min(n_samples, n_features) rows
# of vh are the same as in the full decomposition.
u, s, vh = np.linalg.svd(y_original, full_matrices = False)
y_train_dev = FMatrix_Generator(y_original, vh, svd_rank)
# Train/dev rows follow the shuffled split_idx.
y_train = y_train_dev[split_idx[:num_train]]

y_dev = y_train_dev[split_idx[num_train:]]

# Test features are projected with the vh computed from the training features.
y_test_original = parse_features("features_test/features_resnet1000intermediate_test.csv")
y_test = FMatrix_Generator(y_test_original, vh, svd_rank)

print("Built all y matrices!")
print("y_train shape:", y_train.shape)
print("y_dev shape:", y_dev.shape)
print("y_test shape:", y_test.shape)

#### b) Models:

**Ridge Regression**

In [None]:
# Ridge regression on the bag-of-words features. The grid holds a single alpha,
# so the 10-fold search only cross-validates that one setting.
# NOTE(review): GridSearchCV and Ridge are imported in an earlier cell of this
# notebook, not in this one.
parameters = {"alpha": [10]}
reg = GridSearchCV(Ridge(), parameters, cv=10)
# reg = MultiTaskElasticNetCV(cv=5, random_state=0)


reg.fit(x_train, y_train)
reg_best = reg.best_estimator_

print("Trained linear regression model!")
print("Summary of best model:")
print(reg_best)

#### c) Performance:

**Now score the models and assess performance on dev**

In [None]:
from scipy.spatial import distance

def cdist_matrix(x1, x2):
    """Pairwise cosine distances; entry (i, j) compares row i of ``x1`` with row j of ``x2``."""
    pairwise = distance.cdist(x1, x2, metric='cosine')
    return pairwise

# test performance on development set
# `reg` is whichever search object was fitted most recently in this notebook.
y_dev_pred = reg.predict(x_dev)
# Row i: cosine distances from the i-th predicted vector to every dev image vector.
dev_distances = cdist_matrix(y_dev_pred, y_dev)
dev_scores = []
dev_pos_list = []

for i in range(num_dev):
    # Image indices ordered from nearest to farthest for description i.
    pred_dist_idx = list(np.argsort(dev_distances[i]))
    # 0-based rank of the matching image (image i belongs to description i).
    dev_pos = pred_dist_idx.index(i)
    dev_pos_list.append(dev_pos)
    # Reciprocal rank when the true image is within the top 20, otherwise 0.
    if dev_pos < 20:
        dev_scores.append(1 / (dev_pos + 1))
    else:
        dev_scores.append(0.0)

print("Development MAP@20:", np.mean(dev_scores))
print("Mean index of true image", np.mean(dev_pos_list))
print("Median index of true image", np.median(dev_pos_list))

**Final Output**

In [None]:
# create test predictions
# Refit the selected estimator on train and dev stacked together.
x_train_all = np.concatenate([x_train, x_dev])
y_train_all = np.concatenate([y_train, y_dev])
reg_best.fit(x_train_all, y_train_all)

# testing to make sure this works
#reg.fit(x_train_all, y_train_all)
#y_test_pred = reg.predict(x_test)

# Distance from every predicted vector to every test image vector.
y_test_pred = reg_best.predict(x_test)
test_distances = cdist_matrix(y_test_pred, y_test)
pred_rows = []

for i in range(num_test):
    # Nearest-first ordering of image indices for description i.
    test_dist_idx = list(np.argsort(test_distances[i]))
    top_20 = test_dist_idx[:20]
    row = ["%d.jpg" % image_idx for image_idx in top_20]
    pred_rows.append(" ".join(row))

# One CSV line per description: "<i>.txt,<20 space-separated image names>".
with open("test_submission.csv", "w") as f:
    f.write("Descritpion_ID,Top_20_Image_IDs\n")
    for i, row in enumerate(pred_rows):
        f.write("%d.txt,%s\n" % (i, row))

print("Output written!")