In [0]:
import numpy as np
import sklearn
import nltk
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
base_path = "drive/My Drive/AML/Final/data/"

In [0]:
def flatten_descriptions(file_contents):
  descriptions = []
  labels = []

  for i, description_set in enumerate(file_contents):
    for description in description_set.split("\n"):
      if len(description) == 0: continue

      descriptions.append(description)
      labels.append(i)

  return descriptions, labels



---



In [0]:
def noun_phrases(text):
  blob = TextBlob(text)
  nouns = [noun[0] for noun in filter(lambda x:x[1]=='NN', blob.tags)]
  noun_phrases = blob.noun_phrases

  return set(nouns+noun_phrases)

In [0]:
def preprocess_descr(descriptions):
    mapped_desc = []

    for description in descriptions:
      nouns = noun_phrases(description)

      noun_only_description = [word for word in description.split(" ") if word in nouns]
      mapped_desc.append(" ".join(noun_only_description))

    return mapped_desc

In [0]:
def preprocess_tags(tags_file_contents):
  comp_tags = []

  for i, tag_set in enumerate(tags_file_contents):
    compound_tag = [tag.split(":")[1] if ":" in tag else "" for tag in tag_set.split("\n")]
    comp_tags.append(" ".join(compound_tag))

  return comp_tags

In [0]:
def bag_nouns_vecs(descriptions):
    mapped_desc = preprocess_descr(descriptions)

    vectorizer = TfidfVectorizer(stop_words="english")
    freq_vecs = vectorizer.fit_transform(mapped_desc)

    return freq_vecs.toarray(), vectorizer



---



In [0]:
def load_res_net(path):
  with open(base_path+path, "r") as f:  
    features = []
    labels = []

    for line in f.readlines():
      comps = line.split(",")
      features.append(comps[1:])
      labels.append(int(comps[0].split("/")[1].split(".")[0]))

    features = np.array(features, dtype=float)
    labels = np.array(labels, dtype=int)

    return features, labels



---



In [0]:
descr_file_contents_ftrain = np.load(base_path+"np_descriptions_train.npy")
descr_file_contents_test = np.load(base_path+"np_descriptions_test.npy")

tags_file_contents_ftrain = np.load(base_path+"np_tags_train.npy")
tags_file_contents_test = np.load(base_path+"np_tags_test.npy")

flattened_descr_ftrain, descr_labels_ftrain = flatten_descriptions(descr_file_contents_ftrain)
flattened_descr_test, descr_labels_test = flatten_descriptions(descr_file_contents_test)

In [0]:
descr_vec_ftrain, descr_vectorizer_train = bag_nouns_vecs(flattened_descr_ftrain)
descr_vec_test = descr_vectorizer_train.transform(preprocess_descr(flattened_descr_test)).toarray()

tags_ftrain, tags_vectorizer_train = bag_nouns_vecs(preprocess_tags(tags_file_contents_ftrain))
tags_test = tags_vectorizer_train.transform(preprocess_tags(tags_file_contents_test)).toarray()

res_net_ftrain, res_net_labels_ftrain = load_res_net("features_train/features_resnet1000_train.csv")
res_net_test, res_net_labels_test = load_res_net("features_test/features_resnet1000_test.csv")

res_net_int_ftrain, _ = load_res_net("features_train/features_resnet1000intermediate_train.csv")
res_net_int_test, _ = load_res_net("features_test/features_resnet1000intermediate_test.csv")

res_net_comp_ftrain = np.concatenate((res_net_ftrain, res_net_int_ftrain), axis=1)
res_net_comp_test = np.concatenate((res_net_test, res_net_int_test), axis=1)

In [0]:
print(descr_vec_ftrain.shape)
print(tags_ftrain.shape)
print(res_net_comp_ftrain.shape)

(50032, 4041)
(10000, 87)
(10000, 3048)


In [0]:
def x_y_sets(Y_filter, X, Y):
    Y_filter = set(Y_filter)
    X_res, Y_res = [], []
    
    for x, y in zip(X, Y):
      if y in Y_filter:
        X_res.append(x)
        Y_res.append(y)

    return np.array(X_res), np.array(Y_res)

In [0]:
descr_vec_train, descr_vec_val, descr_labels_train, descr_labels_val = train_test_split(descr_vec_ftrain, 
                                                                                        descr_labels_ftrain,
                                                                                        test_size=0.10)

res_net_comp_train, res_net_labels_train = x_y_sets(descr_labels_train, res_net_comp_ftrain, res_net_labels_ftrain)
res_net_comp_val, res_net_labels_val = x_y_sets(descr_labels_val, res_net_comp_ftrain, res_net_labels_ftrain)

In [0]:
print(descr_vec_train.shape)
print(descr_vec_val.shape)

print(res_net_comp_train.shape)
print(res_net_comp_val.shape)

(45028, 4041)
(5004, 4041)
(10000, 3048)
(4128, 3048)




---



#### Mapping res_net to desc_vec approach

In [0]:
mapping_X_train = []
mapping_Y_train = descr_vec_train

for label in descr_labels_train:
  res_net_idx = np.where(res_net_labels_train == label)[0][0]
  mapping_X_train.append(res_net_comp_train[res_net_idx])

mapping_X_train = np.array(mapping_X_train)

In [0]:
print(mapping_X_train.shape)
print(mapping_Y_train.shape)

(54174, 3048)
(54174, 4041)


In [0]:
from keras.models import Sequential
from keras.layers import Dense

mapping_model = Sequential()
mapping_model.add(Dense(128, input_dim=mapping_X_train.shape[1], activation='sigmoid'))
mapping_model.add(Dense(128, activation='sigmoid'))
mapping_model.add(Dense(64, activation='sigmoid'))
mapping_model.add(Dense(mapping_Y_train.shape[1], activation='sigmoid'))

mapping_model.compile(loss='mse', optimizer='adam', metrics=['mse'])

In [0]:
mapping_model.fit(mapping_X_train, mapping_Y_train, epochs=2, batch_size=10)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f5266d60278>

In [0]:
mapped_test_vecs = mapping_model.predict(np.array(res_net_comp_val))

In [0]:
def top_k_mapped_similiar(k, mapped_vecs, mapped_vecs_labels, test_vec, actual_label):
    distances = np.linalg.norm(mapped_vecs-test_vec, axis=1)
    #distances = 1-cosine_similarity(mapped_vecs, [test_vec])

    sorted_indices = np.argsort(distances)
    rank = np.where(np.array(mapped_vecs_labels)[sorted_indices] == actual_label)[0][0]

    predicted = np.array(mapped_vecs_labels)[sorted_indices[:k]]
    return predicted, rank

In [0]:
predicted = []
actual = []
ranks = []

index_pool = list(range(descr_vec_val.shape[0]))

for i in random.sample(index_pool, 1000):
  descr_vec, label = descr_vec_val[i], descr_labels_val[i]

  top_k, rank = top_k_mapped_similiar(20, mapped_test_vecs, res_net_labels_val, descr_vec, label)
  predicted.append(label if label in top_k else top_k[0])
  actual.append(label)
  ranks.append(rank)

In [0]:
sklearn.metrics.accuracy_score(actual, predicted)

0.007

In [0]:
np.mean(ranks)

2242.878

#### Similarity approach


In [0]:
def has_two_pos_entries_at_same_idx(v1, v2):
  assert(len(v1) == len(v2))

  for x1, x2 in zip(v1, v2):
    if x1 > 0 and x2 > 0:
      return True

  return False

In [0]:
sim_X_train = []
sim_Y_train = []

for descr_vec, label in zip(descr_vec_train, descr_labels_train):
  res_net_idx = np.where(res_net_labels_train == label)[0][0]
  comp_vec = np.concatenate((descr_vec, res_net_comp_train[res_net_idx]))
  sim_X_train.append(comp_vec)
  sim_Y_train.append(1)

  acceptable = False

  while not acceptable:
    res_net_idx = random.randint(0, res_net_comp_train.shape[0]-1)
    image_label = res_net_labels_train[res_net_idx]
    descr_vec_2_idx = np.where(descr_labels_train == image_label)[0][0]
    descr_vec_2 = descr_vec_train[descr_vec_2_idx]

    if not has_two_pos_entries_at_same_idx(descr_vec, descr_vec_2):
      comp_vec = np.concatenate((descr_vec, res_net_comp_train[res_net_idx]))
      sim_X_train.append(comp_vec)
      sim_Y_train.append(0)
      acceptable = True

sim_X_train = np.array(sim_X_train)
sim_Y_train = np.array(sim_Y_train)

In [0]:
print(sim_X_train.shape)
print(sim_Y_train.shape)

(90056, 7089)
(90056,)


In [0]:
# from keras.models import Sequential
# from keras.layers import Dense

# sim_model = Sequential()
# sim_model.add(Dense(3500, input_dim=sim_X_train.shape[1], activation='relu'))
# sim_model.add(Dense(1, activation='sigmoid'))

# sim_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# sim_model.fit(sim_X_train, sim_Y_train, epochs=180, batch_size=32)

Epoch 1/180
Epoch 2/180
Epoch 3/180
Epoch 4/180
Epoch 5/180
Epoch 6/180
Epoch 7/180
Epoch 8/180
Epoch 9/180
Epoch 10/180
Epoch 11/180
Epoch 12/180
18496/90056 [=====>........................] - ETA: 2:50 - loss: 7.9669 - acc: 0.5003

KeyboardInterrupt: ignored

In [0]:
#sim_model = RandomForestClassifier()
#sim_model.fit(sim_X_train, sim_Y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
def concatenate_per_row(A, B):
    m1,n1 = A.shape
    m2,n2 = B.shape

    out = np.zeros((m1,m2,n1+n2),dtype=A.dtype)
    out[:,:,:n1] = A[:,None,:]
    out[:,:,n1:] = B
    return out.reshape(m1*m2,-1)

In [0]:
def top_k_similiar(k, test_descr_vec, test_vecs, test_labels, actual_label):
    vecs = [np.concatenate((test_descr_vec, test_vecs[i])) for i in range(test_vecs.shape[0])]
    #vecs = concatenate_per_row(np.array([test_descr_vec]), res_net_comp_train)

    similarities = sim_model.predict(np.array(vecs))

    if k is None: k = test_labels.shape[0]

    sorted_indices = np.argsort(similarities)

    rank = test_vecs.shape[0]-np.where(np.array(test_labels)[sorted_indices] == actual_label)[0][0]
    predicted = np.array(test_labels)[sorted_indices[-k:]]
    return predicted, rank

In [0]:
predicted = []
actual = []
ranks = []

index_pool = list(range(descr_vec_val.shape[0]))

for i in random.sample(index_pool, 100):
  descr_vec, label = descr_vec_val[i], descr_labels_val[i]

  top_k, rank = top_k_similiar(100, descr_vec, res_net_comp_val, res_net_labels_val, label)
  predicted.append(label if label in top_k else top_k[0])
  actual.append(label)
  ranks.append(rank)

In [0]:
sklearn.metrics.accuracy_score(actual, predicted)

0.09

In [0]:
np.mean(ranks)

1123.15