In [167]:
import numpy as np
import sklearn
import nltk
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import random
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [168]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
base_path = "drive/My Drive/AML/Final/data/"

In [0]:
s = "The skateboarder is putting on a show using the picnic table as his stage. An apple, picnic."

In [0]:
def noun_phrases(text):
  blob = TextBlob(text)
  nouns = [noun[0] for noun in filter(lambda x:x[1]=='NN', blob.tags)]
  noun_phrases = blob.noun_phrases

  return set(nouns+noun_phrases)

In [172]:
noun_phrases(s)

{'apple', 'picnic', 'picnic table', 'show', 'skateboarder', 'stage', 'table'}

In [0]:
train_description_file_contents = np.load(base_path+"np_descriptions_train.npy")
test_description_file_contents = np.load(base_path+"np_descriptions_test.npy")

train_tags_file_contents = np.load(base_path+"np_tags_train.npy")
test_tags_file_contents = np.load(base_path+"np_tags_test.npy")

In [174]:
train_tags_file_contents[0]

'vehicle:airplane\noutdoor:bench\nsports:skateboard\nperson:person\nvehicle:truck\naccessory:backpack\naccessory:handbag\nfurniture:dining table\n'

In [0]:
def process_descriptions(descriptions_file_contents):
  comp_descriptions = []

  for i, description_set in enumerate(descriptions_file_contents): 
      compound_description = description_set.replace("\n", " ")
      nouns = noun_phrases(compound_description)

      noun_only_comp_description = [word for word in compound_description.split(" ") if word in nouns]
      comp_descriptions.append(" ".join(noun_only_comp_description))

  vectorizer = TfidfVectorizer(stop_words="english")
  freq = vectorizer.fit_transform(comp_descriptions)

  return freq, vectorizer

In [0]:
def process_indiv_descriptions(descriptions_file_contents):
  descriptions = []
  labels = []

  for i, description_set in enumerate(descriptions_file_contents): 
    for description in description_set.split("\n"):
      nouns = noun_phrases(description)

      noun_only_description = [word for word in description.split(" ") if word in nouns]
      descriptions.append(" ".join(noun_only_description))
      labels.append(i)

  vectorizer = TfidfVectorizer(stop_words="english")
  freq = vectorizer.fit_transform(descriptions)

  return freq.toarray(), descriptions, labels, vectorizer

In [0]:
def process_tags(tags_file_contents):
  tags = []

  for i, tag_set in enumerate(tags_file_contents):
    compound_tag = [tag.split(":")[1] if ":" in tag else "" for tag in tag_set.split("\n")]
    tags.append(" ".join(compound_tag))

  vectorizer = CountVectorizer()
  counts = vectorizer.fit_transform(tags)
  return counts, tags, vectorizer

In [0]:
def similarity_score(s1, s2):
  return len(set(s1.split(" ")).intersection(set(s2.split(" "))))

In [0]:
desc_bagw_train, descriptions_train, desc_labels_train, desc_vectorizer_train = process_indiv_descriptions(train_description_file_contents)

tags_bagw_train, tags_train, tags_vectorizer_train = process_tags(train_tags_file_contents)

In [0]:
def load_res_net(path):
  with open(base_path+path, "r") as f:  
    train = []
    labels = []

    for line in f.readlines():
      comps = line.split(",")
      train.append(comps[1:])
      labels.append(int(comps[0].split("/")[1].split(".")[0]))

    train = np.array(train, dtype=float)
    labels = np.array(labels, dtype=int)

    return train, labels

In [0]:
res_net_train, res_net_labels_train = load_res_net("features_train/features_resnet1000_train.csv")
res_net_test, res_net_labels_test = load_res_net("features_test/features_resnet1000_test.csv")

In [182]:
res_net_labels_train[0]

5373

In [0]:
mapping_X_train = []
mapping_Y_train = desc_bagw_train

for label in desc_labels_train:
  res_net_idx = np.argwhere(res_net_labels_train == label)[0][0]
  mapping_X_train.append(res_net_train[res_net_idx])

mapping_X_train = np.array(mapping_X_train)

In [0]:
# from sklearn.ensemble import RandomForestRegressor

# ranforest = RandomForestRegressor(n_estimators=500, max_depth=18)
# ranforest.fit(mapping_X_train, mapping_Y_train)

In [0]:
from keras.models import Sequential
from keras.layers import Dense

mapping_model = Sequential()
mapping_model.add(Dense(128, input_dim=mapping_X_train.shape[1], activation='sigmoid'))
mapping_model.add(Dense(128, activation='sigmoid'))
mapping_model.add(Dense(64, activation='sigmoid'))
mapping_model.add(Dense(mapping_Y_train.shape[1], activation='sigmoid'))

mapping_model.compile(loss='mse', optimizer='adam', metrics=['mse'])

In [0]:
#mapping_model.fit(mapping_X_train, mapping_Y_train, epochs=1, batch_size=10)

In [0]:
def top_k_similiar(k, mapped_vecs, mapped_vecs_labels, test_vec):
    distances = np.linalg.norm(mapped_vecs-test_vec)
    sorted_indices = np.argsort(distances)
    predicted = mapped_vecs_labels[sorted_indices[:k]]
    return predicted

In [0]:
desc_bagw_test, descriptions_test, desc_labels_test, _ = process_indiv_descriptions(test_description_file_contents)
#desc_bagw_test = desc_vectorizer_train.transform(descriptions_test)

In [0]:
mapped_test_vecs = desc_bagw_test #mapping_model.predict(res_net_test)

In [230]:
len(mapped_test_vecs[0])

1931

In [231]:
desc_bagw_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [232]:
predicted = []
actual = []

for description_vec, label in random.sample(list(zip(desc_bagw_test, desc_labels_test)), 100):
  top_k = top_k_similiar(30, mapped_test_vecs, np.array(desc_labels_test), description_vec)
  print(label in top_k)
  predicted.append(label if label in top_k else top_k[0])
  actual.append(label)

False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


KeyboardInterrupt: ignored

In [218]:
sklearn.metrics.accuracy_score(actual, predicted)

0.0