Merge pull request #2 from adiIspas/lightfm-version
Lightfm version
Showing 28 changed files with 20,773 additions and 0 deletions.
@@ -0,0 +1,57 @@

import os
import pandas as pd
import csv
import shutil

dataset = '../../../king-rec-dataset/ml-latest-small/images/'
number_of_clusters = 7
model = 'vgg16'
clusters_dir = '../../../king-rec-dataset/ml-latest-small/results/clusters/sanity-check/' + model + '/' + str(number_of_clusters) + '/'


def collect_posters():
    # expects columns: '0' = movie id, '1' = poster index, 'cluster_<n>' = assigned cluster
    data = pd.read_csv('sanity_check_movies_1_poster_clusters_' + model + '.csv')

    # create one directory per cluster
    for idx in range(1, number_of_clusters + 1):
        os.makedirs(clusters_dir + str(idx), exist_ok=True)

    # copy each poster into its associated cluster directory
    for index, row in data.iterrows():
        src = dataset + str(int(row['0'])) + '/posters/' + str(int(row['1'])) + '.jpg'
        dest = clusters_dir + str(int(row['cluster_' + str(number_of_clusters)]) + 1) + '/' + str(int(row['0'])) + '_' + str(int(row['1'])) + '.jpg'

        if os.path.isfile(src):
            shutil.copy(src, dest)

    print('Done')


collect_posters()

dataset2 = '../../../king-rec-dataset/ml-latest-small/'


def get_items_ids():
    item_ids = set()

    with open(dataset2 + 'movies.csv', 'r') as movies_file:
        reader = csv.reader(movies_file, delimiter=',')
        next(reader)  # skip header

        for row in reader:
            item_ids.add(int(row[0]))

    return item_ids


def count_movies():
    movies = get_items_ids()

    idx = 1
    for item in movies:
        print(idx, item)
        idx = idx + 1


# count_movies()
@@ -0,0 +1,88 @@

import numpy as np
import matplotlib.pyplot as plt
import csv
import pandas as pd

from sklearn import metrics
from sklearn.cluster import KMeans


dataset = '../../../king-rec-dataset/ml-latest-small/'


def get_items_ids():
    item_ids = set()

    with open(dataset + 'movies.csv', 'r') as movies_file:
        reader = csv.reader(movies_file, delimiter=',')
        next(reader)  # skip header

        for row in reader:
            item_ids.add(int(row[0]))

    return item_ids


def explore_clusters():
    clusters = range(2, 22, 2)
    models_results = dict()
    colors = ['r', 'y', 'b', 'g', 'c']

    models = ['vgg16', 'vgg19', 'inception_v3', 'resnet50', 'NASNet']

    for model in models:
        print('Reading data ...')
        feature_list = np.loadtxt('./posters_features/1000-movies/' + model + '1000-movies_1-posters.csv', delimiter=',')
        print('Finished reading data.')

        # the first two columns hold the movie id and poster index; the rest are features
        movie_poster_clusters = pd.DataFrame(feature_list[:, :2])

        feature_list = feature_list[:, 2:]
        feature_list_np = np.array(feature_list)
        for n_clusters in clusters:
            k_means = KMeans(n_clusters=n_clusters).fit(feature_list_np)

            name = model
            result = metrics.silhouette_score(feature_list_np, k_means.labels_)

            results = models_results.get(name, [])

            cluster_name = 'cluster_' + str(n_clusters)
            movie_poster_clusters[cluster_name] = pd.Series(k_means.labels_)

            results.append(result)
            models_results[name] = results
            print('silhouette score on', name, 'with', n_clusters, 'clusters:', result)

        movie_poster_clusters.to_csv('movies_1_poster_clusters_' + name + '.csv')

    # grouped bar chart: one bar per model at each cluster count
    n_groups = len(list(clusters))
    index = np.arange(n_groups)
    bar_width = 0.15
    current_index = 0

    for key, values in models_results.items():
        plt.bar(index + bar_width * current_index, values, bar_width,
                color=colors[current_index],
                label=key)
        current_index += 1

    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette score')
    plt.title('Silhouette score by model')
    plt.xticks(index + bar_width, list(clusters))
    plt.legend()
    plt.tight_layout()
    plt.savefig('silhouette-score.jpg')
    plt.show()


def main():
    explore_clusters()


if __name__ == "__main__":
    main()
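
On larger feature matrices the silhouette computation itself can dominate the runtime; scikit-learn supports estimating the score on a random subsample. A minimal sketch of that variation, not part of this commit (the sample_size and random_state values are illustrative):

    # estimate the silhouette score on 1,000 randomly drawn samples
    result = metrics.silhouette_score(feature_list_np, k_means.labels_,
                                      sample_size=1000, random_state=42)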
@@ -0,0 +1,80 @@

import csv
import numpy as np
import pandas as pd

from sklearn.cluster import MiniBatchKMeans


dataset = '../../../king-rec-dataset/ml-latest-small/'


def get_items_ids():
    item_ids = set()

    with open(dataset + 'movies.csv', 'r') as movies_file:
        reader = csv.reader(movies_file, delimiter=',')
        next(reader)  # skip header

        for row in reader:
            item_ids.add(int(row[0]))

    return item_ids


def explore_clusters():
    batch_size = 40

    # models = ['vgg16', 'vgg19', 'inception_v3', 'resnet50', 'NASNet']
    models = ['resnet50']

    for model in models:
        # csv_path = './' + model + '-1-posters.csv'
        csv_path = './posters_features/sanity-check/' + model + '-sanity-check.csv'

        movie_poster_clusters = pd.DataFrame()
        for n_clusters in [7]:
            final_clusters = pd.Series(dtype=np.int64)
            print('Processing cluster', n_clusters)

            k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size, compute_labels=True)

            # first pass: incrementally fit the model one chunk at a time
            reader_chunks = pd.read_csv(csv_path, delimiter=',', header=None, chunksize=batch_size)
            for chunk in reader_chunks:
                print('Processing chunk ...')

                feature_list = pd.DataFrame(data=chunk)

                # keep the movie id / poster index columns for the output CSV
                movie_poster_clusters = pd.concat([movie_poster_clusters, feature_list.iloc[:, :2]])

                feature_list = feature_list.iloc[:, 2:]
                feature_list_np = np.array(feature_list)

                k_means.partial_fit(feature_list_np)

            # second pass: predict with the fully fitted model, since the labels
            # kept by partial_fit only cover the most recent mini-batch
            reader_chunks = pd.read_csv(csv_path, delimiter=',', header=None, chunksize=batch_size)
            for chunk in reader_chunks:
                print('Predicting chunk ...')

                feature_list = pd.DataFrame(data=chunk)

                feature_list = feature_list.iloc[:, 2:]
                feature_list_np = np.array(feature_list)

                labels = k_means.predict(feature_list_np)

                final_clusters = pd.concat([final_clusters, pd.Series(labels)])

            name = model

            cluster_name = 'cluster_' + str(n_clusters)
            movie_poster_clusters[cluster_name] = pd.Series(final_clusters.values, index=movie_poster_clusters.index)

        movie_poster_clusters.to_csv('test-chunk-movies_1_poster_clusters_' + name + '.csv')


def main():
    explore_clusters()


if __name__ == "__main__":
    main()
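
A quick sanity check on the chunked run above, sketched here rather than part of the commit (final_clusters as produced by the predict pass):

    import numpy as np

    # count how many posters landed in each of the 7 clusters
    counts = np.bincount(final_clusters.values.astype(int), minlength=7)
    print(dict(enumerate(counts)))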
@@ -0,0 +1,85 @@

import os
import csv
import sys
import requests
import urllib.request

# the TMDB API key is passed as the first command-line argument
api_key = sys.argv[1]

dataset = '../../king-rec-dataset/ml-latest-small/'
tmdb_api = 'https://api.themoviedb.org/3/movie/$MOVIE_ID/images?include_image_language=en,null&api_key=$API_KEY'
tmdb_images_url = 'https://image.tmdb.org/t/p/original/'


def get_tmdb_posters(tmdb_api_key, max_movie_index=10):
    tmdb_movies_id = get_tmdb_ids()
    download_images(tmdb_api_key, tmdb_movies_id, max_movie_index)

    return tmdb_movies_id


def download_images(tmdb_api_key, tmdb_movies_id, max_movie_index=10):
    images = dataset + 'images/'

    movie_index = 1
    total_movies = len(tmdb_movies_id)

    for key, value in tmdb_movies_id.items():
        posters = images + str(key) + '/posters/'
        backdrops = images + str(key) + '/backdrops/'

        if not os.path.exists(posters):
            os.makedirs(posters)

        if not os.path.exists(backdrops):
            os.makedirs(backdrops)

        # only query the API for movies whose images have not been downloaded yet
        if len(os.listdir(posters)) == 0 or len(os.listdir(backdrops)) == 0:
            current_url = tmdb_api.replace('$MOVIE_ID', str(value)).replace('$API_KEY', tmdb_api_key)
            response = requests.get(current_url)

            if response.status_code == 200:
                json = response.json()

                if len(os.listdir(posters)) == 0:
                    image_idx = 1
                    for poster in json['posters']:
                        if poster['iso_639_1'] == 'en':
                            print(movie_index, '/', total_movies, '- Process movie', value, 'and poster', image_idx)
                            poster_url = poster['file_path']
                            urllib.request.urlretrieve(tmdb_images_url + poster_url, posters + str(image_idx) + '.jpg')
                            image_idx += 1

                if len(os.listdir(backdrops)) == 0:
                    image_idx = 1
                    for backdrop in json['backdrops']:
                        if backdrop['iso_639_1'] == 'xx' or backdrop['iso_639_1'] is None:
                            print(movie_index, '/', total_movies, '- Process movie', value, 'and backdrop', image_idx)
                            backdrop_url = backdrop['file_path']
                            urllib.request.urlretrieve(tmdb_images_url + backdrop_url,
                                                       backdrops + str(image_idx) + '.jpg')
                            image_idx += 1

            else:
                print('Status code:', response.status_code, 'on movie', key, '-', value)

        if movie_index == max_movie_index:
            break

        movie_index += 1


def get_tmdb_ids(tmdb_index=2):
    links = dataset + 'links.csv'
    with open(links, 'r') as links_file:
        reader = csv.reader(links_file, delimiter=',')
        next(reader)  # skip header

        tmdb_movies_id = dict()
        for row in reader:
            # map the MovieLens movie id to its TMDB id
            tmdb_movies_id.update({row[0]: row[tmdb_index]})

    return tmdb_movies_id


get_tmdb_posters(api_key, max_movie_index=20)
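
For reference, the image-list call that the script assembles by string substitution can equally be expressed through requests' own parameter handling; a minimal sketch, assuming the same endpoint and the api_key variable defined above (movie_id is a hypothetical example value):

    import requests

    movie_id = 603  # hypothetical TMDB id, for illustration only
    url = 'https://api.themoviedb.org/3/movie/' + str(movie_id) + '/images'
    response = requests.get(url, params={'include_image_language': 'en,null', 'api_key': api_key})
    images_json = response.json() if response.status_code == 200 else None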
@@ -0,0 +1,81 @@

import os
import numpy as np
import csv
import pandas as pd

from keras.layers import Input
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19
from keras.applications.inception_v3 import InceptionV3
from keras.applications.resnet50 import ResNet50
from keras.applications.nasnet import NASNetLarge
from keras.applications.imagenet_utils import preprocess_input

dataset = '../../../king-rec-dataset/ml-latest-small/'
base_path = 'images/'
# base_path = 'clusters_sanity_check/'
max_posters_per_movie = 1


def get_int(filename):
    return int(filename.split('.')[0])


def get_items_ids():
    item_ids = set()

    with open(dataset + 'movies.csv', 'r') as movies_file:
        reader = csv.reader(movies_file, delimiter=',')
        next(reader)  # skip header

        for row in reader:
            item_ids.add(int(row[0]))

    return item_ids


def extract_images_features():
    movies = list(get_items_ids())
    # movies = [1, 3, 4, 5, 7, 19, 22, 23]
    subdir = [dataset + base_path + str(movie) + '/posters/' for movie in movies]
    models = [
        VGG16(weights='imagenet', include_top=False),
        VGG19(weights='imagenet', include_top=False),
        InceptionV3(weights='imagenet', include_top=False),
        ResNet50(weights='imagenet', include_top=False),
        NASNetLarge(weights='imagenet', include_top=False, input_tensor=Input(shape=(224, 224, 3)))
    ]
    total_movies = len(subdir)
    for current_movie, dirname in enumerate(subdir):
        movie_idx = int([s for s in dirname.split('/') if s.isdigit()][0])
        filenames = sorted(os.listdir(dirname), key=get_int)[0:max_posters_per_movie]

        for _, file_name in enumerate(filenames):
            poster_idx = int(file_name.split('.')[0])

            img = image.load_img(dirname + '/' + file_name, target_size=(224, 224))
            img_data = image.img_to_array(img)
            img_data = np.expand_dims(img_data, axis=0)
            # note: the generic imagenet_utils preprocess_input is applied to every
            # model here, although InceptionV3 and NASNet ship their own variants
            img_data = preprocess_input(img_data)

            for model in models:
                feature = model.predict(img_data)
                feature_np = np.array(feature)
                feature = feature_np.flatten()

                # prepend the movie id and poster index, then append one row per model
                data_to_save = np.append([movie_idx, poster_idx], feature)
                data = pd.DataFrame([data_to_save])
                data.to_csv(model.name + '-' + str(max_posters_per_movie) + '-posters' + '.csv',
                            mode='a', sep=',', index=False, header=False)

                print(str(current_movie + 1) + '/' + str(total_movies) + ':', 'movie id:', movie_idx, ' poster id:', poster_idx,
                      ' model name:', model.name, ' total features:', len(feature))


def main():
    extract_images_features()


if __name__ == "__main__":
    main()
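
One caveat worth noting: each Keras application ships a matching preprocess_input (InceptionV3, for example, expects inputs scaled to [-1, 1]), while the script applies the generic imagenet_utils version to all five networks. A minimal sketch of per-model preprocessing, assuming the standard keras.applications modules (the pairing loop is an illustration, not part of this commit; img_data is the batch prepared in the script above):

    from keras.applications import vgg16, vgg19, inception_v3, resnet50, nasnet

    # pair each network with its own preprocessing function
    model_specs = [
        (vgg16.VGG16(weights='imagenet', include_top=False), vgg16.preprocess_input),
        (vgg19.VGG19(weights='imagenet', include_top=False), vgg19.preprocess_input),
        (inception_v3.InceptionV3(weights='imagenet', include_top=False), inception_v3.preprocess_input),
        (resnet50.ResNet50(weights='imagenet', include_top=False), resnet50.preprocess_input),
        (nasnet.NASNetLarge(weights='imagenet', include_top=False), nasnet.preprocess_input),
    ]

    for model, preprocess in model_specs:
        # copy so each model preprocesses the raw pixel values, not already-scaled ones
        feature = model.predict(preprocess(img_data.copy()))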