In [1]:
! pip install lightfm > /dev/null

In [2]:
import pandas as pd
import lightfm as lfm
from lightfm import data
from lightfm import cross_validation
from lightfm import evaluation



In [3]:
df_int = pd.read_csv("data/movies_lists_data/all_movies_lists_interactions.csv")
df_int.head()

Unnamed: 0.1,Unnamed: 0,list_id,movie_link,movie_stars,movie_calification,movie_position
0,0,5679698,/film/do-the-right-thing/,★★★★★,5.0,1.0
1,1,5679698,/film/singin-in-the-rain/,★★★★★,5.0,2.0
2,2,5679698,/film/nashville/,★★★★★,5.0,3.0
3,3,5679698,/film/the-wizard-of-oz-1939/,★★★★★,5.0,4.0
4,4,5679698,/film/the-apartment/,★★★★★,5.0,5.0


In [4]:
ds = lfm.data.Dataset()
ds.fit(users=df_int["list_id"].unique(), items=df_int["movie_link"].unique())
ds.interactions_shape()

(332406, 650368)

In [5]:
(interactions, weights) = ds.build_interactions(df_int[["list_id", "movie_link", "movie_calification"]].itertuples(index=False))

In [6]:
(train, test) = lfm.cross_validation.random_train_test_split(interactions, test_percentage=0.2, random_state=42)
(train_w, test_w) = lfm.cross_validation.random_train_test_split(weights, test_percentage=0.2, random_state=42)


model = lfm.LightFM(no_components=10, k=5, n=10, learning_schedule='adagrad', loss='logistic', learning_rate=0.05, rho=0.95, epsilon=1e-06, item_alpha=0.0, user_alpha=0.0, max_sampled=10, random_state=42)
model.fit(train, sample_weight=train_w, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x2abaade10>

In [7]:
train_precision = lfm.evaluation.precision_at_k(model, train, k=10, num_threads=2)
test_precision = lfm.evaluation.precision_at_k(model, test, k=10, num_threads=2)

print("Precision@10 en training:", train_precision.mean())
print("Precision@10 en testing:", test_precision.mean())

# Item features

In [None]:
df_items = pd.read_csv("libros.csv", usecols=["id_libro", "autor", "genero"])
df_items.head()

Unnamed: 0,id_libro,autor,genero
0,las-particulas-elementales,"HOUELLEBECQ, MICHEL",Narrativa
1,quien-domina-el-mundo,"CHOMSKY, NOAM",Ensayo
2,antimanual-de-filosofia,"ONFRAY, MICHEL",Ensayo
3,momentos-estelares-de-la-humanidad-catorce-min...,"ZWEIG, STEFAN",Histórica y aventuras
4,la-invencion-del-pasado,"MURADO, MIGUEL-ANXO",Lecturas complementarias


In [None]:
ds = lfm.data.Dataset()
item_features = df_items["autor"].unique().tolist() + df_items["genero"].unique().tolist()
ds.fit(users=df_int["id_lector"].unique(), items=df_int["id_libro"].unique(), item_features=item_features)
len(item_features)

9628

In [None]:
ifs = []

for index, row in df_items.iterrows():
    ifs.append( (row["id_libro"], (row["autor"], row["genero"]))  )
    #ifs.append( (row["id_libro"], {row["autor"]:0.1, row["genero"]: 0.9} )  )

item_features = ds.build_item_features(ifs)
item_features

<22269x31896 sparse matrix of type '<class 'numpy.float32'>'
	with 66770 stored elements in Compressed Sparse Row format>

In [None]:
(interactions, weights) = ds.build_interactions(df_int[["id_lector", "id_libro", "rating"]].itertuples(index=False))

In [None]:
(train, test) = lfm.cross_validation.random_train_test_split(interactions, test_percentage=0.2, random_state=42)
(train_w, test_w) = lfm.cross_validation.random_train_test_split(weights, test_percentage=0.2, random_state=42)

model = lfm.LightFM(no_components=10, k=5, n=10, learning_schedule='adagrad', loss='logistic', learning_rate=0.05, rho=0.95, epsilon=1e-06, item_alpha=0.0, user_alpha=0.0, max_sampled=10, random_state=42)
model.fit(train, sample_weight=train_w, item_features=item_features, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x7dc91d42b670>

In [None]:
train_precision = lfm.evaluation.precision_at_k(model, train, item_features=item_features, k=10, num_threads=2)
test_precision  = lfm.evaluation.precision_at_k(model, test,  item_features=item_features, k=10, num_threads=2)

print("Precision@10 en training:", train_precision.mean())
print("Precision@10 en testing:", test_precision.mean())

Precision@10 en training: 0.19017199
Precision@10 en testing: 0.049333338


# Predicción

In [None]:
user_id_map, user_feature_map, item_id_map, item_feature_map = ds.mapping()
user_id_map

In [None]:
model = lfm.LightFM(no_components=20, random_state=42)
model.fit(interactions, sample_weight=weights, item_features=item_features, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x7dc91d363ac0>

In [None]:
id_lector = "popocito"
id_libros = ["yerma", "el-hobbit", "el-silmarillion"]

model.predict(user_id_map[id_lector], [item_id_map[l] for l in id_libros], item_features=item_features, num_threads=2)

array([ 7.5560455, 10.598244 , 10.365233 ], dtype=float32)

In [None]:
libros_leidos = df_int.loc[df_int["id_lector"] == id_lector, "id_libro"].tolist()
todos_los_libros = df_items["id_libro"].tolist()

libros_no_leidos = set(todos_los_libros).difference(libros_leidos)
predicciones = model.predict(user_id_map[id_lector], [item_id_map[l] for l in libros_no_leidos], item_features=item_features, num_threads=2)

sorted([(p, l) for (p, l) in zip(predicciones, libros_no_leidos)], reverse=True)[:10]
# for libro in unique():
#     df_int[""]

[(15.678148, 'stoner'),
 (15.670301, 'la-edad-de-la-ira'),
 (14.598097, 'africanus-el-hijo-del-consul'),
 (13.218816, 'el-nino-con-el-pijama-de-rayas'),
 (12.853111, 'reina-roja'),
 (12.831611, 'la-novia-gitana'),
 (12.807747, 'la-red-purpura'),
 (12.798654, 'el-silencio-de-la-ciudad-blanca-trilogia-de-la-ciudad-blanca-1'),
 (12.787656, 'el-psicoanalista'),
 (12.779596, 'el-paciente')]