In [None]:
import pandas as pd
from matplotlib.ticker import FuncFormatter
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import time
%matplotlib inline

In [None]:
# Load the sampled favorites table (';'-separated, latin encoded). The cells
# below read its `user_id` and `story_id` columns.
# NOTE: this is the file written by the `finalSample.to_csv(...)` cell of the
# re-sampling section further down, so it has to exist before this cell runs.
data = pd.read_csv(
    './datasets_recsys/favorite_stories_books_sample.csv',
    sep=';',
    encoding='latin',
)

In [None]:
data.head()

### Cantidad de veces que aparece como favorita cada historia (p. ej., 2 historias han sido marcadas como favoritas 10 veces, 3 historias lo han sido 7 veces, etc.):

In [None]:
# Step 1: how many times each story was marked as favorite.
favorites_per_story = data['story_id'].value_counts().rename('counts')

# Step 2: how many stories share each favorite count. The result has one row
# per distinct count (ascending) with the columns '# stories' and 'counts'.
story_counts = (
    favorites_per_story.to_frame()
    .groupby('counts')
    .size()
    .reset_index(name='# stories')
    .loc[:, ['# stories', 'counts']]
)

story_counts.head(10)

In [None]:
# Bar chart of the first 20 rows of `story_counts`, i.e. the 20 smallest
# favorite counts (the frame is ordered by ascending 'counts').
story_data = story_counts.head(20)

# `story_counts` only has the columns '# stories' and 'counts'; asking for a
# 'frequency' column raises KeyError, so the bar heights come from '# stories'.
# legend=False replaces creating an empty legend and removing it afterwards.
story_plot = story_data.plot.bar(x='counts', y='# stories', color='orange',
                                 figsize=(20, 10), legend=False)

# x holds the favorite count and y the number of stories with that count.
story_plot.set_xlabel('Number of favorites')
story_plot.set_ylabel('Number of stories')
story_plot.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:,}'.format(y)))


### Cantidad de historias favoritas por usuario (p. ej., 1 solo usuario tiene 5 historias favoritas, 9 usuarios tienen 4, etc.):

In [None]:
# Step 1: how many favorite rows each user has.
favorites_per_user = data['user_id'].value_counts().rename('counts')

# Step 2: how many users share each favorite count. The result has one row
# per distinct count (ascending) with the columns '# users' and 'counts'.
user_counts = (
    favorites_per_user.to_frame()
    .groupby('counts')
    .size()
    .reset_index(name='# users')
    .loc[:, ['# users', 'counts']]
)

user_counts

In [None]:

# Bar chart of the first 100 rows of `user_counts`, i.e. the 100 smallest
# favorite counts (the frame is ordered by ascending 'counts').
user_data = user_counts.head(100)

# `user_counts` only has the columns '# users' and 'counts'; asking for a
# 'frequency' column raises KeyError, so the bar heights come from '# users'.
# legend=False replaces creating an empty legend and removing it afterwards.
user_plot = user_data.plot.bar(x='counts', y='# users', color='orange',
                               figsize=(20, 10), legend=False)

user_plot.set_xlabel('Number of favorite stories')
user_plot.set_ylabel('Number of users')
user_plot.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:,}'.format(y)))

### Distribucion de franchises: 

In [None]:
# Headline numbers of the favorites table currently bound to `data`.
n_users = len(data['user_id'].unique())
n_stories = len(data['story_id'].unique())
n_interactions = len(data['story_id'])

print('users: {}'.format(n_users))
print('stories: {}'.format(n_stories))
print('stories per user: {}'.format(n_stories / n_users))
print('total interactions: {}'.format(n_interactions))


# Análisis de dependencia entre historias favoritas y usuarios.

In [None]:
# Stories table. Despite the "tab_delimited" file name it is parsed with '|'
# as the field separator.
stories = pd.read_csv(
    './datasets_recsys/Archivo/ff_stories_all_tab_delimited.csv',
    sep='|',
    encoding='latin',
)

In [None]:
# Users table. Despite the "tab_delimited" file name it is parsed with '|'
# as the field separator.
users = pd.read_csv(
    './datasets_recsys/Archivo/ff_users_all_tab_delimited.csv',
    sep='|',
    encoding='latin',
)

In [None]:
users.head()

In [None]:
stories.head()

# Re-sample: users with at least 5 favorite stories.


- __N__ = número de usuarios a muestrear.
- __minQFavs__ = cantidad mínima de historias favoritas que deben tener los usuarios.

In [None]:
# Sampling parameters used by the re-sampling cells below.
N = 15_000      # number of users to draw
minQFavs = 5    # minimum number of favorite stories a user must have (inclusive)

In [None]:
# Full favorites table. This rebinds `data`, which the cells above used for
# the sampled file.
data = pd.read_csv(
    './datasets_recsys/ff_favorite_stories_books.csv',
    sep=';',
    encoding='latin',
)

In [None]:
data.shape

Se cuentan la cantidad de historias favoritas de cada usuario:

In [None]:
# Number of favorite stories per user, largest first.
# Only `story_id` is counted and it is selected by name: counting every
# column and then keeping "the first two" only works when `story_id` happens
# to be the first non-key column, and aggregates columns that are never used.
q_favs_df = (
    data.groupby('user_id')['story_id']
        .count()
        .reset_index(name='q_favs')
        .sort_values(by='q_favs', ascending=False)
)

In [None]:
q_favs_df.head()

Se seleccionan solo aquellos usuarios con al menos minQFavs historias favoritas:

In [None]:
# Keep the users whose favorite count reaches the threshold (inclusive).
q_favs_sample = q_favs_df.loc[q_favs_df['q_favs'] >= minQFavs]

In [None]:
q_favs_sample.shape

Se hace un sample de N usuarios sobre los usuarios con al menos minQFavs historias favoritas:

In [None]:
# Draw the user sample.
#  * random_state pins the draw so that re-running the notebook selects the
#    same users (and therefore writes the same sample file).
#  * n is capped at the number of eligible users: sampling without
#    replacement raises ValueError when n exceeds the population size.
q_favs_subsample = q_favs_sample.sample(
    n=min(N, len(q_favs_sample)),
    random_state=42,
)

In [None]:
# Order the sampled users by favorite count (largest first) and renumber the
# rows 0..n-1, discarding the previous index.
q_favs_subsample_sorted = (
    q_favs_subsample
    .sort_values(by='q_favs', ascending=False)
    .reset_index(drop=True)
)

In [None]:
q_favs_subsample_sorted.head()

Visualización del sample. Se puede ver que se comporta igual que el dataset original, por lo tanto, es representativo.

In [None]:
# Row position of each sampled user (sorted by favorite count) against the
# user's number of favorite stories.
plt.scatter(
    x=q_favs_subsample_sorted.index,
    y=q_favs_subsample_sorted['q_favs'],
)

In [None]:
# Ids of the sampled users; their favorite stories are extracted below.
usersID = q_favs_subsample.loc[:, 'user_id']

In [None]:
# Drop the intermediate frames to release some memory (the original author's
# machine was running short).
del q_favs_subsample, q_favs_subsample_sorted, q_favs_sample

In [None]:
# From the full favorites table keep only the rows that belong to the sampled
# users selected above.
finalSample = data.loc[data['user_id'].isin(usersID)]

In [None]:
finalSample.head()

In [None]:
finalSample.shape

In [None]:
# Spot check: is user id 102390 among the sampled ids? The list conversion is
# needed because `in` applied directly to a pandas Series tests the index
# labels, not the values.
102390 in usersID.tolist()

In [None]:
# Persist the sample without the DataFrame index, using the same separator
# and encoding the notebook passes when reading this file back.
finalSample.to_csv(
    './datasets_recsys/favorite_stories_books_sample.csv',
    sep=';',
    encoding='latin',
    index=False,
)

# Análisis de la relación usuario / autores que sigue / historias favoritas

In [None]:
# Author/story table; the cells below use its `author` and `story_id` columns.
authorBooks = pd.read_csv(
    './datasets_recsys/author_story_books.csv',
    sep=';',
    encoding='latin',
)

In [None]:
authorBooks.head()

In [None]:
authorBooks.shape

In [None]:
# Index the table by (author, story_id) so author/story pairs can be looked up
# through the index. The guard keeps the cell re-runnable: once the two
# columns have been moved into the index, calling set_index on them again
# raises KeyError.
if list(authorBooks.index.names) != ['author', 'story_id']:
    authorBooks = authorBooks.set_index(['author', 'story_id'])

In [None]:
authorBooks.head()

In [None]:
# Follow relations; the analysis below matches `Source` against user ids and
# `Target` against authors.
userAuthor = pd.read_csv(
    './datasets_recsys/ff_users_follow_authors.csv',
    sep=';',
    encoding='latin',
)

In [None]:
userAuthor.head()

In [None]:
userAuthor.shape

In [None]:
# Sampled favorites, i.e. the file written by the re-sampling section.
favsBooks = pd.read_csv(
    './datasets_recsys/favorite_stories_books_sample.csv',
    sep=';',
    encoding='latin',
)

In [None]:
favsBooks.head()

In [None]:
favsBooks.shape

In [None]:
# Distinct user ids present in the sampled favorites. This rebinds `usersID`,
# which the re-sampling section used for a pandas Series of ids.
usersID = pd.unique(favsBooks['user_id'])

In [None]:
# Report how many distinct users will be analysed.
n_sampled_users = usersID.size
print('Cantidad de usuarios: {}'.format(n_sampled_users))

In [None]:
# For every sampled user, compute which fraction of their favorite stories was
# written by an author they follow. Each row appended to `new_data` is
#     [user_id, n_favorite_stories, n_followed_authors, followed_ratio]
#
# All users are processed: there is no early `break`, and progress is shown
# with a single tqdm bar instead of one printed line per user.

# story_id -> set of its authors. `authorBooks` is indexed by
# (author, story_id), so iterating the index yields exactly those pairs.
# A set lookup replaces chained `.loc` calls with KeyError as control flow.
authors_by_story = {}
for author, story in authorBooks.index:
    authors_by_story.setdefault(story, set()).add(author)

# Group both tables once instead of re-scanning them with a boolean mask for
# every user.
favs_by_user = favsBooks.groupby('user_id')['story_id'].unique().to_dict()
follows_by_user = userAuthor.groupby('Source')['Target'].unique().to_dict()

new_data = []
t1 = time.time()
for userID in tqdm(usersID, desc='Usuarios'):
    favStories = favs_by_user.get(userID, ())
    authorsFollowed = follows_by_user.get(userID, ())
    followed_set = set(authorsFollowed)

    # A favorite counts as "followed" when at least one of its authors is
    # among the authors this user follows.
    q_followedByAuthor = sum(
        1 for story in favStories
        if not followed_set.isdisjoint(authors_by_story.get(story, ()))
    )

    # Guard against a user without favorites so the ratio never divides by 0.
    n_favs = len(favStories)
    ratio = q_followedByAuthor / n_favs if n_favs else 0.0
    new_data.append([userID, n_favs, len(authorsFollowed), ratio])

print('Usuarios procesados: {} en {:.1f} s'.format(len(new_data), time.time() - t1))