In [38]:
# custom functions for this project
from functions import *

# dataframe libraries
import pandas as pd
import numpy as np

# graphing libraries
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

# miscellany
import re
from collections import Counter
import time
import gzip
import pickle

# reload functions/libraries when edited
%load_ext autoreload
%autoreload 2

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# increase column width of dataframe
pd.set_option('max_colwidth', 150)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
# import necessary dataframes
with gzip.open('data/poetry_all_genres_df.pkl', 'rb') as hello:
    df = pickle.load(hello)

In [15]:
# sort genres into umbrella genres
conditions = [
    df.genre == 'victorian', df.genre == 'romantic',
    df.genre == 'new_york_school', df.genre == 'new_york_school_2nd_generation', df.genre == 'confessional',
        df.genre == 'beat', df.genre == 'harlem_renaissance', df.genre == 'black_arts_movement',
    df.genre == 'imagist', df.genre == 'black_mountain', df.genre == 'language_poetry', df.genre == 'objectivist',
    df.genre == 'georgian', df.genre == 'fugitive'
]

# name umbrella genres, prepare 'other' to be dropped
choices = [
    'pre_1900', 'pre_1900',
    'metropolitan', 'metropolitan', 'metropolitan', 'metropolitan', 'metropolitan', 'metropolitan',
    'avant_garde', 'avant_garde', 'avant_garde', 'avant_garde',
    'other', 'other'
]

# convert data, leaving 'modern' untouched
df['umbrella_genre'] = np.select(conditions, choices, df.genre)

In [124]:
# take necessary (and potentially necessary columns)
df = df[['title', 'poet', 'genre', 'umbrella_genre', 'poem_url', 'poet_url', 'poem_string',
         'num_lines', 'num_words', 'avg_len_line', 'sentiment_polarity_score', 'sentiment_polarity',
         'sentiment_subjectivity_score', 'num_end_rhymes', 'end_rhyme_ratio', 'end_rhyme', 'avg_syllables_word']]

# drop poems in 'other' umbrella genre
df = df[df.umbrella_genre != 'other']

# reset the index
df.reset_index(drop=True, inplace=True)

In [8]:
from surprise import Dataset
from surprise import Reader

# This is the same data that was plotted for similarity earlier
# with one new user "E" who has rated only movie 1
ratings_dict = {
    "item": [1, 2, 1, 2, 1, 2, 1, 2, 1],
    "user": ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E'],
    "rating": [1, 2, 2, 4, 2.5, 4, 4.5, 5, 3],
}

df = pd.DataFrame(ratings_dict)
reader = Reader(rating_scale=(1, 5))

# Loads Pandas dataframe
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)
# Loads the builtin Movielens-100k data
movielens = Dataset.load_builtin('ml-100k')

In [29]:
data.df

Unnamed: 0,user,item,rating
0,A,1,1.0
1,A,2,2.0
2,B,1,2.0
3,B,2,4.0
4,C,1,2.5
5,C,2,4.0
6,D,1,4.5
7,D,2,5.0
8,E,1,3.0


In [173]:
# create empty dataframe that we can populate with id info later
df_ids = pd.DataFrame(columns=['poem_id', 'poet_id', 'genre_id', 'umbrella_id'], index=range(0,4376))

In [174]:
# create empty dataframes for each variable that we can populate with ratings later
df_ratings_poem = pd.DataFrame(columns=['user_id', 'poem_id', 'rating'])
df_ratings_poet = pd.DataFrame(columns=['user_id', 'poet_id', 'rating'])
df_ratings_genre = pd.DataFrame(columns=['user_id', 'genre_id', 'rating'])
df_ratings_umbrella = pd.DataFrame(columns=['user_id', 'umbrella_id', 'rating'])

In [175]:
# create dictionaries with unique ids assigned to each unique variable
poet_ids = {poet: i+1 for i, poet in enumerate(df.poet.unique())}
genre_ids = {genre: i+1 for i, genre in enumerate(df.genre.unique())}
umbrella_ids = {umbrella: i+1 for i, umbrella in enumerate(df.umbrella_genre.unique())}

In [176]:
# populate ids dataframe, to be called upon later
for i, poem in df.iterrows():
    df_ids.loc[i, 'poem_id'] = i+1
    df_ids.loc[i, 'poet_id'] = poet_ids[poem.poet]
    df_ids.loc[i, 'genre_id'] = genre_ids[poem.genre]
    df_ids.loc[i, 'umbrella_id'] = umbrella_ids[poem.umbrella_genre]

# # set poem_id as the index
# df_ids.set_index('poem_id', drop=True, inplace=True)

In [168]:
df_ids.head()

Unnamed: 0,poem_id,poet_id,genre_id,umbrella_id
0,1,1,1,1
1,2,1,1,1
2,3,1,1,1
3,4,1,1,1
4,5,1,1,1


In [135]:
df_ratings_poem.columns[1]

'poem_id'

In [None]:
class User(object):
    
    def __init__(self):
        self.favorite_poems = []
        self.favorite_poets = []
        self.favorite_genres = []
        self.least_favorite_poems = []
        self.least_favorite_poets = []
        self.least_favorite_genres = []
        
    def generate_favorite(self, category):
        if category == 'genre':
            self.favorite_genres.append()
        
    def get_vaccinated(self, pct_vaccinated):
        if (1 - pct_vaccinated) < np.random.random():
            self.is_vaccinated = True

In [138]:
from numpy.random import randint

In [157]:
len(genre_ids)

13

In [None]:
row = [iname, ipassword, iemail]
df.loc[len(df)] = row

In [153]:
for i in range(10):
    print(randint(1,6))

5
1
3
5
2
1
4
1
3
2


In [159]:
len(df_ids['genre_id'].unique())

13

In [154]:
row = [1, randint(1, len(genre_ids)+1), randint(1,6)]
row

[1, 9, 4]

In [155]:
df_ratings_genre.loc[len(df_ratings_genre)] = row

df_ratings_genre

Unnamed: 0,user_id,genre_id,rating
0,1,9,4


In [179]:
def generate_user_ratings(num_users, ids_df, ratings_dfs):
    for i in range(num_users):
        user_id = i+1
        for df in ratings_dfs:
            target = df.columns[1]
            target_total = len(ids_df[target].unique())
            num_ratings = randint(1, ceil((target_total+1) / 4))
            for i in range(num_ratings):
                row = [user_id, randint(1, len(df_ids['genre_id'].unique())+1), randint(1,6)]
                df.loc[len(df)] = row

In [181]:
from math import ceil

In [182]:
ratings_dfs = [df_ratings_poem, df_ratings_poet, df_ratings_genre, df_ratings_umbrella]

generate_user_ratings(10, df_ids, ratings_dfs)

In [183]:
df_ratings_genre

Unnamed: 0,user_id,genre_id,rating
0,1,7,1
1,1,5,5
2,1,4,4
3,2,2,4
4,2,9,4
5,2,4,1
6,3,2,3
7,4,1,5
8,4,2,4
9,4,3,2


In [185]:
df_ratings_poet

Unnamed: 0,user_id,poet_id,rating
0,1,6,2
1,1,5,4
2,1,1,2
3,1,8,2
4,1,8,4
...,...,...,...
377,10,8,3
378,10,8,5
379,10,10,2
380,10,9,2


In [186]:
df_ratings_poem

Unnamed: 0,user_id,poem_id,rating
0,1,6,4
1,1,2,5
2,1,6,5
3,1,5,1
4,1,13,5
...,...,...,...
4858,10,7,2
4859,10,4,3
4860,10,1,3
4861,10,1,3


In [187]:
df_ratings_umbrella

Unnamed: 0,user_id,umbrella_id,rating
0,1,3,4
1,2,1,5
2,3,4,3
3,4,6,1
4,5,2,1
5,6,8,1
6,7,2,2
7,8,7,2
8,9,4,4
9,10,13,3
