In [2]:
import datetime
import numpy as np
import os
import pandas as pd
import shutil
import urllib.request
import zipfile


__all__ = [
    'fetch_ml_ratings',
]

VARIANTS = {
    '100k': {'filename': 'u.data', 'sep': '\t'},
    '1m': {'filename': 'ratings.dat', 'sep': r'::'},
    '10m': {'filename': 'ratings.dat', 'sep': r'::'},
    '20m': {'filename': 'ratings.csv', 'sep': ','}
}
def fetch_ml_ratings(data_dir_path=None, variant='20m', verbose=False):
    """Fetches MovieLens ratings dataset.

    Parameters
    ----------
    data_dir_path : str, default=None
        Explicit data directory path to MovieLens ratings file.
    variant : {'100k', '1m', '10m', '20m'}, default='20m'
        Movie lens dataset variant.
    verbose : bool, default=False
        Whether or not downloading and unzipping the dataset with verbose.

    Returns
    -------
    df : pandas.DataFrame
        The MovieLens ratings dataset.
    """
    if data_dir_path is None:
        data_dir_path = _get_data_dir_path(data_dir_path)
        dirname = 'ml-' + variant
        filename = VARIANTS[variant]['filename']
        csv_path = os.path.join(data_dir_path, dirname, filename)
        zip_path = os.path.join(data_dir_path, dirname) + '.zip'
        url = 'http://files.grouplens.org/datasets/movielens/ml-' + variant + \
              '.zip'
    else:
        csv_path = data_dir_path

    if os.path.exists(csv_path):
        # Return data loaded into a DataFrame
        df = _ml_ratings_csv_to_df(csv_path, variant)
        return df

    elif os.path.exists(zip_path):
        # Unzip file before calling back itself
        if verbose:
            print('Unzipping data...')

        with zipfile.ZipFile(zip_path, 'r') as zf:
            zf.extractall(data_dir_path)

        if variant == '10m':
            os.rename(os.path.join(data_dir_path, 'ml-10M100K'),
                      os.path.join(data_dir_path, dirname))

        os.remove(zip_path)

        return fetch_ml_ratings(variant=variant, verbose=verbose)

    else:
        # Download the ZIP file before calling back itself
        if verbose:
            print('Downloading data...')

        with urllib.request.urlopen(url) as r, open(zip_path, 'wb') as f:
            shutil.copyfileobj(r, f)

        return fetch_ml_ratings(variant=variant, verbose=verbose)


def _get_data_dir_path(data_dir_path=None):
    """Returns the path of the funk-svd data directory.

    This folder is used to store large datasets to avoid downloading them
    several times.
    By default the data dir is set to a folder named 'funk_svd_data' in the
    user home folder. Alternatively, it can be set by the `FUNK_SVD_DATA`
    environment variable or programmatically by giving an explicit
    `data_dir_path`.
    If the folder does not already exist, it is automatically created.

    Parameters
    ----------
    data_dir_path : str, default=None
        Explicit data directory path for large datasets.

    Returns
    -------
    data_dir_path: str
        Explicit data directory path for large datasets.

    """
    if data_dir_path is None:
        default = os.path.join('~', 'funk_svd_data')
        data_dir_path = os.environ.get('FUNK_SVD_DATA', default=default)
        data_dir_path = os.path.expanduser(data_dir_path)

    if not os.path.exists(data_dir_path):
        os.makedirs(data_dir_path)

    return data_dir_path


def _ml_ratings_csv_to_df(csv_path, variant):
    names = ['u_id', 'i_id', 'rating', 'timestamp']
    dtype = {'u_id': np.uint32, 'i_id': np.uint32, 'rating': np.float64}

    def date_parser(time):
        return datetime.datetime.fromtimestamp(float(time))

    df = pd.read_csv(csv_path, names=names, dtype=dtype, header=0,
                     sep=VARIANTS[variant]['sep'], parse_dates=['timestamp'],
                     date_parser=date_parser, engine='python')

    df.sort_values(by='timestamp', inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df

In [68]:
df = fetch_ml_ratings(variant="100k")
train = df.sample(frac=0.8, random_state=7)

item_ids = train["i_id"].unique().tolist()
user_ids = train["u_id"].unique().tolist()

user_n = len(user_ids)
item_n = len(item_ids)

user_idx = range(user_n)
item_idx = range(item_n)

user_mapping = dict(zip(user_ids, user_idx))
item_mapping = dict(zip(item_ids, item_idx))

train["u_id"] = train["u_id"].map(user_mapping)
train["i_id"] = train["i_id"].map(item_mapping)

train["u_id"] = train["u_id"].astype(np.int32)
train["i_id"] = train["i_id"].astype(np.int32)
train.fillna(-1, inplace=True)
print(train.dtypes)
np_stuff = train[["i_id", "u_id", "rating"]].values
np_stuff


  df = pd.read_csv(csv_path, names=names, dtype=dtype, header=0,


u_id                  int32
i_id                  int32
rating              float64
timestamp    datetime64[ns]
dtype: object


dtype('float64')

In [76]:
F = 100

E = np.random.normal(0, .1, (user_n, F))
A = np.random.normal(0, .1, (F, item_n))
E.shape, A.shape

d = E @ A
d.shape

error = np.zeros(d.shape, dtype=np.float32)
print(np_stuff.shape)
for i in range (np_stuff.shape[0]):
    x, y = int(np_stuff[i, 0]), int(np_stuff[i, 1])
    le = (np_stuff[i,2] - d[x, y])**2
    for j in range(F):
        E[x, j] += 0.0001
    print(le)
    




(79999, 3)
4.035378273836342
24.518273663987834
17.987620117104093
15.655149049646715
17.061969535822023
17.286454584563366
3.6928887258552257
1.0191517253725548
17.095138245814567
0.7344106969243462
15.597761299888026
16.41145058999452
24.545633370181307
8.856476672336402
9.02609937388855
16.264111871593585
17.24751701862483
10.018701115067104
15.84121556589349
4.083234423925735
24.658895782024405
26.925636929150105
4.558714286088849
15.398370889180633
0.7608700981866439
9.299034253181128
16.1387868516532
8.590845218175964
9.192704826460014
15.149915691565823
14.908929204860732
8.566733573382278
7.961815608066497
15.10012316648499
8.931608912571582
3.88985921393833
15.670226982893297
15.53696601993761
16.152288029421978
15.49280995805484
8.428822029923088
1.2167700876122336
23.512417446192295
14.946644362535737
9.03027114816085
15.893465406606007
16.82264274851476
16.809850551100656
24.614747417245038
18.19412011017033
3.6363272116720995
24.856991347134944
24.90685397756854
0.80213842

IndexError: index 943 is out of bounds for axis 0 with size 943