Skip to content

Commit

Permalink
I hate writing commit messages...
Browse files Browse the repository at this point in the history
  • Loading branch information
awarebayes committed Aug 7, 2020
1 parent af0f05c commit 742fc34
Show file tree
Hide file tree
Showing 7 changed files with 110 additions and 99 deletions.
110 changes: 20 additions & 90 deletions examples/[Library Basics]/2. Different Pandas Backends.ipynb
Expand Up @@ -45,83 +45,23 @@
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"execution_count": 3,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "dfdff54fdef24df59d3027095d099ec7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=20000263.0), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "00aaa1f880114a7ea1b5f00a3ce8a46c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=20000263.0), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "04fadc9450294367aed2aba5e5257bd5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=138493.0), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"CPU times: user 57.4 s, sys: 2.15 s, total: 59.5 s\n",
"Wall time: 59.6 s\n"
]
"name": "stderr",
"text": "100%|██████████| 20000263/20000263 [00:13<00:00, 1469488.15it/s]\n100%|██████████| 20000263/20000263 [00:15<00:00, 1265183.17it/s]\n100%|██████████| 138493/138493 [00:06<00:00, 19935.53it/s]\nCPU times: user 41.6 s, sys: 1.89 s, total: 43.5 s\nWall time: 43.5 s\n"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<recnn.data.env.FrameEnv at 0x7fa50c5a1490>"
]
"text/plain": "<recnn.data.env.FrameEnv at 0x7f28bd9fe7c0>"
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
"execution_count": 3
}
],
"source": [
Expand Down Expand Up @@ -160,19 +100,15 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2020-08-07 21:24:00,759\tINFO resource_spec.py:204 -- Starting Ray with 7.62 GiB memory available for workers and up to 3.81 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).\n",
"2020-08-07 21:24:01,002\tWARNING services.py:923 -- Redis failed to start, retrying now.\n",
"2020-08-07 21:24:01,223\tINFO services.py:1163 -- View the Ray dashboard at \u001b[1m\u001b[32mlocalhost:8265\u001b[39m\u001b[22m\n"
]
"name": "stderr",
"text": "2020-08-07 23:31:23,301\tINFO resource_spec.py:204 -- Starting Ray with 6.88 GiB memory available for workers and up to 3.46 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).\n2020-08-07 23:31:23,522\tWARNING services.py:923 -- Redis failed to start, retrying now.\n2020-08-07 23:31:23,742\tINFO services.py:1163 -- View the Ray dashboard at \u001b[1m\u001b[32mlocalhost:8265\u001b[39m\u001b[22m\n"
}
],
"source": [
Expand All @@ -188,29 +124,23 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 138493/138493 [00:07<00:00, 18951.36it/s]\n",
"CPU times: user 12.1 s, sys: 1.4 s, total: 13.5 s\n",
"Wall time: 20.8 s\n"
]
"name": "stderr",
"text": "100%|██████████| 138493/138493 [00:07<00:00, 19253.50it/s]\nCPU times: user 12.1 s, sys: 1.37 s, total: 13.5 s\nWall time: 21.5 s\n"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<recnn.data.env.FrameEnv at 0x7fafa57f8cd0>"
]
"text/plain": "<recnn.data.env.FrameEnv at 0x7f221581cbe0>"
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
"execution_count": 4
}
],
"source": [
Expand Down Expand Up @@ -306,9 +236,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.8.5-final"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}
72 changes: 72 additions & 0 deletions examples/[Library Basics]/temp.ipynb
@@ -0,0 +1,72 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5-final"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python38564bitfba12b29602d49fd94d253df959599f4",
"display_name": "Python 3.8.5 64-bit"
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# == recnn ==\n",
"import sys\n",
"sys.path.append(\"../../\")\n",
"import recnn"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# but you can also set it directly:\n",
"recnn.pd.set(\"pandas\")\n",
"frame_size = 10\n",
"batch_size = 25"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "'/tmp'"
},
"metadata": {},
"execution_count": 5
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
]
}
1 change: 1 addition & 0 deletions recnn/__init__.py
@@ -1 +1,2 @@
from recnn import data, utils, nn, rep
from recnn.data import pd
1 change: 1 addition & 0 deletions recnn/data/__init__.py
Expand Up @@ -2,3 +2,4 @@
from .utils import *
from .env import *
from .dataset_functions import *
from .pandas_backend import pd
16 changes: 12 additions & 4 deletions recnn/data/dataset_functions.py
@@ -1,5 +1,5 @@
from recnn.data.utils import make_items_tensor

from .pandas_backend import pd
"""
What?
+++++
Expand Down Expand Up @@ -41,6 +41,11 @@ def prepare_dataset(**kwargs):
build_data_pipeline function, and it is passed down the function chain. If needed, it will be used. Otherwise, ignored
"""

def try_progress_apply(dataframe, function):
    """Apply *function* over *dataframe*, using tqdm's ``progress_apply``
    when it has been registered (via ``tqdm.pandas()``), and falling back
    to the plain ``apply`` otherwise (e.g. on modin frames or when tqdm
    was never initialised).

    :param dataframe: a pandas/modin Series, DataFrame, or GroupBy object.
    :param function: callable passed straight through to ``(progress_)apply``.
    :return: whatever the underlying ``apply`` call returns.
    """
    # Resolve the method first instead of wrapping the whole call in
    # try/except AttributeError: the old form also swallowed AttributeErrors
    # raised *inside* `function`, silently re-running it (and any side
    # effects) a second time through the plain `apply` path.
    apply_method = getattr(dataframe, "progress_apply", None)
    if apply_method is None:
        apply_method = dataframe.apply
    return apply_method(function)

def prepare_dataset(df, key_to_id, frame_size, env, sort_users=False, **kwargs):

Expand All @@ -49,14 +54,17 @@ def prepare_dataset(df, key_to_id, frame_size, env, sort_users=False, **kwargs):
[1, 34, 123, 2000], recnn makes it look like [0,1,2,3] for you.
"""

df['rating'] = df['rating'].progress_apply(lambda i: 2 * (i - 2.5))
df['movieId'] = df['movieId'].progress_apply(lambda i: key_to_id.get(i))
df['rating'] = try_progress_apply(df['rating'], lambda i: 2 * (i - 2.5))
df['movieId'] = try_progress_apply(df['movieId'], lambda i: key_to_id.get(i))

users = df[['userId', 'movieId']].groupby(['userId']).size()
users = users[users > frame_size]
if sort_users:
users = users.sort_values(ascending=False)
users = users.index

if pd.get_type() == "modin":
df = df._to_pandas()
ratings = df.sort_values(by='timestamp').set_index('userId').drop('timestamp', axis=1).groupby('userId')

# Groupby user
Expand All @@ -68,7 +76,7 @@ def app(x):
user_dict[int(userid)]['items'] = x['movieId'].values
user_dict[int(userid)]['ratings'] = x['rating'].values

ratings.progress_apply(app)
try_progress_apply(ratings, app)

env.user_dict = user_dict
env.users = users
Expand Down
4 changes: 2 additions & 2 deletions recnn/data/env.py
@@ -1,6 +1,6 @@
from . import utils, dataset_functions as dset_F
from .pandas_backend import pd
import pickle
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
Expand Down Expand Up @@ -101,7 +101,7 @@ def __init__(self, embeddings, ratings, test_size=0.05, min_seq_size=10,
self.embeddings = movies_embeddings_tensor
self.key_to_id = key_to_id
self.id_to_key = id_to_key
self.ratings = pd.read_csv(ratings)
self.ratings = pd.get().read_csv(ratings)

self.user_dict = None
self.users = None # filtered keys of user_dict
Expand Down
5 changes: 2 additions & 3 deletions recnn/data/utils.py
@@ -1,7 +1,7 @@

import numpy as np
import torch
import pandas as pd
from .pandas_backend import pd

# helper function similar to pandas.Series.rolling
def rolling_window(a, window):
Expand Down Expand Up @@ -112,8 +112,7 @@ def padder(x):


def sort_users_itemwise(user_dict, users):
    """Return *users* reordered by how many items each user has, most first.

    :param user_dict: mapping user id -> per-user dict holding an ``'items'``
        array (only ``.shape[0]`` is read here).
    :param users: iterable of user ids (keys into *user_dict*).
    :return: an Index of user ids sorted by descending item count.
    """
    # Dict comprehension instead of dict([...]) over a list of pairs.
    item_counts = {uid: user_dict[uid]['items'].shape[0] for uid in users}
    # pd.get() resolves the active backend (pandas or modin) at call time.
    return pd.get().Series(item_counts).sort_values(ascending=False).index

def prepare_batch_dynamic_size(batch, item_embeddings_tensor, embed_batch=None):
item_idx, ratings_t, sizes_t, users_t = get_irsu(batch)
Expand Down

0 comments on commit 742fc34

Please sign in to comment.