# Setup

In [1]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
%cd gdrive/MyDrive/News+LPReports/Graph/guide

/content/gdrive/MyDrive/News+LPReports/Graph/guide


- check pytorch and CUDA version
- install the relevant packages with those versions
- then install pytorch geometric

https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html

In [3]:
!python -c "import torch; print(torch.__version__)"

1.9.0+cu102


In [4]:
!python -c "import torch; print(torch.version.cuda)"

10.2


In [5]:
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-geometric

Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html


In [17]:
import numpy as np
import pandas as pd
import pickle
import csv
import os
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Load data

In [7]:
np.random.seed(42)

In [8]:
df = pd.read_csv('yoochoose-clicks.dat',
            header=None, sep=',', engine='python', nrows=100000)

df.head()

Unnamed: 0,0,1,2,3
0,1,2014-04-07T10:51:09.277Z,214536502,0
1,1,2014-04-07T10:54:09.868Z,214536500,0
2,1,2014-04-07T10:54:46.998Z,214536506,0
3,1,2014-04-07T10:57:00.306Z,214577561,0
4,2,2014-04-07T13:56:37.614Z,214662742,0


In [10]:
df.columns=['session_id','timestamp','item_id','category']
df.head(3)

Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:51:09.277Z,214536502,0
1,1,2014-04-07T10:54:09.868Z,214536500,0
2,1,2014-04-07T10:54:46.998Z,214536506,0


In [11]:
df['valid_session'] = df.session_id.map(df.groupby('session_id')['item_id'].size() > 2)
df = df.loc[df.valid_session].drop('valid_session',axis=1)

df.head(3)

Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:51:09.277Z,214536502,0
1,1,2014-04-07T10:54:09.868Z,214536500,0
2,1,2014-04-07T10:54:46.998Z,214536506,0


In [13]:
item_encoder = LabelEncoder()
df['item_id'] = item_encoder.fit_transform(df.item_id)
df.head(3)

Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:51:09.277Z,622,0
1,1,2014-04-07T10:54:09.868Z,621,0
2,1,2014-04-07T10:54:46.998Z,623,0


In [14]:
df['label'] = False
df.head(3)

Unnamed: 0,session_id,timestamp,item_id,category,label
0,1,2014-04-07T10:51:09.277Z,622,0,False
1,1,2014-04-07T10:54:09.868Z,621,0,False
2,1,2014-04-07T10:54:46.998Z,623,0,False


In [25]:
# process by session_id
grouped = df.groupby('session_id')
grouped.head(3)

Unnamed: 0,session_id,timestamp,item_id,category,label
0,1,2014-04-07T10:51:09.277Z,622,0,False
1,1,2014-04-07T10:54:09.868Z,621,0,False
2,1,2014-04-07T10:54:46.998Z,623,0,False
4,2,2014-04-07T13:56:37.614Z,4241,0,False
5,2,2014-04-07T13:57:19.373Z,4241,0,False
...,...,...,...,...,...
99986,31814,2014-04-06T21:05:35.039Z,1723,0,False
99987,31814,2014-04-06T21:07:42.927Z,2435,0,False
99995,31812,2014-04-01T17:13:14.184Z,4253,0,False
99996,31812,2014-04-01T17:13:49.017Z,8233,0,False


In [26]:
for session_id, group in tqdm(grouped):
    print('Group:')
    print(group)
    sess_item_id = LabelEncoder().fit_transform(group.item_id)
    print('session ID:')
    print(sess_item_id)
    group = group.reset_index(drop=True)
    group['sess_item_id'] = sess_item_id
    print(group)
    node_features = group.loc[group.session_id==session_id,['sess_item_id','item_id']].sort_values('sess_item_id').item_id.drop_duplicates().values
    print('Node features:')
    print(node_features)
    node_features = torch.LongTensor(node_features).unsqueeze(1)
    print(node_features)
    target_nodes = group.sess_item_id.values[1:]
    print('Target nodes:')
    print(target_nodes)
    source_nodes = group.sess_item_id.values[:-1]
    print('Source nodes:')
    print(source_nodes)
    edge_index = torch.tensor([source_nodes,
                            target_nodes], dtype=torch.long)
    print(edge_index)
    x = node_features

    y = torch.FloatTensor([group.label.values[0]])

    data = Data(x=x, edge_index=edge_index, y=y)
    data_list.append(data)

  0%|          | 0/13670 [00:00<?, ?it/s]

Group:
   session_id                 timestamp  item_id  category  label
0           1  2014-04-07T10:51:09.277Z      622         0  False
1           1  2014-04-07T10:54:09.868Z      621         0  False
2           1  2014-04-07T10:54:46.998Z      623         0  False
3           1  2014-04-07T10:57:00.306Z     2050         0  False
session ID:
[1 0 2 3]
   session_id                 timestamp  item_id  category  label  sess_item_id
0           1  2014-04-07T10:51:09.277Z      622         0  False             1
1           1  2014-04-07T10:54:09.868Z      621         0  False             0
2           1  2014-04-07T10:54:46.998Z      623         0  False             2
3           1  2014-04-07T10:57:00.306Z     2050         0  False             3
Node features:
[ 621  622  623 2050]
tensor([[ 621],
        [ 622],
        [ 623],
        [2050]])
Target nodes:
[0 2 3]
Source nodes:
[1 0 2]
tensor([[1, 0, 2],
        [0, 2, 3]])





NameError: ignored