In [1]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from torch_geometric.nn import Node2Vec

import os
import json
import csv 
import time

import torch
import pandas as pd
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
import torch_geometric.transforms as T

In [2]:
# PyTorch version of the environment this notebook was executed in
torch.__version__

'1.10.2'

In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
# show which device was selected above
device

'cuda'

In [None]:
# Directory holding the input files. It is prepended directly to file names in
# f-strings below, so the trailing slash is required.
DATA_PATH = '../data/'

Data and Best Performing Model Loading

In [8]:
# Best Node2Vec hyperparameters found during tuning.
# WALK is passed as walk_length and P as p when the model is rebuilt below;
# Q is presumably the matching value for Node2Vec's q.
WALK, P, Q = 50, 1, 1

In [5]:
# load index dictionary
# reddit_dict is the name -> node id lookup used below for both the edge list and
# the ideology table. The encoding is pinned to UTF-8 (the JSON standard) so that
# names decode the same way regardless of the platform's default encoding.
with open(f'{DATA_PATH}reddit_index.json', encoding='utf-8') as f:
    reddit_dict = json.load(f)

In [6]:
# Build the subreddit-to-domain graph as a PyTorch Geometric Data object.
# The CSV has no header row, so its columns are addressed by position:
# 0 = source name, 1 = target name, 2 = edge weight.
df = pd.read_csv(f'{DATA_PATH}reddit_subreddit_to_domain__gt-01-urls.csv', header=None)

# Translate names into node ids; a name missing from reddit_dict raises KeyError.
source_nodes = df[0].map(lambda name: reddit_dict[name]).tolist()
target_nodes = df[1].map(lambda name: reddit_dict[name]).tolist()
weight = df[2].tolist()

# NOTE(review): this counts the distinct ids that appear in the edge list. It only
# equals "max id + 1" if those ids are contiguous from 0 -- confirm against
# reddit_index.json.
num_nodes = len(set(source_nodes) | set(target_nodes))

edge_index = torch.tensor([source_nodes, target_nodes])
edge_attr = torch.tensor(weight).unsqueeze(1)  # one weight column per edge
data = Data(edge_index=edge_index, edge_attr=edge_attr)
data.num_nodes = num_nodes

# Make the graph undirected.
transform = T.ToUndirected()
data = transform(data)

In [7]:
# Domain ideology scores, used as the regression target for evaluation.
domain_ideology = pd.read_csv(f'{DATA_PATH}robertson_et_al.csv')[['domain', 'score']].copy()

# Attach the graph node id to each domain; domains that are not in reddit_dict
# get a missing id and are dropped.
domain_ideology['id'] = domain_ideology['domain'].map(reddit_dict.get)
domain_ideology = domain_ideology.dropna(subset=['id']).reset_index(drop=True)
domain_ideology['id'] = domain_ideology['id'].astype('int64')

# 80/20 train/test split, then an 80/20 split of the training part into
# train_sub/val. The index is unique after reset_index, so dropping the sampled
# labels yields the complement.
train = domain_ideology.sample(frac=0.8, random_state=42)
test = domain_ideology.drop(index=train.index)
train_sub = train.sample(frac=0.8, random_state=24)
val = train.drop(index=train_sub.index)

# node ids (inputs) and ideology scores (targets) as plain lists
train_x = train_sub['id'].tolist()
train_y = train_sub['score'].tolist()
val_x = val['id'].tolist()
val_y = val['score'].tolist()

In [13]:
# model specification
# Rebuild Node2Vec with the tuned hyperparameters and restore the trained weights.
MODEL_PATH = 'results/jupyter_v100_1.pth'

# The state dict is loaded into a DataParallel-wrapped model (the wrapper is
# removed again in the next cell), so the same wrapper is recreated here.
# p and q are taken from their own constants, P and Q, so that tuning either one
# actually reaches the model.
model = torch.nn.DataParallel(Node2Vec(data.edge_index, embedding_dim=128,
                 walk_length=WALK, context_size=10, walks_per_node=10,
                 num_negative_samples=1, p=P, q=Q, sparse=True))

# Deserialize onto the CPU first so a checkpoint saved on a GPU can also be restored
# on a CPU-only machine; load_state_dict then copies the values into the model's
# own parameters.
model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))

<All keys matched successfully>

In [14]:
# remove DataParallel wrapper
# Unwrap only while the wrapper is still present, so re-running this cell is a
# no-op instead of raising AttributeError on the already-unwrapped model.
if isinstance(model, torch.nn.DataParallel):
    model = model.module

In [15]:
# Embed the training nodes.
# Put the model in eval mode and call it without arguments; the result is
# indexed by node id below.
model.eval()
z = model()

# Detach from autograd, pick the rows of the training node ids, and hand them to
# scikit-learn as a NumPy array.
train_X = z.detach()[train_x].cpu().numpy()

Regressor Specification and Hyperparameter Tuning

In [16]:
# Candidate regressors for predicting the ideology score from the embeddings.
# The two ensemble models share a fixed seed so the search is repeatable.
seed = 42
clf1 = Ridge()
clf2 = RandomForestRegressor(random_state=seed)
clf3 = GradientBoostingRegressor(random_state=seed)

In [17]:
# One search grid per regressor. The 'clf' entry swaps the pipeline step itself,
# and the 'clf__*' entries set hyperparameters on whichever estimator is plugged in.
param1 = {
    'clf__alpha': [0.001, 0.01, 0.1, 1],
    'clf': [clf1],
}

param2 = {
    'clf__n_estimators': [100, 200, 500],
    'clf': [clf2],
}

param3 = {
    'clf__learning_rate': [0.001, 0.01, 0.1, 1],
    'clf__n_estimators': [50, 100, 200],
    'clf': [clf3],
}

In [18]:
# Single-step pipeline; clf1 is only a placeholder because every grid replaces
# the 'clf' step with its own estimator.
pipeline = Pipeline(steps=[('clf', clf1)])
params = [param1, param2, param3]

# 3-fold grid search over all three grids, scored by negative MSE
# (higher is better, so the best model has the score closest to zero).
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=3,
)
gs.fit(train_X, train_y)
gs.best_params_

Fitting 3 folds for each of 19 candidates, totalling 57 fits
[CV 1/3] END ....clf=Ridge(), clf__alpha=0.001;, score=-0.133 total time=   0.5s
[CV 2/3] END ....clf=Ridge(), clf__alpha=0.001;, score=-0.125 total time=   0.0s
[CV 3/3] END ....clf=Ridge(), clf__alpha=0.001;, score=-0.130 total time=   0.0s
[CV 1/3] END .....clf=Ridge(), clf__alpha=0.01;, score=-0.133 total time=   0.0s
[CV 2/3] END .....clf=Ridge(), clf__alpha=0.01;, score=-0.125 total time=   0.0s
[CV 3/3] END .....clf=Ridge(), clf__alpha=0.01;, score=-0.130 total time=   0.0s
[CV 1/3] END ......clf=Ridge(), clf__alpha=0.1;, score=-0.133 total time=   0.0s
[CV 2/3] END ......clf=Ridge(), clf__alpha=0.1;, score=-0.125 total time=   0.0s
[CV 3/3] END ......clf=Ridge(), clf__alpha=0.1;, score=-0.130 total time=   0.0s
[CV 1/3] END ........clf=Ridge(), clf__alpha=1;, score=-0.133 total time=   0.0s
[CV 2/3] END ........clf=Ridge(), clf__alpha=1;, score=-0.125 total time=   0.0s
[CV 3/3] END ........clf=Ridge(), clf__alpha=1;,

{'clf': GradientBoostingRegressor(random_state=42),
 'clf__learning_rate': 0.1,
 'clf__n_estimators': 100}

In [21]:
# full cross-validation record for every candidate (fit/score timings, parameters, test scores)
gs.cv_results_

{'mean_fit_time': array([1.85256243e-01, 4.35439746e-03, 4.14419174e-03, 4.15094694e-03,
        3.75195276e+01, 7.48417801e+01, 1.87183178e+02, 8.24913025e+00,
        1.64839873e+01, 3.29581807e+01, 8.19788869e+00, 1.63651338e+01,
        3.27300288e+01, 8.20304958e+00, 1.65780047e+01, 3.34732561e+01,
        8.37802831e+00, 1.67594554e+01, 3.35824307e+01]),
 'std_fit_time': array([2.55382811e-01, 1.92237570e-04, 8.66311768e-05, 1.40265297e-04,
        1.42322220e-01, 9.05631892e-02, 3.35675957e-01, 2.79675177e-02,
        4.26037867e-02, 8.52930279e-02, 1.06145289e-02, 3.65235653e-03,
        3.41887919e-02, 8.65824729e-03, 3.19386879e-02, 3.33872816e-02,
        4.24876155e-03, 1.56006827e-02, 1.45948614e-02]),
 'mean_score_time': array([0.00105532, 0.00082548, 0.00082143, 0.00086347, 0.0706474 ,
        0.13901114, 0.34678618, 0.00347996, 0.00588465, 0.01023769,
        0.00380055, 0.00641163, 0.01150529, 0.00383854, 0.00594743,
        0.00918651, 0.00328191, 0.00517225, 0.008637

Saving Tuning Results

In [22]:
# One row per grid-search candidate: its parameters plus the mean CV error.
df = pd.DataFrame(gs.cv_results_["params"])
# The scorer reports negative MSE; flip the sign to get a plain MSE.
df["mse"] = -gs.cv_results_["mean_test_score"]
# Keep only the estimator's class rather than the estimator instance.
df["clf"] = df["clf"].map(type)

In [23]:
# save tuning results
# The path is relative to the current working directory. to_csv writes the row
# index as the first column by default.
df.to_csv('gs_cv_results.csv')

In [24]:
# display the tuning results table
df

Unnamed: 0,clf,clf__alpha,clf__n_estimators,clf__learning_rate,mse
0,<class 'sklearn.linear_model._ridge.Ridge'>,0.001,,,0.129337
1,<class 'sklearn.linear_model._ridge.Ridge'>,0.01,,,0.129336
2,<class 'sklearn.linear_model._ridge.Ridge'>,0.1,,,0.129325
3,<class 'sklearn.linear_model._ridge.Ridge'>,1.0,,,0.12922
4,<class 'sklearn.ensemble._forest.RandomForestR...,,100.0,,0.130918
5,<class 'sklearn.ensemble._forest.RandomForestR...,,200.0,,0.130535
6,<class 'sklearn.ensemble._forest.RandomForestR...,,500.0,,0.130283
7,<class 'sklearn.ensemble._gb.GradientBoostingR...,,50.0,0.001,0.185842
8,<class 'sklearn.ensemble._gb.GradientBoostingR...,,100.0,0.001,0.182913
9,<class 'sklearn.ensemble._gb.GradientBoostingR...,,200.0,0.001,0.177722
