In [1]:
import io, json
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn import preprocessing
from sklearn.model_selection import train_test_split as sklearn_train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

In [2]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 5.2 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1623285 sha256=482a0e3b272c25b32761a5fc6a4fc0e0db815d7dac3840148be122e23449b764
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Import the following modules from the Surprise 
from surprise import SVD
from surprise import SVDpp
from surprise import SlopeOne
from surprise import NMF
from surprise import NormalPredictor
from surprise import KNNBaseline
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import BaselineOnly
from surprise import CoClustering
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise import Reader

In [7]:
df=pd.read_csv("/content/drive/MyDrive/Data Mining Data/datamining_data.csv")
df["id"]=df.id.astype(int)

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,condition,end,id_x,start,successful,therapy,cured,diagnosed,isCured,isTreated,kind,id
0,0,pc3,20120109,tr1,20111219,86.0,Th49,20120404,20111218,True,True,Cond240,0
1,1,pc3,20120217,tr2,20120203,10.0,Th45,20120404,20111218,True,True,Cond240,0
2,2,pc3,20120404,tr3,20120330,100.0,Th45,20120404,20111218,True,True,Cond240,0
3,3,pc4,19650727,tr4,19650714,100.0,Th17,19650727,19650601,True,True,Cond39,0
4,4,pc5,19731019,tr5,19730919,100.0,Th47,19731019,19730915,True,True,Cond309,0


In [9]:
data_p=df[["id","kind","therapy","successful"]]

In [10]:
data_p["id"] = data_p["id"].astype(str) + "-" + data_p["kind"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [11]:
data_p=data_p[["id","therapy","successful"]]

In [12]:
def ids_encoder(data):
    users = sorted(data['id'].unique())
    items = sorted(data['therapy'].unique())

    # create users and items encoders
    uencoder = LabelEncoder()
    iencoder = LabelEncoder()

    # fit users and items ids to the corresponding encoder
    uencoder.fit(users)
    iencoder.fit(items)

    # encode userids and itemids
    data.id = uencoder.transform(data.id.tolist())
    data.therapy = iencoder.transform(data.therapy.tolist())

    return data, uencoder, iencoder

In [None]:
data_p, uencoder, iencoder= ids_encoder(data_p)

In [None]:
data_p=data_p.astype(np.uint8)

In [None]:
data_p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1027030 entries, 0 to 1027029
Data columns (total 3 columns):
 #   Column      Non-Null Count    Dtype
---  ------      --------------    -----
 0   id          1027030 non-null  uint8
 1   therapy     1027030 non-null  uint8
 2   successful  1027030 non-null  uint8
dtypes: uint8(3)
memory usage: 2.9 MB


In [None]:
# Rating scale to be used as part of the process of using Surprise to predict ratings ("Scores" in this case)
reader = Reader(rating_scale=(0, 100))

# Convert the pandas dataframe into a Surprise dataset using the load_from_df() method
# The 'UserId', 'ProductId', and ratings ('Score' in this case) are passed in the exact order
data = Dataset.load_from_df(data_p[['id', 'therapy', 'successful']], reader)

In [None]:
data

<surprise.dataset.DatasetAutoFolds at 0x1d9991a9a00>

In [None]:
from surprise.model_selection import train_test_split
from surprise import accuracy

# 'user_based' in the similarity option (sim_options) is set to True in order to compute a User based collaborative filtering.
sim_options= {'name':'cosine', 
              'user_based':True
             } 

algo = KNNBaseline(sim_options=sim_options)

In [None]:
trainset, testset = train_test_split(data, test_size=0.04)
# Fit algorithm to the training set
algo.fit(trainset)
  # Predict ratings for the test set
test_set_predictions=algo.test(testset)
# Calculate the root mean squared error(RMSE)
print(algo)
accuracy.rmse(test_set_predictions)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
<surprise.prediction_algorithms.knns.KNNBaseline object at 0x000001D99E8C9190>
RMSE: 34.0307


34.03066775390996

In [None]:
trainset, testset = train_test_split(data, test_size=0.04)
# Fit algorithm to the training set
algo.fit(trainset)
  # Predict ratings for the test set
test_set_predictions=algo.test(testset)
# Calculate the root mean squared error(RMSE)
print(algo)
accuracy.rmse(test_set_predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
<surprise.prediction_algorithms.knns.KNNWithMeans object at 0x000001D9A0A58F70>
RMSE: 34.0050


34.00498747929172

## Experiment 2 patients  selected who had at least one trial for same condition and whose most recent trial had ended with a higher succesful rate

In [13]:
data_exp=data_p["id"]+","+data_p["therapy"]
data_exp=pd.DataFrame(data_exp)
data_exp["successful"]=data_p["successful"]
data_exp.columns=["id","successful"]
data_exp_1=pd.DataFrame(data_exp.groupby("id").successful.max()).reset_index()
dt=data_exp_1["id"].str.split(",", n = 1, expand = True)
dt["successful"]=data_exp_1["successful"]
dt.columns=["id","therapy","successful"]

In [14]:
dt, uencoder, iencoder= ids_encoder(dt)

In [15]:
dt=dt.astype(np.uint8)

In [16]:
#data=dt.sample(50000)

In [17]:
# Rating scale to be used as part of the process of using Surprise to predict ratings ("Scores" in this case)
reader = Reader(rating_scale=(0, 100))

# Convert the pandas dataframe into a Surprise dataset using the load_from_df() method
# The 'UserId', 'ProductId', and ratings ('Score' in this case) are passed in the exact order
data = Dataset.load_from_df(dt[['id', 'therapy', 'successful']], reader)

In [18]:
from surprise.model_selection import train_test_split
from surprise import accuracy

# 'user_based' in the similarity option (sim_options) is set to True in order to compute a User based collaborative filtering.
sim_options= {'name':'cosine', 
              'user_based':True
             } 

In [19]:
algo = KNNBaseline(sim_options=sim_options)
trainset, testset = train_test_split(data, test_size=0.20)
# Fit algorithm to the training set
algo.fit(trainset)
  # Predict ratings for the test set
test_set_predictions=algo.test(testset)
# Calculate the root mean squared error(RMSE)
print(algo)
accuracy.rmse(test_set_predictions)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
<surprise.prediction_algorithms.knns.KNNBaseline object at 0x7f8959a368d0>
RMSE: 34.0650


34.06496052712595

In [20]:
algo = KNNWithMeans(sim_options=sim_options)
trainset, testset = train_test_split(data, test_size=0.20)
# Fit algorithm to the training set
algo.fit(trainset)
  # Predict ratings for the test set
test_set_predictions=algo.test(testset)
# Calculate the root mean squared error(RMSE)
print(algo)
accuracy.rmse(test_set_predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
<surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7f89541c5b50>
RMSE: 34.0497


34.04968583148692