In [380]:
## imports ##
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn import svm, decomposition, tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn import metrics
from sklearn.model_selection import learning_curve, GridSearchCV, cross_val_score, validation_curve
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_selection import RFECV, SelectKBest, chi2
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
%matplotlib inline

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas / scikit-learn.
warnings.simplefilter("ignore")
# Seed NumPy's global random number generator so stochastic steps are repeatable.
np.random.seed(100)

In [391]:
# Read the three walking feature tables; rows containing any missing value
# are discarded after the index has been renumbered.
walking_data_v1 = (
    pd.read_csv("../Data/MAX_WALKING_TRAINING_DATA.csv")
    .reset_index(drop = True)
    .dropna()
)
walking_data_v2 = (
    pd.read_csv("../Data/MAX_WALKING_TRAINING_DATA_V2.csv")
    .reset_index(drop = True)
    .dropna()
)
walking_data_passive = (
    pd.read_csv("../Data/MAX_WALKING_TRAINING_DATA_PASSIVE.csv")
    .reset_index(drop = True)
    .dropna()
)

# Stack V1 on top of V2. The second dropna() removes rows that gained NaNs
# from columns present in only one of the two tables.
v1_and_v2 = pd.concat([walking_data_v1, walking_data_v2])
data = v1_and_v2.dropna().reset_index(drop = True)

# Finding Similar Users in mPowerV1 vs mPowerV2

- For each healthCode, find the closest other healthCode under a given distance metric
- Similarity is assessed with a k-nearest-neighbours (KNN) search
- The distance from each healthCode to its nearest neighbour is stored in a DataFrame

In [393]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

In [394]:
def getKNN_user2user(data, no_neighbors, distance_metrics, target_healthcodes = None):
    """Pair each healthCode with its nearest other healthCode in feature space.

    Every column of ``data`` except ``healthCode`` is min-max scaled and used
    as a feature. For each row the closest *other* row is looked up with a
    k-nearest-neighbours search. Only pairs whose query healthCode is NOT in
    ``target_healthcodes`` and whose matched healthCode IS in it are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``healthCode`` column; all remaining columns are passed
        to ``MinMaxScaler``.
    no_neighbors : int
        ``n_neighbors`` for ``NearestNeighbors``. Must be at least 2, because
        the query row normally appears in its own neighbour list.
    distance_metrics : str
        Metric name forwarded to ``NearestNeighbors(metric=...)``.
    target_healthcodes : array-like, optional
        healthCodes that a match must belong to (and a query must not).
        Defaults to the unique healthCodes of the notebook-level
        ``walking_data_v2`` frame, which is what the original code used.

    Returns
    -------
    pandas.DataFrame
        Indexed by query healthCode, with columns ``distance`` and
        ``healthCode`` (the matched code), sorted by ``distance`` descending.
        If a healthCode occurs on several rows of ``data``, the last row wins.

    Raises
    ------
    ValueError
        If ``no_neighbors`` is smaller than 2.
    """
    if no_neighbors < 2:
        raise ValueError("no_neighbors must be at least 2 (the query row plus one neighbour)")
    if target_healthcodes is None:
        # Backward-compatible default: fall back to the notebook global.
        target_healthcodes = walking_data_v2["healthCode"].unique()

    features = data.set_index("healthCode")
    health_codes = features.index.values
    walking_features = MinMaxScaler().fit_transform(features)
    nbrs = NearestNeighbors(n_neighbors = no_neighbors,
                            algorithm = 'auto',
                            metric = distance_metrics).fit(walking_features)
    distances, indices = nbrs.kneighbors(walking_features)

    nearest = {}
    for row_pos, (row_dist, row_idx) in enumerate(zip(distances, indices)):
        # The query row is usually ranked first, but ties (identical feature
        # rows) can put another row ahead of it, so skip it explicitly instead
        # of assuming it sits at rank 0.
        rank = next(k for k, neighbor_pos in enumerate(row_idx) if neighbor_pos != row_pos)
        nearest[health_codes[row_pos]] = (row_dist[rank], health_codes[row_idx[rank]])

    sim_data = pd.DataFrame(list(nearest.values()),
                            index = list(nearest.keys()),
                            columns = ["distance", "healthCode"])

    # Build both masks on the same frame so no implicit reindexing is needed.
    query_outside = ~sim_data.index.isin(target_healthcodes)
    match_inside = sim_data["healthCode"].isin(target_healthcodes).values
    sim_data = sim_data.loc[query_outside & match_inside]
    return sim_data.sort_values(by = "distance", ascending = False)

## 1. Euclidean Distance

In [395]:
# Euclidean metric on the combined V1 + V2 frame; with n_neighbors=2 the
# function keeps the second-ranked neighbour of every row.
getKNN_user2user(data, 2, "euclidean")

Unnamed: 0,distance,healthCode
72c95fc8-db97-4865-8bc3-a42e21d43e75,0.427246,194fd2c2-ca8a-4e12-91bc-24aa06ed55c1
46715e2d-faf6-41fd-abb2-92c06fbab6d9,0.273241,77f1e0ed-2815-4038-841d-d3daaadef680
ce5cb455-f71f-4727-94d5-13f860bf4ce0,0.246708,2783234d-ee3a-4253-90ec-2d5428fea7c5
0fb2371f-2456-430e-a9a4-f0cfba39ae30,0.242468,ef5ac368-b83c-4b8b-a6b3-69a1eb09ce59
1137bf6f-92b4-426f-acd5-87fcaacee4c1,0.221525,2783234d-ee3a-4253-90ec-2d5428fea7c5
416a984c-4a94-452b-93be-499c1ac374d6,0.170371,55702360-c266-4528-bb9d-caaa5b197225


## 2. Jaccard Distance

In [396]:
# NOTE(review): Jaccard is a boolean/set dissimilarity, while the features here
# are min-max-scaled continuous values; that presumably explains the empty
# result below - confirm whether this metric is meaningful for this data.
getKNN_user2user(data, 2, "jaccard")

Unnamed: 0,distance,healthCode


## 3. Manhattan Distance

In [397]:
# Same nearest-neighbour lookup, this time with the Manhattan (L1) metric.
getKNN_user2user(data, 2, "manhattan")

Unnamed: 0,distance,healthCode
90550db0-af30-495f-8d7c-5749577cdae2,2.32593,6e7005b4-d967-4b4b-a5f0-3a3e518cd138
72c95fc8-db97-4865-8bc3-a42e21d43e75,1.10976,f5e07db6-e37e-4663-97c6-82627e216b61
1137bf6f-92b4-426f-acd5-87fcaacee4c1,0.574833,2783234d-ee3a-4253-90ec-2d5428fea7c5
46715e2d-faf6-41fd-abb2-92c06fbab6d9,0.558465,77f1e0ed-2815-4038-841d-d3daaadef680
ce5cb455-f71f-4727-94d5-13f860bf4ce0,0.543029,2783234d-ee3a-4253-90ec-2d5428fea7c5
0fb2371f-2456-430e-a9a4-f0cfba39ae30,0.47808,ef5ac368-b83c-4b8b-a6b3-69a1eb09ce59
416a984c-4a94-452b-93be-499c1ac374d6,0.402867,c4afc1ed-56e3-4ce5-9417-8712cb4eb8dc


## 4. Cosine Distance

In [398]:
# Same nearest-neighbour lookup with the cosine metric. The helper defined in
# this notebook is getKNN_user2user; `getKNN` is not defined anywhere here and
# raises NameError on a fresh kernel.
getKNN_user2user(data, 2, "cosine")

Unnamed: 0,distance,healthCode
15cd9496-900a-44bb-948b-12a24b101615,0.097246,bf600be5-e1b7-47e6-ba44-03e11bcf1edf
72c95fc8-db97-4865-8bc3-a42e21d43e75,0.0362605,194fd2c2-ca8a-4e12-91bc-24aa06ed55c1
866b1e91-f3c8-4884-aa78-0d0b53c13b20,0.0301117,1bce95ce-95d8-4bb5-8df4-bcf46418d312
46715e2d-faf6-41fd-abb2-92c06fbab6d9,0.0179735,77f1e0ed-2815-4038-841d-d3daaadef680
0fb2371f-2456-430e-a9a4-f0cfba39ae30,0.0134101,ef5ac368-b83c-4b8b-a6b3-69a1eb09ce59
ce5cb455-f71f-4727-94d5-13f860bf4ce0,0.00998501,c4afc1ed-56e3-4ce5-9417-8712cb4eb8dc
1137bf6f-92b4-426f-acd5-87fcaacee4c1,0.00881263,2783234d-ee3a-4253-90ec-2d5428fea7c5
b1f4d4d6-5247-48c4-9990-28a13d34fc19,0.00556746,d2c2a238-36ab-493e-9728-589eb0797fb3
416a984c-4a94-452b-93be-499c1ac374d6,0.00407586,55702360-c266-4528-bb9d-caaa5b197225


# Finding Similar Users in mPowerV2 vs Passive

When comparing mPowerV2 with the passive data, rather than pairing up two similar users by their healthCodes, we want to see how similar a given healthCode's active data is to its passive counterpart.

In [437]:
from sklearn.metrics.pairwise import cosine_similarity

In [450]:
# Stack the active (V2) and passive tables, drop incomplete rows and renumber.
# Note: this rebinds `data`, which previously held the V1 + V2 frame.
data = (
    pd.concat([walking_data_v2, walking_data_passive])
    .dropna()
    .reset_index(drop = True)
)

In [451]:
# healthCodes that occur more than once in the stacked frame (second and later
# occurrences are flagged by duplicated()).
# NOTE(review): presumably each source table has one row per healthCode, so a
# repeat means "present in both V2 and passive" - confirm, since a repeat
# inside a single table would be flagged too.
list_ = data.loc[data["healthCode"].duplicated(), "healthCode"]

In [453]:
# Keep only rows whose healthCode is in list_, index them by healthCode, and
# rescale every remaining column to [0, 1].
# Re-running this cell fails: `data` no longer has a healthCode column
# after set_index.
data = data.loc[data["healthCode"].isin(list_)].set_index("healthCode")
min_max_scaler = MinMaxScaler()
walking_features = min_max_scaler.fit_transform(data)

In [457]:
# Pairwise cosine similarity between all retained rows, labelled by healthCode
# on both axes. The labels repeat, because only healthCodes that occur more
# than once were kept, so a label alone does not identify a single row.
matrix = pd.DataFrame(cosine_similarity(walking_features), index = data.index, columns = data.index)