In [1]:
import random

import numpy as np
import pandas as pd
import seaborn as sns

from algorithm_gd import forward, loss_fn, r2_score, gradient_m, gradient_c, get_iteration_vs_accuracy_data

In [2]:
# Read the height/weight CSV and expose its two measurement columns under the
# generic names X (height, inches) and y (weight, pounds).
column_aliases = {"Height(Inches)": "X", "Weight(Pounds)": "y"}
df = pd.read_csv("SOCR-HeightWeight.csv").rename(columns=column_aliases)
df

Unnamed: 0,Index,X,y
0,1,65.78331,112.9925
1,2,71.51521,136.4873
2,3,69.39874,153.0269
3,4,68.21660,142.3354
4,5,67.78781,144.2971
...,...,...,...
24995,24996,69.50215,118.0312
24996,24997,64.54826,120.1932
24997,24998,64.69855,118.2655
24998,24999,67.52918,132.2682


In [3]:
# Number of partitions ("nodes") the dataset is split into below.
no_of_nodes = 4

In [4]:
# Split the rows of `df` into `no_of_nodes` contiguous, near-equal chunks.
# The row *positions* are split with NumPy and each chunk is then selected
# with `iloc`; calling `np.array_split` directly on the DataFrame goes through
# the deprecated `DataFrame.swapaxes` and emits a FutureWarning.
row_position_chunks = np.array_split(np.arange(len(df)), no_of_nodes)
node_data_partitions = [df.iloc[chunk] for chunk in row_position_chunks]
node_data_partitions

  return bound(*args, **kwds)


[      Index         X         y
 0         1  65.78331  112.9925
 1         2  71.51521  136.4873
 2         3  69.39874  153.0269
 3         4  68.21660  142.3354
 4         5  67.78781  144.2971
 ...     ...       ...       ...
 6245   6246  68.55917  131.6865
 6246   6247  69.82956  152.9133
 6247   6248  68.56366  135.9443
 6248   6249  69.89716  135.4161
 6249   6250  67.22911  126.9908
 
 [6250 rows x 3 columns],
        Index         X          y
 6250    6251  68.27784  151.49860
 6251    6252  67.70956  120.92840
 6252    6253  68.63497  143.90900
 6253    6254  66.24494  148.30150
 6254    6255  65.92820   98.38391
 ...      ...       ...        ...
 12495  12496  68.03870  138.32280
 12496  12497  69.78338  124.38070
 12497  12498  67.94391  131.20250
 12498  12499  66.63849  126.91320
 12499  12500  68.45079  127.41400
 
 [6250 rows x 3 columns],
        Index         X         y
 12500  12501  70.07106  113.5241
 12501  12502  69.31026  114.3171
 12502  12503  71.75837  1

In [5]:
# Tag every row with the position of the partition that contains it.
for node_id, partition in enumerate(node_data_partitions):
    partition_rows = partition.index
    df.loc[partition_rows, "node"] = node_id

# The column is created through partial assignment, so cast it to integers
# once every partition has been written.
df["node"] = df["node"].astype("int")
df

Unnamed: 0,Index,X,y,node
0,1,65.78331,112.9925,0
1,2,71.51521,136.4873,0
2,3,69.39874,153.0269,0
3,4,68.21660,142.3354,0
4,5,67.78781,144.2971,0
...,...,...,...,...
24995,24996,69.50215,118.0312,3
24996,24997,64.54826,120.1932,3
24997,24998,64.69855,118.2655,3
24998,24999,67.52918,132.2682,3


In [6]:
# def __gen_test_data_partition_frm_df(partitions, df):
#     shuffled = df.sample(frac=1)
#     result = np.array_split(shuffled, partitions)
#     return result

In [7]:
# df["partition"] = -1
# for i in range(no_of_nodes):
#     node_filter = df["node"] == i
#     node_df = df[node_filter]
#     partitions = __gen_test_data_partition_frm_df(no_of_nodes, node_df)
#     for i, p in enumerate(partitions):
#         df.loc[df.index.isin(p.index.values), "partition"] = i

In [8]:
# Display the frame again; it now carries the "node" column assigned earlier.
df

Unnamed: 0,Index,X,y,node
0,1,65.78331,112.9925,0
1,2,71.51521,136.4873,0
2,3,69.39874,153.0269,0
3,4,68.21660,142.3354,0
4,5,67.78781,144.2971,0
...,...,...,...,...
24995,24996,69.50215,118.0312,3
24996,24997,64.54826,120.1932,3
24997,24998,64.69855,118.2655,3
24998,24999,67.52918,132.2682,3


In [9]:
# df.drop(df.tail(8).index,inplace=True) # drop last n rows

In [10]:
# Precompute the per-row terms that the gradient sums use: X^2 and X*y.
# Whole-column arithmetic replaces the previous `apply` calls, which invoked a
# Python lambda once per element / once per row; the resulting values are the
# same.
df['X_sqr'] = np.square(df['X'])
df['Xy'] = df['X'] * df['y']

In [11]:
# Step size: multiplies the gradient term in the update `frnt - L * p * back`.
L = 1e-4
# An update smaller than or equal to this (in absolute value) ends the iteration.
stop_threshold = 1e-4

In [12]:
# Mixing weights between nodes: row `i` holds the weights applied to each
# entry of the state vector when computing node i's weighted sum.
# Every row and every column sums to 1, and the matrix is symmetric.
doubly_stochastic_matrix_config = [
    [0.5, 0.25, 0.125, 0.125],
    [0.25, 0.75, 0, 0],
    [0.125, 0, 0.875, 0],
    [0.125, 0, 0, 0.875],
]

In [13]:
# Hand-picked state vectors with one value per node.
# NOTE(review): presumably alternative starting points for the iteration;
# only `s1` is referenced in this notebook, and only from commented-out code.
inv = [1.9] * 4
s0 = [0.0] * 4
s1 = [1.7, 1.9, 1.1, 1.8]
s2 = s1[:3] + [1.9]  # same as s1 except for the last entry
inv2 = [1.87] * 4

In [14]:
# Another hand-picked state vector (one value per node).
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
ns = [1.8, 1.5, 1.6, 1.8]

In [15]:
# State vector with one value per node.
# NOTE(review): `ns_025` is assigned again further down (cell In [43]) before
# the active iteration cell reads it, so this value is overwritten on a
# top-to-bottom run.
ns_025 = [1.3, 1.65, 1.675, 1.0]

In [16]:
def get_node_df(pos):
    """Return the rows of the global `df` whose "node" value equals `pos`."""
    belongs_to_node = df["node"] == pos
    return df[belongs_to_node]

In [17]:
def get_front(state, pos):
    """Return the weighted sum of `state` for node `pos`.

    The weights are row `pos` of `doubly_stochastic_matrix_config`; each
    weight is multiplied by the matching entry of `state` and the products
    are summed.
    """
    weights = doubly_stochastic_matrix_config[pos]
    weighted_terms = (weight * value for weight, value in zip(weights, state))
    return sum(weighted_terms)

In [18]:
def get_p(pos):
    """Return -2 / n for node `pos`.

    `n` is the number of non-null values in the `Index` column of the node's
    rows. The result is the `p` factor in the update `frnt - L * p * back`.
    """
    sample_count = get_node_df(pos).Index.count()
    return -2 / sample_count

In [19]:
def get_q(pos):
    """Return the sum of the `Xy` column over the rows of node `pos`."""
    xy_terms = get_node_df(pos)['Xy']
    return np.sum(xy_terms)

In [20]:
def get_r(pos):
    """Return the sum of the `X_sqr` column over the rows of node `pos`."""
    x_squared_terms = get_node_df(pos)['X_sqr']
    return np.sum(x_squared_terms)

In [21]:
# p = get_p()
# q = get_q()
# r = get_r()
# p, q, r

In [22]:
def get_back(s, pos):
    """Return sum(Xy - m * X_sqr) over node `pos`'s rows, with m = s[pos].

    Only entry `pos` of the state vector `s` is read; the sum runs over the
    rows returned by `get_node_df(pos)`.
    """
    slope = s[pos]
    node_rows = get_node_df(pos)
    per_row_terms = node_rows['Xy'] - slope * node_rows['X_sqr']
    return np.sum(per_row_terms)

In [23]:
# s = ns_025[:]
# s_copy = s[:]
# print(s)
# for i in range(100):
#     for pos in range(len(s)):
#     # for pos in [3]:
#         p = get_p(pos)
#         frnt = get_front(s_copy, pos)
#         back = get_back(s_copy, pos)
        
#         # input()
#         # s[pos] = frnt - L * p * (q - s[pos] * r)
#         tmp = frnt - L * p * back
#         print("Difference:", abs(s[pos] - tmp))
#         s[pos] = tmp
#     s_copy = s[:]
#     print(s)
#     print()

In [43]:
# Uniform starting state (one value per node) read by the iteration cell below.
ns_025 = [1.85] * 4

In [44]:
# Per-node iteration of the update  value <- frnt - L * p * back.
#
# Each node is iterated independently: every other entry of the state vector
# stays frozen at its starting value from `ns_025`, and only entry `pos`
# evolves. A node stops as soon as one update changes its value by at most
# `stop_threshold`; if that never happens within `max_iterations` updates,
# a "Not converging" message is printed for that node.
max_iterations = 100

s = ns_025[:]
print(s)
for pos in range(len(s)):
    # `p` depends only on the node's rows, not on the evolving state, so it is
    # computed once per node instead of re-filtering the frame every iteration.
    p = get_p(pos)
    s_copy = ns_025[:]
    for iteration in range(max_iterations):
        # Feed the node's latest value into the otherwise frozen state vector.
        s_copy[pos] = s[pos]
        frnt = get_front(s_copy, pos)
        back = get_back(s_copy, pos)
        tmp = frnt - L * p * back
        if abs(s[pos] - tmp) <= stop_threshold:
            # Converged: s[pos] keeps the previous iterate; `tmp` is not stored.
            break
        s[pos] = tmp
    else:
        print("Not converging for", pos)
print(s)
print()

[1.85, 1.85, 1.85, 1.85]
[np.float64(1.8647388421646927), np.float64(1.8661773879426138), np.float64(1.8657853460170881), np.float64(1.8663749688174782)]



In [26]:
# s = s1[:]
# s_copy = s[:]
# print(s)
# for i in range(5):
#     for pos in range(len(s)):
#     # for pos in [3]:
#         frnt = get_front(s_copy, pos)
#         back = get_back(s, pos)
#         # s[pos] = frnt - L * p * (q - s[pos] * r)
#         s[pos] = frnt - L * p * back
#         # print(frnt, s)
#     s_copy = s[:]
#     print(s)