In [None]:
from scripts import *

# Exploration

## Collaborative filtering

### Data Restructuring

- Group by user to collect each user's commits and to quantify which target_interest those commits belong to.
- Using the resulting user groups, group the df again by user group and time. This leaves very few groups, so we can run a regression on each group's activity over time.

In [None]:
# Build one row per commit (author, interest id, cluster id), attach the interest
# label, then collapse to one row per user where every other column becomes a
# list of that user's per-commit values.
userdf = (
    pd.DataFrame({
        "user": df["user"],
        "target_interest_id": list(y),
        "cluster_id": list(cluster_semi_s_hdb.labels_),
    })
    .assign(
        # ids are shifted by one before indexing target_interest
        # (presumably the ids are 1-based; confirm against the labelling code)
        target_interest=lambda commits: commits["target_interest_id"].apply(
            lambda interest_id: target_interest[interest_id - 1]
        )
    )
    .groupby("user")
    .agg(list)
    .reset_index()
)

# number of distinct users
print(len(userdf))

### User Grouping

In [None]:
# Per-user frame holding only the user name and the result of dict_per_user
# applied to that user's list of interests; the raw list column is dropped.
userdf_with_dict = (
    userdf[["user", "target_interest"]]
    .assign(
        target_interest_dict=lambda users: users["target_interest"].apply(
            lambda interests: dict_per_user(interests, target_interest)
        )
    )
    .drop(columns=["target_interest"])
)

print(len(userdf_with_dict))

userdf_with_dict.head(3)

In [None]:
# One dict_per_user result per user, in the same row order as userdf.
per_user_rows = [
    dict_per_user(interests, target_interest)
    for interests in userdf["target_interest"]
]

# Tabular form of those results: one row per user.
df_user_interest_matrix = pd.DataFrame(per_user_rows)

# Pure feature matrix for clustering; taken BEFORE the "user" column is added
# so it contains the interest columns only.
target_interest_matrix = df_user_interest_matrix.to_numpy()

# Put the user name in front for readability / later lookups.
df_user_interest_matrix.insert(0, "user", userdf["user"])

df_user_interest_matrix

In [None]:
# Peek at the first five rows of the user-by-interest feature matrix.
target_interest_matrix[0:5]

In [None]:
# Cluster users on their full-dimensional interest vectors.
colab_clusters = HDBSCAN(min_cluster_size=90, min_samples=10, metric='euclidean', cluster_selection_method='eom').fit(target_interest_matrix)

# -1 is the label HDBSCAN gives to noise points. Remove it explicitly instead of
# subtracting 1 from the label count: if no point is labelled noise, the
# subtraction would under-report the number of clusters by one.
n_colab_clusters = len(set(colab_clusters.labels_) - {-1})
n_colab_outliers = int((colab_clusters.labels_ == -1).sum())

print(f"""
    Full dimensionality clustering output:
    Len of colab clusters: {len(colab_clusters.labels_)}
    Number of clusters: {n_colab_clusters}
    Number of rows as outliers: {n_colab_outliers}
""")


In [None]:
# 2-D embedding of the user interest vectors, used for plotting only (the
# clustering above runs on the full-dimensional matrix).
# UMAP is stochastic; pinning random_state makes the embedding, and therefore
# the scatter plot below, reproducible across Restart & Run All.
colab_umap = UMAP(n_neighbors=15, min_dist=0.0, random_state=42).fit_transform(target_interest_matrix)

In [None]:
# UMAP coordinates paired with each user's HDBSCAN label.
# The label is cast to str so plotly assigns one discrete colour per cluster
# instead of a continuous colour scale.
colab_resdf = pd.DataFrame({
    "x": colab_umap[:, 0],
    "y": colab_umap[:, 1],
    "cluster": colab_clusters.labels_,
}).assign(cluster=lambda points: points["cluster"].astype(str))

# with few clusters you can hide the outliers (label -1) by filtering them out:
#colab_resdf = colab_resdf[colab_resdf["cluster"] != "-1"]

fig_colab = px.scatter(
    colab_resdf,
    x="x",
    y="y",
    color="cluster",
    title="Colab clustering",
    width=800,
    height=800,
    range_x=[-25, 25],
    range_y=[-25, 25],
)

fig_colab.show()

# TODO add one visualisation without time grouping
# this would give us the "true" user groups, and then we could see if they moved around without breaking up the group too much
# also it is not a bug that there is overlap of clusters, as the clustering takes place before umap

In [None]:
## TODO
# - now we have to do this per user.
# - we need to look at what a given user is "committing" about interest-wise, and then see which cluster that user is in
# - then when we ... (unfinished note; a stray fragment `title="Timeline of commits by interest",` was pasted here)

### user cluster time grouping

In [None]:
# Lookup table: user name -> collaborative-filtering cluster label.
# Rows of df_user_interest_matrix and colab_clusters.labels_ are paired by position.
userdId_groupID_dict = {
    user_name: cluster_label
    for user_name, cluster_label in zip(df_user_interest_matrix["user"], colab_clusters.labels_)
}

# A shorter dict means some user name occurred more than once and was overwritten.
if len(userdId_groupID_dict) != len(df_user_interest_matrix):
    print("WARNING: dict and userdf_ex are not the same length")

In [None]:
# Inspect which columns df exposes before building the per-group frame below.
df.columns

In [None]:
# One row per commit with the columns needed for the per-group time series:
#   target_interest - interest label looked up from the (shifted-by-one) id
#   user_group_id   - cluster label of the committing user
#   time_day        - commit timestamp in whole days (seconds // 86400)
usergroupdf = pd.DataFrame({
    "user": df["user"],
    "time_sec": list(df["time_sec"]),
    "target_interest_id": list(y),
}).assign(
    target_interest=lambda commits: commits["target_interest_id"].apply(
        lambda interest_id: target_interest[interest_id - 1]
    ),
    user_group_id=lambda commits: commits["user"].apply(
        lambda user_name: userdId_groupID_dict[user_name]
    ),
    time_day=lambda commits: commits["time_sec"].apply(
        lambda seconds: seconds // (60 * 60 * 24)
    ),
)

usergroupdf.sample(5)

### Missing data interpolation

- Insert nan rows per missing date
- Fill nan with smoothed values
  - (this is called interpolation)

In [None]:
seconds_per_day = 60 * 60 * 24

# Use min/max rather than tail(1)/head(1): the positional version is only correct
# when df happens to be sorted newest-first, and silently yields an empty day
# range (first_day > last_day) for any other ordering.
earliest_sec = df["time_sec"].min()
latest_sec = df["time_sec"].max()

# int(...) so range() below also works when time_sec is stored as float.
first_day = int(earliest_sec // seconds_per_day)
last_day = int(latest_sec // seconds_per_day)

print(f'''
first day: {first_day}
last day: {last_day}
ammount of days in df: {(latest_sec - earliest_sec) // seconds_per_day}
''')

# Every calendar day between the first and last commit, inclusive.
days_series = pd.Series(range(first_day, last_day + 1))

In [None]:
# Collapse the commit rows to one row per (user group, day); every other column
# becomes the list of values seen for that group on that day.
# NOTE: this overwrites usergroupdf, so the cell is not safe to run twice in a row.
per_group_and_day = usergroupdf.groupby(["user_group_id", "time_day"])
usergroupdf = per_group_and_day.agg(list).reset_index()

# Sanity check: every cluster label should appear as a user group.
groups_in_frame = usergroupdf["user_group_id"].nunique(dropna=False)
labels_in_clustering = len(set(colab_clusters.labels_))
print(f'User group equal to cluster groups: {groups_in_frame == labels_in_clustering}')
print(len(usergroupdf))

usergroupdf.head(3)

In [None]:
# Same construction as the per-user interest matrix, but with one row per
# (user group, day) instead of one row per user.
group_day_rows = [
    dict_per_user(interests, target_interest)
    for interests in usergroupdf["target_interest"]
]

df_group_interest_matrix = pd.DataFrame(group_day_rows)

# Feature-only matrix, captured before the two key columns are added.
usergroup_target_interest_matrix = df_group_interest_matrix.to_numpy()

# Key columns go first: which group, which day.
df_group_interest_matrix.insert(0, "user_group_id", usergroupdf["user_group_id"])
df_group_interest_matrix.insert(1, "time_day", usergroupdf["time_day"])

df_group_interest_matrix

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns of the group/day matrix.
df_group_interest_matrix.describe()

In [None]:
# Calendar of all days as a single-column frame, used as the merge key when
# inserting the missing days for each user group.
days_frame = days_series.to_frame(name="time_day")

In [None]:
# making list of dfs based on usergroup: one complete daily time series per group
#
# Fixes compared with the previous version of this cell:
# - the frame-wide forward fill ran inside the column loop, so after the first
#   column every other column was already forward-filled and never actually
#   spline-interpolated; it now runs once, after all columns are interpolated
# - chained assignment (df[col][0] = ...) and inplace calls on column selections
#   or slices do not reliably write back (they are no-ops under pandas
#   copy-on-write); results are now assigned explicitly
# - row 0 is only anchored at 0.0 when it is missing, so real data for the first
#   day is no longer overwritten

usergroup_df_list = []

for usergroup in df_group_interest_matrix["user_group_id"].unique():
  # rows of this group only; drop() returns a new frame, so the shared
  # df_group_interest_matrix is never modified through a slice
  df_this_group = (
      df_group_interest_matrix[df_group_interest_matrix["user_group_id"] == usergroup]
      .drop(columns=["user_group_id"])
  )

  # outer merge against the full calendar inserts an all-NaN row for every day
  # on which this group has no commits
  df_this_group = pd.merge(days_frame, df_this_group, how="outer", on="time_day")

  # make sure row 0 really is the first day and the index is 0..n-1, which the
  # anchor below and the index-based spline both rely on
  df_this_group = df_this_group.sort_values("time_day").reset_index(drop=True)

  for column in df_this_group.columns:
    if column == "time_day":
      continue

    df_this_group[column] = df_this_group[column].astype(float)

    # the spline needs a defined starting value; assume no activity before the
    # first observation
    if pd.isna(df_this_group.loc[0, column]):
      df_this_group.loc[0, column] = 0.0

    # cubic, tried linear, but it was not as good
    df_this_group[column] = df_this_group[column].interpolate(method="spline", order=3)

  # safety net for anything the spline left unfilled
  df_this_group = df_this_group.ffill()

  usergroup_df_list.append(df_this_group)

In [None]:
# Print the (rows, columns) shape of every per-group frame as a quick sanity check.
for group_position in range(len(usergroup_df_list)):
  print(usergroup_df_list[group_position].shape)

In [None]:
# NaN count per column for the sixth entry of usergroup_df_list; all zeros means the gap filling was complete.
usergroup_df_list[5].isna().sum()

In [None]:
# Display the sixth entry of usergroup_df_list for a visual check of the filled values.
usergroup_df_list[5]

### Neural Net Custom Model
This will be scuffed


In [None]:
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [None]:
# we do not need flatten as we do not have 2D input

class NeuralNetwork(nn.Module):
    """Fully connected network mapping one feature row to a row of the same kind.

    The stack is ``Linear -> ReLU`` four times: an input layer, two hidden
    layers and an output layer. The ReLU after the output layer clamps every
    predicted value to be non-negative.

    Args:
        in_features: Width of an input row. Defaults to 20.
        hidden_features: Width of each hidden layer. Defaults to 40.
        out_features: Width of a predicted row. Defaults to 20.

    Calling ``NeuralNetwork()`` without arguments builds exactly the
    20 -> 40 -> 40 -> 40 -> 20 network that was previously hard-coded.
    """

    def __init__(self, in_features=20, hidden_features=40, out_features=20):
        super().__init__()

        # nn.Sequential applies the layers in the order they are listed
        self.linear_relu_stack = nn.Sequential(
            # input layer
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),

            # two more hidden layers
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),

            # output layer, followed by a ReLU so outputs are never negative
            nn.Linear(hidden_features, out_features),
            nn.ReLU(),
            #nn.Softmax(dim=1)
        )

    def forward(self, x):
        """Run ``x`` (last dimension of size ``in_features``) through the stack.

        Returns a tensor whose last dimension has size ``out_features``.
        """
        return self.linear_relu_stack(x)

In [None]:
# Instantiate the network with its default sizes and move its parameters to `device`
# (`device` is defined outside this section).
model = NeuralNetwork().to(device)
# Printing an nn.Module lists its layers.
print(model)

### Training the Model

In [None]:
from math import floor

# Use one of the per-group daily frames as the experiment data.
sampledf = usergroup_df_list[1]

print(len(sampledf))

# Feature columns only (the day key is not an input), as a float32 tensor on `device`.
feature_values = sampledf.drop(columns=["time_day"]).values
group_tensor = torch.tensor(feature_values, device=device).float()

# Chronological 70/30 split: `start` is the number of training rows (and hence
# the index of the first test row), `end` is the number of test rows.
start = floor(len(group_tensor) * (7/10))
end = len(group_tensor) - start

group_tensor_train = group_tensor[:start]
group_tensor_test = group_tensor[start:start + end]

print(f"""
{group_tensor.size()}
{group_tensor_train.size()}
{group_tensor_test.size()}
""")

# number of features per row
group_tensor[0].size(0)


In [None]:
#criterion = nn.NLLLoss()

# this is the loss function we want to use? MSE is often used when predicting numerical stuff
# Mean squared error; in the training loop it compares the predicted next row with the actual next row.
criterion = nn.MSELoss()


# Plain stochastic gradient descent over all model parameters with learning rate 1e-3.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [None]:
# consider cranking this up as the loss kept going down (maybe 20 would be sane)
#
# Next-row prediction: each training row is the input and the row after it is
# the target, with one optimizer step per row pair.
# After the loop, prediction_row_tensor holds the predictions of the LAST epoch,
# aligned with group_tensor_train (same number of rows).
for epoch in range(80):
  running_loss = 0

  # Row 0 has no predecessor it could be predicted from, so the real first row
  # stands in for it to keep the predictions aligned with the inputs.
  epoch_predictions = [group_tensor_train[0:1]]

  # one step per (row, next row) pair
  n_steps = len(group_tensor_train) - 1

  for i in range(n_steps):

    this_row = group_tensor_train[i]
    next_row = group_tensor_train[i+1]

    optimizer.zero_grad()

    next_row_prediction = model(this_row)

    loss = criterion(next_row_prediction, next_row)

    loss.backward()
    optimizer.step()

    running_loss += loss.item()

    # detach(): keep the values only. Storing the attached tensor would keep
    # every step's autograd graph alive for the whole epoch.
    epoch_predictions.append(next_row_prediction.detach()[None, ...])

  # concatenate once per epoch instead of growing a tensor inside the loop
  prediction_row_tensor = torch.cat(epoch_predictions, dim=0)

  # average over the number of optimizer steps actually taken
  print(f"Epoch {epoch} - Training loss: {running_loss / max(n_steps, 1)}")

In [None]:
# Row count should equal len(group_tensor_train): the seed row plus one prediction per training step.
prediction_row_tensor.size()

In [None]:
# Inspect a single row (index 5) of the full group tensor.
group_tensor[5]

### Predicting / Looking at results

In [None]:
# getting the predictions for the test set

# Inference only: no_grad() skips building an autograd graph for these forward passes.
with torch.no_grad():
    test_predictions = model(group_tensor_test)

# Keep only the training-aligned rows before appending, so re-running this cell
# replaces the test predictions instead of appending them a second time.
train_predictions = prediction_row_tensor[:len(group_tensor_train)].detach()

prediction_row_tensor = torch.cat((train_predictions, test_predictions), dim=0)

In [None]:
# Number of training rows, i.e. the row index at which the test split begins.
start

In [None]:
# Move predictions and ground truth to host memory as NumPy arrays for plotting.
prl = prediction_row_tensor.detach().cpu().numpy()
gp = group_tensor.detach().cpu().numpy()

# One line per feature column; only the rows from `start` onwards (the test
# period) are drawn.
predicted_frame = pd.DataFrame(list(prl))
fig = px.line(predicted_frame.iloc[start:], height=600, width=1200)

fig.show()

In [None]:
# Ground truth for the same rows (from `start` onwards), to compare against
# the prediction plot.
actual_frame = pd.DataFrame(list(gp))
fig = px.line(actual_frame.iloc[start:], height=600, width=1200)

fig.show()

## Exploring water simulation based prediction potential

In [None]:

# Extent of the embedding along its first two columns; used to choose the
# plotting / simulation frame.
uembs_first_axis = uembs[:,0]
uembs_second_axis = uembs[:,1]

x_lower, x_upper = min(uembs_first_axis), max(uembs_first_axis)
y_lower, y_upper = min(uembs_second_axis), max(uembs_second_axis)

print(f"""
    Bounds of the uembs
    
    x axis:
    {x_lower}
    {x_upper}
    
    "y axis"
    {y_lower}
    {y_upper}
""")

We can set the frame for the water prediction to ±25 on both axes.

512 x 512*2 pixels in that space.

Generate the next frame of the animation.

Give two frames of history as input:
- could give one frame per week per user
- could use one colour per user group