# Imports

In [12]:
import torch
import pandas as pd
from scipy.spatial.distance import cosine
from tqdm import tqdm
import pickle

In [2]:
training_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
df=pd.read_csv("Data/cf_train.csv")

# Custom Dataloader

In [32]:
class CustomDataset:
    def __init__(self,dataframe,batch_size,device =training_device,shuffle=False):
        self.df=dataframe
        self.batch_size=batch_size
        self.columns_to_drop=['row_num','day','era','target_10_val','target_5_val','sigma','day_no']
        self.X = self.df.drop(self.columns_to_drop, axis=1)
        self.y=self.df['target_10_val']
        self.device=device
        self.shuffle=shuffle

    def generate_batches_with_labels(self,idx):
        data=self.X.iloc[:max(0,idx-10)]
        labels=self.y.iloc[:max(0,idx-10)]
        dataset =  torch.utils.data.TensorDataset(torch.tensor(data.values),torch.tensor(labels.values))
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=self.shuffle)
        data_unseen = self.X.iloc[max(0,idx-9):idx+1]
        labels_unseen=self.y.iloc[max(0,idx-9):idx+1]
        data_unseen,labels_unseen = torch.tensor(data_unseen.values).to(self.device),torch.tensor(labels_unseen.values).to(self.device)
        return dataloader, (data_unseen,labels_unseen)
    
        
        

In [35]:
customDS = CustomDataset(df,64,shuffle = False)
supervised_dl, unsupervised_data = customDS.generate_batches_with_labels(77)

## GET SIMILARITY

In [33]:
# Find Similarity. may help in choosing datasets:
cosine_threshold=0.4
columns_to_drop=['row_num','day','era','target_10_val','target_5_val','sigma','day_no']
temp_df=df.drop(columns_to_drop, axis=1)
similarity=[]
for i in tqdm(range(len(temp_df))):
    similar=set()
    row=temp_df.iloc[i]
    found=0
    for j in similarity:
        old=temp_df.iloc[next(iter(j))]
        if cosine(old,row)>=cosine_threshold:
            j.add(i)
            found=1
            break
    if not found:
        similar.add(i)
        similarity.append(similar)

  0%|          | 12/62400 [00:00<13:14, 78.57it/s]

  5%|▍         | 2879/62400 [00:42<49:34, 20.01it/s] 

In [None]:
len(similarity)

2278

In [None]:
for i in similarity:
    print(len(i))

51956
2880
636
1150
315
65
92
505
2
238
108
312
45
146
185
13
517
34
58
39
35
48
5
57
15
6
12
5
1
5
1
1
1
1
1
1
9
2
4
19
1
18
1
5
1
1
1
1
1
1
1
1
17
4
1
1
1
38
77
1
1
12
1
2
1
1
1
15
1
1
1
1
72
1
1
1
1
2
1
9
1
9
1
1
6
3
6
1
1
1
1
1
7
9
2
6
1
7
6
1
2
5
18
8
1
1
1
1
1
1
1
1
1
1
5
1
1
1
12
7
1
1
1
1
1
2
6
4
8
1
1
1
1
5
1
1
1
1
5
1
14
1
2
1
1
1
1
1
1
1
1
1
1
1
1
5
1
1
1
1
19
3
1
1
6
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
5
1
1
1
1
4
1
12
1
2
2
1
1
1
5
1
8
1
1
1
1
1
1
1
1
5
28
5
2
1
1
1
1
3
2
2
1
3
1
3
1
2
1
1
1
1
1
1
1
1
2
1
1
1
1
4
1
1
3
1
1
4
1
5
1
1
1
1
2
1
1
2
2
1
1
1
1
5
1
1
1
1
1
4
1
1
1
1
1
2
1
1
5
4
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
2
6
1
1
1
1
1
1
1
1
1
1
2
1
1
3
1
4
1
4
1
2
1
1
2
2
1
1
1
1
1
1
1
1
3
1
1
2
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
3
1
1
1
2
1
1
1
1
1
1
1
1
1
2
1
1
1
1
4
1
2
1
2
2
4
1
1
1
1
1
1
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
3
1
1
4
2
1
1
1
1
1
1
1
1
1
1
1
1
3
1
1
1
1
1
1
1


In [None]:
with open("similarity_sets.pkl", 'wb') as f:
    pickle.dump(similarity, f)