In [None]:
import os
import pandas as pd
import networkx as nx
from matplotlib import pyplot as plt
from tqdm import tqdm

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
files = os.listdir(r"/content/drive/MyDrive/MLNS/0301/")[:-1]

In [None]:
files

['0.txt', '1.txt', '2.txt', '3.txt']

In [None]:
columns = ["id", "uploader","age", "category","length","views","rate","ratings","comments"]

for i in range(1,21):
  columns.append(f"rec_{i}")

In [None]:
#we want to open any folder and save just 0.txt
df_list = []
failed = []
for i in tqdm(files):
    path = f"/content/drive/MyDrive/MLNS/0301/{i}"
    try:
        df = pd.read_csv(path, sep = '\t', header= None, names = columns)
    except:
        print(f"Error importing file:{i}")
        failed.append(i)
        continue
    df_list.append(df)

100%|██████████| 4/4 [00:04<00:00,  1.11s/it]


In [None]:
data = pd.concat(df_list)

# Data Exploration

In [None]:
data.head(2)

Unnamed: 0,id,uploader,age,category,length,views,rate,ratings,comments,rec_1,...,rec_11,rec_12,rec_13,rec_14,rec_15,rec_16,rec_17,rec_18,rec_19,rec_20
0,2rwktobtv9s,EA,742.0,Gadgets & Games,83.0,389536.0,2.65,2294.0,268.0,SQI9xPF9rdk,...,PmRHEQaCFsw,FyuYJsBavBs,VdHsMJRszck,uG1Q5LhqpsM,2rwktobtv9s,krT9Pjy9d8s,N4DdAIc_0tY,FkKWCBWVwQg,xXfmxQO2xz0,IqlxYO7YCI8
1,h6Ghupxbj9g,KB42PAH,742.0,Sports,28.0,276207.0,4.57,297.0,424.0,O1dXfikoYoQ,...,UZcvur6dBqM,D3PZXxx57-4,_-lcaXabZ8I,15VIqMXdKHU,8nTFFRtGb4M,1lYXY_eTNY8,bFhohWt2rAA,Hcb9w4KmQU4,KZ1dgZhOObc,rt-ytACeVM0


In [None]:
len(data.id.unique())

155513

In [None]:
len(data)

155513

In [None]:
data.index = data.id

In [None]:
data = data.drop("id", axis =1)

In [None]:
#there are some video for which we have just missing values
len(data[data.isnull().all(axis=1)])

696

In [None]:
#we keep just the video for which we have the information
data = data[data.notnull().any(axis=1)]

In [None]:
#sometime not all the recommendations are available
data.isna().sum()

uploader        0
age             0
category        0
length          0
views           0
rate            0
ratings         0
comments        0
rec_1        3219
rec_2        3786
rec_3        4212
rec_4        4578
rec_5        4895
rec_6        5169
rec_7        5414
rec_8        5685
rec_9        5926
rec_10       6255
rec_11       6501
rec_12       6769
rec_13       7010
rec_14       7185
rec_15       7376
rec_16       7599
rec_17       7814
rec_18       8106
rec_19       8738
rec_20      10277
dtype: int64

In [None]:
ids = data.index.tolist()

In [None]:
reccomendations_id = []
for i in tqdm(range(1,21)):
  id = data[f"rec_{i}"].unique().tolist()
  for n in id:
    reccomendations_id.append(n)


100%|██████████| 20/20 [00:01<00:00, 11.88it/s]


In [None]:
len(set(reccomendations_id).intersection(set(ids)))

154788

In [None]:
len(ids)

154817

In [None]:
data.insert(loc = 0,
          column = 'id',
          value = data.index)

In [None]:
data_list = data.values.tolist()

In [None]:
for i in set(data_list[0][9:]).intersection(set(ids)):
  print(i)

IqlxYO7YCI8
FkKWCBWVwQg
FyuYJsBavBs
2rwktobtv9s
VnLVtz4Vq18
krT9Pjy9d8s
U0raaoN6I6M
PmRHEQaCFsw
OW_Azt-ZFvI
2aDGS2ObyS8
VdHsMJRszck
vURuMxGC53A
xXfmxQO2xz0
N4DdAIc_0tY
1umiJrKfpdk
AYNFCy6hvFQ
SQI9xPF9rdk
uG1Q5LhqpsM
1gxK1e5MSYg
4q5jSGOcZb8


In [None]:
ids_set = set(ids)
edges = []
for i in tqdm(data_list):
  source = i[0]
  targets = set(i[9:]).intersection(ids_set)
  for t in targets:
    if t == t:
      edges.append([source, t])

100%|██████████| 154817/154817 [00:04<00:00, 37479.36it/s]


**NOTE**: if we plan to experiment on models that do not use nodes features we could also import all recommended video without checking if we have the information about it.

# Save all the datasets

In [None]:
#save initial dataset just for safety
data.to_csv("/content/drive/MyDrive/MLNS/youtube_raw.csv")

In [None]:
#convert edge list in dataframe
df_edges = pd.DataFrame(edges, columns=["from", "to"])
df_edges.head(2)

Unnamed: 0,from,to
0,2rwktobtv9s,IqlxYO7YCI8
1,2rwktobtv9s,FkKWCBWVwQg


In [None]:
df_edges.to_csv("/content/drive/MyDrive/MLNS/edges.csv", sep = "\t", index = False)

In [None]:
nodes = data.iloc[:,:9]
print(len(nodes))
nodes.head(2)


17365


Unnamed: 0,id,channel,age,category,length,views,rate,ratings,comments
0,LKh7zAJ4nwo,TheReceptionist,653.0,Entertainment,424.0,13021.0,4.34,1305.0,744.0
1,7D0Mf4Kn4Xk,periurban,583.0,Music,201.0,6508.0,4.19,687.0,312.0


# Create network

In [None]:
G = nx.from_pandas_edgelist(df_edges, 'from', 'to')

#retrieve largest connected component
Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
G0 = G.subgraph(Gcc[0])

In [None]:
len(G0.nodes)

154677

In [None]:
len(list(nx.connected_components(G)))

7