### Recommendation: use version 3 for a typical DataFrame dataset

## Simple Exercise 1

In [4]:
# %pip install pandas==1.5.3  # one-time setup (version taken from the install log below); restart the kernel afterwards

Collecting pandas
  Downloading pandas-1.5.3-cp311-cp311-macosx_11_0_arm64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pytz>=2020.1
  Downloading pytz-2022.7.1-py2.py3-none-any.whl (499 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.4/499.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting numpy>=1.21.0
  Using cached numpy-1.24.1-cp311-cp311-macosx_11_0_arm64.whl (13.8 MB)
Installing collected packages: pytz, numpy, pandas
Successfully installed numpy-1.24.1 pandas-1.5.3 pytz-2022.7.1
Note: you may need to restart the kernel to use updated packages.


In [10]:
import pandas as pd
import numpy as np

In [6]:

from itertools import combinations

# Toy input: every index label (A, B, C) is a node that owns a collection of colours.
df = pd.DataFrame(
    {
        'Col1': [
            ['Green', 'Red', 'Purple'],
            ['Red', 'Yellow', 'Blue'],
            ['Brown', 'Green', 'Yellow', 'Blue'],
        ],
    },
    index=list('ABC'),
)

# Keep the colours as sets so that later cells can intersect them directly.
df = df.assign(Col1=df['Col1'].map(set))
df


Unnamed: 0,Col1
A,"{Red, Purple, Green}"
B,"{Red, Blue, Yellow}"
C,"{Blue, Yellow, Green, Brown}"


In [7]:
# Every unordered pair of index labels becomes one candidate edge (Src -> Dst).
df1 = pd.DataFrame(list(combinations(df.index.tolist(), 2)), columns=['Src', 'Dst'])

df1

Unnamed: 0,Src,Dst
0,A,B
1,A,C
2,B,C


In [8]:
# Weight of an edge = number of colours shared by its two endpoints.
# The endpoint columns are paired with zip() instead of a row-wise DataFrame.apply(axis=1):
# apply() on an empty frame returns a DataFrame, which cannot be assigned to a single column.
# `df.at[label, 'Col1']` is a direct scalar lookup that replaces the chained
# `df.loc[label]['Col1']` access.
df1['Weights'] = [
    len(df.at[src, 'Col1'].intersection(df.at[dst, 'Col1']))
    for src, dst in zip(df1['Src'], df1['Dst'])
]
df1


Unnamed: 0,Src,Dst,Weights
0,A,B,1
1,A,C,1
2,B,C,2


In [13]:
# Row positions of the first member of every upper-triangle pair (offset 1 skips the diagonal).
# Computed here directly instead of displaying a bare `i`: the only assignment to `i` sits in a
# cell further down, so the name is undefined when the notebook runs top to bottom.
np.triu_indices(len(df.index), 1)[0]

array([0, 0, 1])

In [11]:
# One set of colours per row, held in an object array so that `&` below is applied pair by pair.
c = df['Col1'].map(set).values

# Positions of every (row, later row) pair: the strict upper triangle of a len(c) x len(c) grid.
i, j = np.triu_indices(len(c), k=1)

pd.DataFrame({
    'Source': df.index[i],
    'Target': df.index[j],
    # Size of each pairwise intersection.
    'Weight': list(map(len, c[i] & c[j])),
})

Unnamed: 0,Source,Target,Weight
0,A,B,1
1,A,C,1
2,B,C,2


## Simple Exercise 2

In [15]:
# Toy input for the second exercise: one group of three fruits per row.
df = pd.DataFrame(
    data=[
        ('Apple', 'Orange', 'Peach'),
        ('Apple', 'Lemon', 'Lime'),
        ('Starfruit', 'Apple', 'Orange'),
    ],
    columns=['Fruit_1', 'Fruit_2', 'Fruit_3'],
)
df

Unnamed: 0,Fruit_1,Fruit_2,Fruit_3
0,Apple,Orange,Peach
1,Apple,Lemon,Lime
2,Starfruit,Apple,Orange


In [21]:
def fast_combinations(row: list, self_loops=False) -> np.ndarray:
    """Return the unique unordered pairs that can be formed from the items of ``row``.

    The full ``row x row`` grid is built with ``np.meshgrid``, each pair is sorted so that
    ``(a, b)`` and ``(b, a)`` collapse into one, and duplicates are removed with ``np.unique``.

    Parameters
    ----------
    row : list
        Items to pair up.
    self_loops : bool, default False
        When truthy, keep pairs whose two members are equal (``(a, a)``); otherwise drop them.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_pairs, 2)`` with one sorted pair per row, in ``np.unique`` order.
        If the pairs cannot be computed (for example the items cannot be ordered against each
        other), the placeholder ``[[None, None]]`` is returned as a plain list instead.
    """
    try:
        # Every ordered (x, y) combination of the row with itself, one pair per line.
        grid = np.array(np.meshgrid(row, row)).T.reshape(-1, 2)
        # Sorting inside each pair makes (a, b) and (b, a) identical before de-duplication.
        comb = np.unique(np.sort(grid), axis=0)
        if not self_loops:
            comb = comb[comb[:, 0] != comb[:, 1]]
        return comb
    except Exception:
        # Deliberate best effort: a row that cannot be processed yields a placeholder pair.
        # `Exception` rather than a bare `except` so KeyboardInterrupt/SystemExit still propagate.
        return [[None, None]]

def get_edgelist(df, **kwargs):
    """Build an edge list with every unordered pair of values that share a row of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        One collection of items per row. WARNING: the frame is modified in place; its original
        columns are replaced by a single ``'combined'`` column holding each row as a list.
        Pass ``df.copy()`` to keep the original frame intact.
    **kwargs
        Forwarded as keyword arguments to ``fast_combinations`` (e.g. ``self_loops=True``).

    Returns
    -------
    pd.DataFrame
        Columns ``ID1`` and ``ID2`` with a fresh 0..n-1 index, one row per pair. A pair that
        occurs in several rows of ``df`` is listed once for each of them. Pairs containing a
        missing value, or the literal string ``'nan'``, are dropped.
    """
    cols = df.columns
    # Snapshot the rows once; iterating this list works for any index, not only 0..n-1 labels.
    rows = df[cols].values.tolist()
    df['combined'] = rows
    # Clear space
    df.drop(cols, axis=1, inplace=True)

    if not rows:
        # np.concatenate refuses an empty sequence, so return an empty edge list directly.
        return pd.DataFrame(columns=['ID1', 'ID2'])

    # kwargs must be unpacked: passing the dict itself would bind it to `self_loops`,
    # where any non-empty dict is truthy regardless of the value the caller asked for.
    arrays = [fast_combinations(row, **kwargs) for row in rows]
    edges = pd.DataFrame(np.concatenate(arrays, axis=0), columns=['ID1', 'ID2'])
    # Map 'nan' strings to a real NaN (unambiguous across pandas versions, unlike value=None)
    # so that dropna() removes them together with the [[None, None]] placeholder rows.
    return edges.replace('nan', np.nan).dropna().reset_index(drop=True)

In [22]:
# Build the edge list. Note that get_edgelist also rewrites `df` in place:
# its columns are replaced by a single 'combined' column.
edgelist = get_edgelist(df)

In [24]:
# `df` no longer has its Fruit_* columns: get_edgelist replaced them with 'combined'.
df

Unnamed: 0,combined
0,"[Apple, Orange, Peach]"
1,"[Apple, Lemon, Lime]"
2,"[Starfruit, Apple, Orange]"


In [23]:
# One row per pair of values found together in a row of `df`;
# a pair present in several rows is repeated.
edgelist

Unnamed: 0,ID1,ID2
0,Apple,Orange
1,Apple,Peach
2,Orange,Peach
3,Apple,Lemon
4,Apple,Lime
5,Lemon,Lime
6,Apple,Orange
7,Apple,Starfruit
8,Orange,Starfruit


## Simple Exercise 3

In [41]:
# Toy input for the third exercise: one (year, value) observation per row.
df2 = pd.DataFrame(
    [
        ('jun2020', 'a'),
        ('jun2020', 'b'),
        ('jun2020', 'c'),
        ('jul2020', 'a'),
        ('jul2020', 'b'),
        ('jul2020', 'c'),
    ],
    columns=['year', 'value'],
)
df2

Unnamed: 0,year,value
0,jun2020,a
1,jun2020,b
2,jun2020,c
3,jul2020,a
4,jul2020,b
5,jul2020,c


In [42]:
from itertools import combinations

def get_combinations(group):
    """Return every unordered pair of entries in ``group['value']`` as a two-column frame.

    Each pair is sorted, so the smaller entry lands in ``node1`` and the larger in ``node2``.
    Pairs are emitted in the order produced by ``itertools.combinations``.
    """
    ordered_pairs = map(sorted, combinations(group['value'].values, 2))
    return pd.DataFrame(list(ordered_pairs), columns=['node1', 'node2'])

# Pair up the values inside each 'year' group; the result is indexed by
# (year, position of the pair within its group) and has columns node1 / node2.
# NOTE: `df2` is rebound here, so re-running this cell on its own operates on the
# already-transformed frame, which no longer has a 'value' column.
df2 = df2.groupby('year').apply(get_combinations)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,node1,node2
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
jul2020,0,a,b
jul2020,1,a,c
jul2020,2,b,c
jun2020,0,a,b
jun2020,1,a,c
jun2020,2,b,c


In [43]:
# Count how many rows carry each (node1, node2) pair; that count becomes the edge weight.
df2 = df2.groupby(['node1', 'node2']).size().reset_index(name='weight')
df2

Unnamed: 0,node1,node2,weight
0,a,b,2
1,a,c,2
2,b,c,2
