In [None]:
# Mount Google Drive at /content/drive so the shared-drive paths used below can be opened.
# This import is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import networkx as nx
import community.community_louvain as cl

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.cm as cm

import numpy as np
import pandas as pd
import copy
import pickle

import time
from tqdm import tqdm

# Graph 불러오기

`wheat`는 연도(문자열, 예: `'2021'`)를 key로, 해당 연도의 전체 무역 네트워크를 나타내는 그래프(`networkx.DiGraph`)를 value로 갖는 dictionary이다.

In [None]:
# Load the pickled `wheat` object (indexed by year string below, e.g. wheat['2021']).
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('/content/drive/Shareddrives/22-1 데이터마이닝/TermProject/Louvain_Analysis/pickles/wheat.pkl', 'rb') as f:
  wheat = pickle.load(f)

In [None]:
# Inspect the type of the object stored under the '2021' key.
type(wheat['2021'])

networkx.classes.digraph.DiGraph

# Louvain community analysis (for all Countries)

community 탐지 방법으로 연결 밀도가 높은 집단끼리 서로 묶어서 분석하는 방법

일반적인 community detection의 경우 modularity 계산 시간이 오래 걸리기 때문에 실제로 다른 시각화 알고리즘을 구현하는 데 상당한 시간이 걸렸다.

Louvain Community Analysis의 장점은 바로 빠른 속도에 있다.

## Sub Group to Images

Louvain 알고리즘에 따라 주어진 네트워크를 군집화해본다.

In [None]:
# Install python-louvain (the package behind `community.community_louvain`, imported above).
# NOTE(review): this cell runs after the import cell; on a fresh kernel without the
# package the import fails before this line is reached — consider moving it to the top.
!pip install python-louvain

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
def Louvain(G, weight='weight', random_state=2022):
    """Run Louvain community detection on the undirected version of ``G``.

    Parameters
    ----------
    G : networkx graph
        Network to cluster (the per-year graphs in this notebook are ``DiGraph``).
    weight : str, optional
        Edge attribute forwarded to ``best_partition`` as the edge weight.
        Defaults to ``'weight'``, which is what the library uses when nothing
        is passed, so existing calls behave exactly as before.
        NOTE(review): the plots in this notebook read the edge attribute
        'Trade Value (US$)'; confirm whether the partition should use it too.
    random_state : int, optional
        Seed forwarded to ``best_partition`` so repeated calls are reproducible.

    Returns
    -------
    Graph : networkx graph
        The undirected graph obtained from ``nx.to_undirected(G)``.
    partition : dict
        Mapping of node -> community id returned by ``best_partition``.
    """
    # Community detection is performed on the undirected view of the network.
    Graph = nx.to_undirected(G)
    partition = cl.best_partition(Graph, weight=weight, random_state=random_state)

    # The earlier version also filled a fixed list of 10 buckets that was never
    # used; it raised IndexError as soon as more than 10 communities were found.
    return Graph, partition

def Louvain_Plot(Graph, partition, year,
                 save_dir="/content/drive/Shareddrives/22-1 데이터마이닝/TermProject/Louvain_Analysis/clusterIMG"):
    """Draw the network coloured by community and save the figure as a PNG.

    Parameters
    ----------
    Graph : networkx.DiGraph
        Network to draw. It must be directed because node sizes are taken from
        ``Graph.out_degree``; every edge must carry a 'Trade Value (US$)' attribute.
    partition : dict
        Mapping of node -> community id (e.g. the second value returned by ``Louvain``).
        Every key must be a node of ``Graph``.
    year : int or str
        Used in the figure title and in the output file name.
    save_dir : str, optional
        Directory the image is written to as ``<save_dir>/<year>_louvain_img.png``.
        Defaults to the shared-drive folder used by this notebook.
    """
    weight_key = 'Trade Value (US$)'

    pos = nx.kamada_kawai_layout(Graph, scale=10)
    # plt.get_cmap is used because matplotlib.cm.get_cmap is deprecated and no
    # longer available in recent Matplotlib releases.
    cmap = plt.get_cmap('plasma', max(partition.values()) + 1)

    # Build node list, colours and sizes from the same ordering so that each
    # node is guaranteed to get its own colour and size, independent of the
    # iteration order of Graph versus partition.
    nodelist = list(partition)
    node_colors = [partition[node] for node in nodelist]
    out_strength = dict(Graph.out_degree(weight=weight_key))
    sizes = np.array([out_strength[node] for node in nodelist]) + 1  # +1 keeps zero-export nodes visible

    # Edge widths follow the default edge order used by draw_networkx_edges.
    weights = np.array([Graph[u][v][weight_key] for u, v in Graph.edges()])

    # Visualization
    plt.figure(figsize=(20, 16))
    im = nx.draw_networkx_nodes(Graph, pos, nodelist,
                                node_size=np.power(sizes, 1/4),  # fourth root compresses the value range
                                cmap=cmap, node_color=node_colors)
    nx.draw_networkx_edges(Graph, pos, width=np.sqrt(weights)/1e4, alpha=0.4)
    nx.draw_networkx_labels(Graph, pos, font_size=8, font_color="white",
                            bbox={'facecolor':'black','edgecolor':'black','boxstyle':'round','alpha':0.5})
    plt.colorbar(im)
    plt.title('Graph of {}'.format(year))
    plt.savefig("{}/{}_louvain_img.png".format(save_dir, year),
                bbox_inches='tight'  # trim surrounding whitespace
                )


분석은 2011년부터 2021년에 대해 진행해보고자 한다.

In [None]:
startYear = 2011
endYear = 2021

# For each year: detect communities, then draw and save the coloured network.
# The plot receives the original directed graph because Louvain_Plot reads out_degree.
for year in tqdm(range(startYear, endYear + 1),
                 desc="Louvain cluster image making",
                 mininterval=0.1):
    yearly_graph = wheat[str(year)]
    G, partition = Louvain(yearly_graph)
    Louvain_Plot(yearly_graph, partition, year)


Output hidden; open in https://colab.research.google.com to view.

## Making Sub Group Dataset

In [None]:
partition = {}
part_group = {}

# Store the node -> community mapping of every year under its year string.
for year in range(startYear, endYear + 1):
    year_key = str(year)
    partition[year_key] = Louvain(wheat[year_key])[1]

In [None]:
partial_groups = {}

for year in range(startYear, endYear + 1):
    year_key = str(year)
    groups = partial_groups[year_key] = {}

    # Collect the members of every community in one pass over the partition.
    members_by_id = {}
    for country, group_id in partition[year_key].items():
        members_by_id.setdefault(group_id, []).append(country)

    # Emit groups '0', '1', ... and stop at the first id that does not occur.
    i = 0
    while i in members_by_id:
        groups[str(i)] = members_by_id[i]
        i += 1

    print("{}년의 그룹 수 :".format(year) + str(i))
    groups['group_number'] = i

# Community ids in `partition` are plain ints, e.g. type(partition['2012']['Bangladesh']) is int.

2011년의 그룹 수 :5
2012년의 그룹 수 :6
2013년의 그룹 수 :5
2014년의 그룹 수 :6
2015년의 그룹 수 :5
2016년의 그룹 수 :5
2017년의 그룹 수 :5
2018년의 그룹 수 :6
2019년의 그룹 수 :4
2020년의 그룹 수 :4
2021년의 그룹 수 :6


In [None]:
# Sanity check: 'group_number' stores the number of groups as an int,
# and key '1' stores the list of members assigned to community 1.
print(type(partial_groups['2020']['group_number']))
print(partial_groups['2020']['1'])

<class 'int'>
['United Arab Emirates', 'Rep. of Moldova', 'Argentina', 'Cambodia', 'Ecuador', 'Ethiopia', 'Indonesia', 'Malaysia', 'New Zealand', 'Philippines', 'Rep. of Korea', 'Thailand', 'Viet Nam', 'Australia', 'Bahrain', 'Bangladesh', 'Brunei Darussalam', 'China', 'China, Hong Kong SAR', 'Egypt', 'Fiji', 'French Polynesia', 'India', 'Japan', 'Kuwait', 'Myanmar', 'New Caledonia', 'Oman', 'Other Asia, nes', 'Pakistan', 'Papua New Guinea', 'Qatar', 'Singapore', 'Solomon Isds', 'Sri Lanka', 'Yemen', 'Ukraine', 'Seychelles', "Côte d'Ivoire", 'Bhutan', 'Equatorial Guinea', 'Madagascar', 'Maldives', 'Sierra Leone', 'Somalia', 'Cook Isds', "Lao People's Dem. Rep.", 'Comoros', 'Eritrea']


wheat_data 만들어주기

In [None]:
# 전처리 완료했던 matrix를 불러옴
wheat_df = pd.read_csv("/content/drive/Shareddrives/22-1 데이터마이닝/TermProject/Data/comtradedata/df_2006_2021_1001.csv", index_col=False)
wheat_df.drop(['Unnamed: 0'], inplace=True, axis=1)
wheat_df.head()

Unnamed: 0,Year,From,To,Trade Value (US$)
0,2006,Algeria,Egypt,5452264
1,2006,Andorra,Lebanon,1
2,2006,"Areas, nes",Aruba,14580
3,2006,"Areas, nes",Côte d'Ivoire,339
4,2006,"Areas, nes",Ireland,1575677


In [None]:
wheat_data = {}

startYear = 2011
endYear = max(wheat_df['Year'])
print(f'Dataset from {startYear} to {endYear}\n')

# One independent copy of the rows of each year, keyed by the year as a string.
for year in range(startYear, endYear + 1):
    rows_of_year = wheat_df.loc[wheat_df['Year'] == year].copy()
    wheat_data[str(year)] = rows_of_year
    print(year, '\t', rows_of_year.shape)

wheat_data['2017']  # example

Dataset from 2011 to 2021

2011 	 (1665, 4)
2012 	 (1630, 4)
2013 	 (1697, 4)
2014 	 (1690, 4)
2015 	 (1849, 4)
2016 	 (2193, 4)
2017 	 (2257, 4)
2018 	 (2164, 4)
2019 	 (2219, 4)
2020 	 (2107, 4)
2021 	 (1553, 4)


Unnamed: 0,Year,From,To,Trade Value (US$)
20532,2017,Afghanistan,Slovakia,7
20533,2017,Albania,France,88214
20534,2017,Algeria,France,16745
20535,2017,Algeria,Switzerland,374
20536,2017,Algeria,United Arab Emirates,14
...,...,...,...,...
22784,2017,Yemen,United Arab Emirates,20774
22785,2017,Zambia,Malawi,3139
22786,2017,Zimbabwe,"Areas, nes",5280000
22787,2017,Zimbabwe,South Africa,19


In [None]:
# Build, for every year and every community, the trade rows whose exporter
# ('From') and importer ('To') both belong to that community.
#
# Results (all nested dictionaries keyed by year string):
#   df[year][str(i)]              - copy of the full table of that year (temporary usage)
#   true_idx[year][i]             - index labels of the rows kept for group i (int key)
#   sub_group_wheat[year][str(i)] - the filtered rows of group i
df = {} # temporary usage
sub_group_wheat = {}
true_idx = {}

for year in tqdm(range(startYear, endYear+1), desc="sub group에 대한 DF정보 담긴 Nested Dictionary 생성"):
    year_key = str(year)
    trades_of_year = wheat_data[year_key]

    # Number of groups detected for this year
    max_iter = partial_groups[year_key]['group_number']

    df[year_key] = {}
    sub_group_wheat[year_key] = {}
    true_idx[year_key] = {}

    for i in range(max_iter):
        group_key = str(i)
        members = partial_groups[year_key][group_key]

        df[year_key][group_key] = trades_of_year.copy()

        # Vectorised membership test replaces the row-by-row iterrows() scan:
        # keep a row only when both endpoints are members of the group.
        inside_group = trades_of_year['From'].isin(members) & trades_of_year['To'].isin(members)

        true_idx[year_key][i] = trades_of_year.index[inside_group].tolist()
        # .copy() makes the sub-frame independent of the yearly table.
        sub_group_wheat[year_key][group_key] = trades_of_year.loc[inside_group].copy()


sub group에 대한 DF정보 담긴 Nested Dictionary 생성: 100%|██████████| 11/11 [00:12<00:00,  1.13s/it]


In [None]:
# 2021 trade rows whose 'From' and 'To' countries both belong to group 1.
sub_group_wheat['2021']['1']

Unnamed: 0,Year,From,To,Trade Value (US$)
29284,2021,"Areas, nes",Armenia,120460
29286,2021,"Areas, nes",Croatia,16197
29287,2021,"Areas, nes",Netherlands,9286297
29288,2021,"Areas, nes",Rep. of Moldova,175140
29303,2021,Armenia,Georgia,1192053
...,...,...,...,...
30479,2021,Spain,Iraq,575277
30480,2021,Spain,Italy,15657312
30482,2021,Spain,Luxembourg,127413
30485,2021,Spain,Netherlands,1299462


In [None]:
# 2021 trade rows whose 'From' and 'To' countries both belong to group 2.
sub_group_wheat['2021']['2']

Unnamed: 0,Year,From,To,Trade Value (US$)
29282,2021,Algeria,United Kingdom,1622
29384,2021,Belarus,Denmark,277300
29385,2021,Belarus,Kazakhstan,6100
29386,2021,Belarus,Lithuania,8318203
29387,2021,Belarus,Norway,1573400
...,...,...,...,...
30804,2021,United Kingdom,Poland,2931
30805,2021,United Kingdom,Portugal,16511036
30808,2021,United Kingdom,Saudi Arabia,5904
30811,2021,United Kingdom,South Africa,1239


## to pickle

In [None]:
# # Save the sub_group_wheat object, which holds the data of each cluster, so it can be reloaded later (currently disabled)
# with open('/content/drive/Shareddrives/22-1 데이터마이닝/TermProject/Louvain_Analysis/pickles/sub_group_wheat.pkl', 'wb') as f:
#   pickle.dump(sub_group_wheat, f)

In [None]:
# # Save the partial_groups dictionary (group members per year) so it can be reloaded later (currently disabled)
# with open('/content/drive/Shareddrives/22-1 데이터마이닝/TermProject/Louvain_Analysis/pickles/partial_groups.pkl', 'wb') as f:
#   pickle.dump(partial_groups, f)