# Handling JSON Data (Basic)

- 주제: 크롤링된 인스타그램(Instagram) 데이터를 정제해 보자.
- 작성 날짜: 2018-07-09
- 수정 날짜: 2018-07-09
- 작성자: 부현경 (hyunkyung.boo@gmail.com)

In [1]:
import json 
import pandas as pd 
from pandas.io.json import json_normalize #package for flattening json in pandas df
import os
import re


# Collect the list of files to load
def getPath(file_path, sign):
    """Return the paths of all files under ``file_path`` whose extension is ``sign``.

    The directory tree rooted at ``file_path`` is walked recursively.

    Args:
        file_path: Root directory to search.
        sign: File extension to match, including the leading dot
            (e.g. ``'.json'``). The comparison is case-sensitive.

    Returns:
        A list of path strings, one per matching file, in ``os.walk`` order.
        Empty when nothing matches or the directory does not exist.
    """
    file_path_list = []
    # Walk the directory passed in by the caller (not a module-level global),
    # and keep the loop variable distinct from the parameter.
    for dir_path, _dir_names, files in os.walk(file_path):
        for filename in files:
            if os.path.splitext(filename)[-1] == sign:
                # os.path.join uses the platform separator, so the result is
                # a usable path on every OS (backslash on Windows as before).
                file_path_list.append(os.path.join(dir_path, filename))
    return file_path_list
            

# Extract the search-tag name from a file path
def getKey(string):
    """Return the search tag encoded in a JSON file path.

    The tag is the file name without its directory part and without a
    trailing ``.json`` extension.

    Args:
        string: Path to a ``<tag>.json`` file. Both backslash and forward
            slash are accepted as directory separators.

    Returns:
        The bare file name with a trailing ``.json`` removed.
    """
    # Normalise separators so paths built on any platform are handled.
    file_name = string.replace('/', '\\').rsplit('\\', 1)[-1]
    # Strip only the extension; a '.json' elsewhere in the name is kept.
    if file_name.endswith('.json'):
        file_name = file_name[:-len('.json')]
    return file_name


# Pick out only the wanted key-value pairs from a dictionary
def extractItems(dict_data, keys):
    """Return a new dict holding the entries of ``dict_data`` whose key is in ``keys``.

    Keys listed in ``keys`` but absent from ``dict_data`` are ignored, and
    ``dict_data`` itself is left unmodified. Entries keep the order they
    have in ``dict_data``.
    """
    return {
        name: value
        for name, value in dict_data.items()
        if name in keys
    }

In [None]:
# Root directory that is searched recursively for crawled "<tag>.json" files.
path = "C:\\Users\\User\\Desktop\\InstagramCroller"
json_path_list = getPath(path, '.json')

# Flattened json_normalize column -> output column name. Columns are selected
# and renamed by label so the result does not depend on the column order that
# json_normalize happens to produce.
column_map = {
    'node.display_url': 'post_url',
    'node.edge_liked_by.count': 'like_count',
    'node.edge_media_to_caption.edges': 'text',
    'node.edge_media_to_comment.count': 'comment_count',
    'node.id': 'post_id',
    'node.owner.id': 'user_id',
    'node.taken_at_timestamp': 'time',
}

# Placeholder stored in the 'tags' column when a post has no hashtag.
NO_TAG = "null+nan+none"
# Compiled once; captures the word characters that follow each '#'.
TAG_PATTERN = re.compile(r'#(\w+)')


def _extract_tags(caption_edges):
    """Return the space-joined hashtags of one post, or NO_TAG if it has none.

    ``caption_edges`` is one cell of the 'text' column: a list of
    ``{'node': {'text': ...}}`` dicts. A missing or empty cell counts as
    "no caption". Tags from every edge are merged so that each post yields
    exactly one entry.
    """
    if not isinstance(caption_edges, list):
        return NO_TAG
    tags = []
    for edge in caption_edges:
        tags.extend(TAG_PATTERN.findall(str(edge['node']['text'])))
    return ' '.join(tags) if tags else NO_TAG


frames = []
tag_count_list = []
for json_path in json_path_list:
    # 1. Load the JSON file (the handle is only needed while parsing).
    with open(json_path, encoding='UTF8') as f:
        data = json.load(f)

    # 2. Keep only the parts we use: the total post count and the post list.
    j1 = extractItems(data, ['count', 'edges'])

    # 3. Flatten the nested post records into a table.
    j2 = json_normalize(j1['edges'])

    # 4. Keep the needed columns (absent ones become NaN, which also covers
    #    a file with no posts) and give them short names.
    j2 = j2.reindex(columns=list(column_map)).rename(columns=column_map)

    # 5./6. Extract hashtags from each caption into a new 'tags' column,
    #       exactly one value per row.
    j2['tags'] = [_extract_tags(row) for row in j2['text']]

    tag_count_list.append([getKey(json_path), j1['count']])
    if not j2.empty:
        frames.append(j2)

# Concatenate once at the end instead of growing a DataFrame inside the loop.
if frames:
    All_df = pd.concat(frames)
else:
    All_df = pd.DataFrame(columns=list(column_map.values()) + ['tags'])

searching_tag_df = pd.DataFrame(tag_count_list, columns=['tag', 'count'])
print(searching_tag_df, searching_tag_df.shape)
print(All_df.tail(10),  All_df.shape)

# Convert the unix timestamp (seconds) to a datetime value.
if not All_df.empty:
    All_df['time'] = pd.to_datetime(All_df['time'], unit='s')

In [5]:
# Save the data: one sheet with every post, one with the per-tag totals.
save_path = "D:\\defined_json_insta.xlsx"
# The context manager writes and closes the workbook on exit (also when an
# exception occurs), replacing the explicit writer.save() call.
with pd.ExcelWriter(save_path) as writer:
    All_df.to_excel(writer, sheet_name='combine_all_tags', header=True, index=False)
    searching_tag_df.to_excel(writer, sheet_name='tag+total_count', header=True, index=False)