In [1]:
import os
import json

# Step 1: List all JSON files in the folder
folder_path = r"E:\COLLEGE\PROJECT\designer-fashion-industry-analysis\temp_crawling_results"
json_files = [file for file in os.listdir(folder_path) if file.endswith('.json')]

# Step 2: Merge all JSON files into one dictionary
merged_data = {}
for file in json_files:
    with open(os.path.join(folder_path, file), 'r') as f:
        data = json.load(f)
        for key, value in data.items():
            if key in merged_data:
                merged_data[key].append(value)
            else:
                merged_data[key] = [value]

# Step 3: Save merged data to a new JSON file
output_folder = r'E:\COLLEGE\PROJECT\designer-fashion-industry-analysis\Crawling\data_for_analysis'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
output_file_path = os.path.join(output_folder, 'detail_info_all.json')
with open(output_file_path, 'w') as f:
    json.dump(merged_data, f, indent=4)

print("Merged data saved to:", output_file_path)

Merged data saved to: E:\COLLEGE\PROJECT\designer-fashion-industry-analysis\Crawling\data_for_analysis\detail_info_all.json


In [2]:
import os
import pandas as pd
import json

# Step 1: List all JSON files in the folder
folder_path = r"E:\COLLEGE\PROJECT\designer-fashion-industry-analysis\temp_crawling_results"
json_files = [file for file in os.listdir(folder_path) if file.endswith('.json')]

# Step 2: Extract all unique keys from product_info dictionaries
all_keys = set()
for file in json_files:
    with open(os.path.join(folder_path, file), 'r') as f:
        data = json.load(f)
        all_keys.update([key.strip() for key in data['product_info'].keys()])

all_keys = list(all_keys)

rows = []

# Step 4: Fill in list with data from JSON files
for file in json_files:
    with open(os.path.join(folder_path, file), 'r') as f:
        row = {}
        data = json.load(f)
        product_info = data['product_info']
        current_prod_info_keys = [key.strip() for key in product_info.keys()]
        for key in all_keys:
            if key in current_prod_info_keys:
                row[key] = product_info[key]
            else:
                row[key] = None
            
        rows.append(row)
            

# Step 5: Create DataFrame from the list of rows
product_info_df = pd.DataFrame(rows)
output_folder = r'E:\COLLEGE\PROJECT\designer-fashion-industry-analysis\Crawling\data_for_analysis'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
excel_file_path = os.path.join(output_folder, 'product_info.xlsx')
product_info_df.to_excel(excel_file_path, index=False, sheet_name='product_info')
product_info_df.head(3)

Unnamed: 0,Brand,Dress/Skirt Style,Dress/Skirt Length,KÍCH THƯỚC,Crossed Price,Cropped Top,Description,Country of Origin,MÀU SẮC,Stock,...,Sold,Image List,Sleeve Length,Pattern,Date Crawled,Neckline,Kích Thước,Platform,Quantity,Percent Discount
0,ELISE,,,,₫1.798.000,,"Sản phẩm được sử dụng chất liệu cao cấp, độ bề...",Vietnam,,62,...,8,{'001_Aó khoác đen trần bông quả trám phối lụa...,,,2024-04-19,,"[L, M, S, XL]",Shopee,"[Quantity 1, 62 pieces available]",60% OFF
1,ELISE,,,,₫898.000,,"Sản phẩm được sử dụng chất liệu cao cấp, độ bề...",Vietnam,,117,...,20,{'001_Body suit trắng khóa cổ thiết kế Elise F...,Short Sleeves,,2024-04-19,,"[L, M, S, XL]",Shopee,"[Quantity 1, 117 pieces available]",60% OFF
2,ELISE,,,,₫898.000,,"Sản phẩm được sử dụng chất liệu cao cấp, độ bề...",Vietnam,,168,...,0,{'001_Body suit vàng dài tay xoắn ngực thiết k...,Long Sleeves,Plain,2024-04-19,,"[L, M, S, XL]",Shopee,"[Quantity 1, 168 pieces available]",60% OFF


In [3]:
product_info_df.columns

Index(['Brand', 'Dress/Skirt Style', 'Dress/Skirt Length', 'KÍCH THƯỚC',
       'Crossed Price', 'Cropped Top', 'Description', 'Country of Origin',
       'MÀU SẮC', 'Stock', 'Bottoms Length', 'Bottoms Fit Type', 'Ships From',
       'Rating Count', 'Màu Sắc', 'Average Rating', 'Bottoms Style',
       'Current Price', 'Category', 'Product Title', 'Material', 'Sold',
       'Image List', 'Sleeve Length', 'Pattern', 'Date Crawled', 'Neckline',
       'Kích Thước', 'Platform', 'Quantity', 'Percent Discount'],
      dtype='object')

In [4]:
product_info_df.columns

Index(['Brand', 'Dress/Skirt Style', 'Dress/Skirt Length', 'KÍCH THƯỚC',
       'Crossed Price', 'Cropped Top', 'Description', 'Country of Origin',
       'MÀU SẮC', 'Stock', 'Bottoms Length', 'Bottoms Fit Type', 'Ships From',
       'Rating Count', 'Màu Sắc', 'Average Rating', 'Bottoms Style',
       'Current Price', 'Category', 'Product Title', 'Material', 'Sold',
       'Image List', 'Sleeve Length', 'Pattern', 'Date Crawled', 'Neckline',
       'Kích Thước', 'Platform', 'Quantity', 'Percent Discount'],
      dtype='object')

In [5]:
product_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 488 entries, 0 to 487
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Brand               488 non-null    object
 1   Dress/Skirt Style   28 non-null     object
 2   Dress/Skirt Length  80 non-null     object
 3   KÍCH THƯỚC          1 non-null      object
 4   Crossed Price       484 non-null    object
 5   Cropped Top         11 non-null     object
 6   Description         488 non-null    object
 7   Country of Origin   488 non-null    object
 8   MÀU SẮC             1 non-null      object
 9   Stock               488 non-null    object
 10  Bottoms Length      17 non-null     object
 11  Bottoms Fit Type    2 non-null      object
 12  Ships From          488 non-null    object
 13  Rating Count        275 non-null    object
 14  Màu Sắc             486 non-null    object
 15  Average Rating      275 non-null    object
 16  Bottoms Style       8 non-

In [6]:
product_info_df[['Product Title', 'Current Price']][product_info_df['Current Price'].isnull()]

Unnamed: 0,Product Title,Current Price


In [7]:
product_info_df['Sold'] = product_info_df['Sold'].astype(int)

In [8]:
product_info_df['Sold'].value_counts(bins=10)

(-0.163, 16.2]    347
(16.2, 32.4]       53
(32.4, 48.6]       34
(48.6, 64.8]       14
(81.0, 97.2]       13
(64.8, 81.0]        8
(97.2, 113.4]       8
(145.8, 162.0]      6
(113.4, 129.6]      5
(129.6, 145.8]      0
Name: count, dtype: int64

In [9]:
product_info_df['Pattern'].value_counts()

Pattern
Plain                         66
Floral                        41
Striped                       14
Polka Dotted                   8
Checkered / Plaid              4
Checkered / Plaid, Striped     2
Plain, Striped                 2
Floral, Plain                  2
Polka Dotted, Striped          1
Print                          1
Floral, Print                  1
Name: count, dtype: int64

In [10]:
import re

product_info_df['Current Price'] = product_info_df['Current Price'].fillna('')
product_info_df['Current Price'] = product_info_df['Current Price'].str.replace(r'₫','').str.replace('.','')
product_info_df['Current Price'] = pd.to_numeric(product_info_df['Current Price'])

In [11]:
numbers_intervals = []
display(product_info_df['Current Price'].value_counts(bins=6).sort_index())
print(len(product_info_df))

(197399.69900000002, 499250.0]    130
(499250.0, 799300.0]              249
(799300.0, 1099350.0]              86
(1099350.0, 1399400.0]             19
(1399400.0, 1699450.0]              1
(1699450.0, 1999500.0]              3
Name: count, dtype: int64

488


In [12]:
numbers_intervals = []
display(product_info_df['Sold'].value_counts(bins=7).sort_index())
print(len(product_info_df))

(-0.163, 23.143]      375
(23.143, 46.286]       57
(46.286, 69.429]       19
(69.429, 92.571]       14
(92.571, 115.714]      13
(115.714, 138.857]      4
(138.857, 162.0]        6
Name: count, dtype: int64

488


In [13]:
product_info_df['lowest_cat'] = product_info_df['Category'].str.split('>').str.get(-1)
num_products_each_cat = product_info_df.groupby(['Category','lowest_cat']).size().reset_index()
num_products_each_cat.columns = ['Category','lowest cat', 'prod num']
display(num_products_each_cat)
num_products_each_cat.to_excel('..\data_for_analysis\Lượng sản phẩm mỗi Category.xlsx',sheet_name='prods each cat', index=False, header=True)

Unnamed: 0,Category,lowest cat,prod num
0,Shopee>Women Clothes>Dresses,Dresses,132
1,"Shopee>Women Clothes>Jackets, Coats & Vests>Capes",Capes,1
2,"Shopee>Women Clothes>Jackets, Coats & Vests>Ja...",Jackets,4
3,"Shopee>Women Clothes>Jackets, Coats & Vests>Vests",Vests,34
4,"Shopee>Women Clothes>Jackets, Coats & Vests>Wi...",Winter Jackets & Coats,2
5,"Shopee>Women Clothes>Jumpsuits, Playsuits & Ov...",Jumpsuits,9
6,Shopee>Women Clothes>Pants & Leggings>Pants,Pants,45
7,Shopee>Women Clothes>Sets>Individual Sets,Individual Sets,11
8,Shopee>Women Clothes>Shorts>Shorts,Shorts,27
9,Shopee>Women Clothes>Shorts>Skorts,Skorts,1
