In [1]:
import glob
import os

import pandas as pd

In [None]:
# Load the review data; the code below relies on its 'language' and
# 'user_id' columns.
df = pd.read_csv('gmap838_reviews.csv')

# Write the per-language files into the folder that the statistics cell
# globs ("./CSV/user_count/*.csv"), so a Restart & Run All works without
# moving files by hand. The folder is created if it does not exist yet.
output_dir = "./CSV/user_count"
os.makedirs(output_dir, exist_ok=True)

# One group per language (rows whose language is missing are dropped by groupby).
grouped = df.groupby('language')

for language, group in grouped:
    # Number of comments per user, as a two-column frame.
    user_counts = (
        group['user_id']
        .value_counts()
        .rename_axis('user_id')
        .reset_index(name='ncomments')
    )

    # Most active users first.
    user_counts_sorted = user_counts.sort_values(by='ncomments', ascending=False)

    # One CSV per language: "<language>_user_comment_counts.csv".
    output_filename = os.path.join(output_dir, f"{language}_user_comment_counts.csv")
    user_counts_sorted.to_csv(output_filename, index=False)


In [1]:
import pandas as pd
import glob
import os

def ncomments_at_ranks(ncomments, ranks):
    """Return the comment count held by the user at each 1-based rank.

    Args:
        ncomments: pandas Series of per-user comment counts, in any order.
        ranks: iterable of 1-based positions in the descending ordering.

    Returns:
        dict mapping each rank to an int count. The count is 0 when the
        series has fewer than `rank` entries or the value at that position
        is missing.
    """
    # Descending order; sort_values places missing values last by default.
    ordered = ncomments.sort_values(ascending=False)
    counts_by_rank = {}
    for rank in ranks:
        value = ordered.iloc[rank - 1] if len(ordered) >= rank else None
        counts_by_rank[rank] = 0 if pd.isna(value) else int(value)
    return counts_by_rank


# Glob pattern for the per-language "<language>_user_comment_counts.csv" files.
file_path = "./CSV/user_count/*.csv"

# Sorted so the row order of the result does not depend on the filesystem.
files = sorted(glob.glob(file_path))

# Ranks to report and the column label used for each of them.
ranks = [1, 25, 50, 100]
rank_labels = {1: "1st", 25: "25th", 50: "50th", 100: "100th"}

# One record per language file.
stats = []
for file in files:
    filename = os.path.basename(file)
    # Splitting from the right strips the "_user_comment_counts.csv" suffix
    # and keeps any underscore that is part of the language code itself.
    language = filename.rsplit("_", 3)[0]
    # usecols makes read_csv fail loudly if either expected column is absent.
    user_counts = pd.read_csv(file, usecols=['user_id', 'ncomments'])
    values = ncomments_at_ranks(user_counts['ncomments'], ranks)
    stats.append({"language": language, **{rank_labels[rank]: values[rank] for rank in ranks}})

# Explicit columns keep the frame well-formed even when no file matched,
# so the integer cast below cannot fail on missing columns.
stats_df = pd.DataFrame(stats, columns=["language", *rank_labels.values()])
stats_df = stats_df.astype({label: "int" for label in rank_labels.values()})

output_filename = "language_comment_statistics.csv"
if files:
    # Save the summary and show it.
    stats_df.to_csv(output_filename, index=False)
    print(stats_df)
    print(f"統計結果已輸出至 {output_filename}")
else:
    # Nothing to summarise: leave any existing output file untouched.
    print(f"找不到符合 {file_path} 的檔案，未輸出統計結果")

   language  1st  25th  50th  100th
0        af    1     0     0      0
1        ar    7     1     1      1
2        az    1     0     0      0
3        bg    2     1     0      0
4        bn    1     0     0      0
..      ...  ...   ...   ...    ...
94       yi    2     0     0      0
95       yo    1     0     0      0
96  zh-Hant  293   142   127    106
97       zh   71    29    26     22
98       zu    1     0     0      0

[99 rows x 5 columns]
統計結果已輸出至 language_comment_statistics.csv
