In [40]:
# 會用到的套件
import pandas as pd                        
import matplotlib.pyplot as plt             
import matplotlib.font_manager as fm
from matplotlib.colors import Colormap      
from wordcloud import WordCloud             
import jieba   
import jieba.analyse
import re                            
import cn2an    # 阿拉伯數字、中文數字互換
%matplotlib inline

In [41]:
# plt.rcParams['font.sans-serif']=['SimHei'] #使 matplotilb 中文能正常顯示
fprop = fm.FontProperties(fname="./raw_data/SourceHanSansTW-Regular.otf")

# 設定整份 notebook 圖片的長寬與 dpi
plt.rcParams["figure.figsize"] = [6, 4]
plt.rcParams["figure.dpi"] = 150

## 資料前處理
資料來源：
+ TarFlow 蒐集PTT 房屋版 文章
+ 關鍵字：房價
+ 時間： 2024-03-01 ~ 2025-3-01
+ 資料筆數：共 5195 篇文章

資料介紹

In [42]:
df = pd.read_csv('raw_data/ptt_house_price.csv')
df["artDate"] = pd.to_datetime(df["artDate"])
df['artDate'] = df['artDate'].dt.date 

df.head()

Unnamed: 0,system_id,artUrl,artTitle,artDate,artPoster,artCatagory,artContent,artComment,e_ip,insertedDate,dataSource
0,1,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,jump0517,home_sale,連結：\nhttps://money.udn.com/money/story/5621/78...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""fatpigbgman""...",1.168.65.178,2024-03-02 01:42:37,ptt
1,2,https://www.ptt.cc/bbs/home-sale/M.1709251729....,[情報]好房網2月房價,2024-03-01,linlinme5208,home_sale,好房網 重點縣市成交漲跌排行\nhttps://price.housefun.com.tw/...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""ChennBZ"", ""c...",211.20.144.117,2024-03-02 01:42:37,ptt
2,3,https://www.ptt.cc/bbs/home-sale/M.1709255830....,Re:[情報]好房網2月房價,2024-03-01,junior020486,home_sale,你們那個要買新竹熱區的手腳要快\n新竹現在很誇張\n外圍補漲的很兇\n前幾天星XX超開價成交...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""hellogym"", ""...",49.216.24.218,2024-03-02 01:42:37,ptt
3,4,https://www.ptt.cc/bbs/home-sale/M.1709260904....,[閒聊]基隆是台北之外最不可能蓋GG的嗎,2024-03-01,Austenite,home_sale,最近各縣市都被點名要蓋GG\n高雄 台中 嘉義 虎尾\n被點名出列的房價都開始漲\n\n基隆...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""wwrest"", ""cm...",1.163.246.7,2024-03-02 01:42:37,ptt
4,5,https://www.ptt.cc/bbs/home-sale/M.1709263369....,[請益]預售屋可否負擔,2024-03-01,mosfets,home_sale,各位前輩好\n\n小弟目前在竹科工作 覺得新竹房價越來越高 因為工作快三年 有些自備款 有想...,"[{""cmtStatus"": ""推"", ""cmtPoster"": ""JohnLackey"",...",39.9.32.99,2024-03-02 01:42:37,ptt


資料清理

In [None]:
clear_df = df.copy()

#去除本次不會用到的欄位
drop_cols = ['system_id', 'artPoster', 'artCatagory', 'artComment', 'e_ip', 'insertedDate', 'dataSource']
clear_df.drop(drop_cols, axis = 1, inplace = True) # 若inplace=False會回傳新的Dataframe，需要賦值接它

#去除文章內容為空值的筆數
clear_df.dropna(subset = ['artContent'], axis=0, how='any', inplace=True)

#新增['sentence']欄位，用'。'取代'\n\n'，並移除'\n'
clear_df['sentence'] = clear_df['artContent'].str.replace(r'\n\n','。', regex=True)
clear_df['sentence'] = clear_df['sentence'].str.replace(r'\n','，', regex=True)

#移除內文中的網址
clear_df['sentence'] = clear_df['sentence'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)

clear_df.head()

Unnamed: 0,artUrl,artTitle,artDate,artContent,sentence
0,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,連結：， 駁斥養地炒房說。媒體報導，具公股色彩的台灣金聯近年大舉在南台灣與建商競標土地，得標...
1,https://www.ptt.cc/bbs/home-sale/M.1709251729....,[情報]好房網2月房價,2024-03-01,好房網 重點縣市成交漲跌排行\nhttps://price.housefun.com.tw/...,好房網 重點縣市成交漲跌排行， ->區域實價登錄趨勢。2024年。2月，重點縣市實價登錄漲跌...
2,https://www.ptt.cc/bbs/home-sale/M.1709255830....,Re:[情報]好房網2月房價,2024-03-01,你們那個要買新竹熱區的手腳要快\n新竹現在很誇張\n外圍補漲的很兇\n前幾天星XX超開價成交...,你們那個要買新竹熱區的手腳要快，新竹現在很誇張，外圍補漲的很兇，前幾天星XX超開價成交後，環...
3,https://www.ptt.cc/bbs/home-sale/M.1709260904....,[閒聊]基隆是台北之外最不可能蓋GG的嗎,2024-03-01,最近各縣市都被點名要蓋GG\n高雄 台中 嘉義 虎尾\n被點名出列的房價都開始漲\n\n基隆...,最近各縣市都被點名要蓋GG，高雄 台中 嘉義 虎尾，被點名出列的房價都開始漲。基隆484永遠...
4,https://www.ptt.cc/bbs/home-sale/M.1709263369....,[請益]預售屋可否負擔,2024-03-01,各位前輩好\n\n小弟目前在竹科工作 覺得新竹房價越來越高 因為工作快三年 有些自備款 有想...,各位前輩好。小弟目前在竹科工作 覺得新竹房價越來越高 因為工作快三年 有些自備款 有想先上車...


斷句

In [44]:
clear_df['sentence'] = clear_df['sentence'].str.split("[,，。！!？?]{1,}")

sent_df = clear_df.explode('sentence').reset_index(drop=True)

sent_df.head()

Unnamed: 0,artUrl,artTitle,artDate,artContent,sentence
0,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,連結：
1,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,駁斥養地炒房說
2,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,媒體報導
3,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,具公股色彩的台灣金聯近年大舉在南台灣與建商競標土地
4,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,得標後不開發


斷詞

In [45]:
# 初始化斷詞引擎：不一定要設定，只使用內建效果也可以
# 參考網址：https://raw.githubusercontent.com/ldkrsi/jieba-zh_TW/master/jieba/dict.txt
jieba.set_dictionary('dict/dict.txt')

#去除句子中除了字母、數字、空白字符以外的符號
sent_df['sentence'] = sent_df['sentence'].str.replace(r'[^\w\s]+', '', regex=True).astype(str)

# 把長度小於1的sentence刪掉
sent_df = sent_df[sent_df["sentence"].str.len() > 1]

# 更新斷詞字典，"萬元"、"年"、"坪"前面會有數字或文字，形成組合詞，如:30萬元、三十坪...
# 正規表達式匹配
pattern = r'([\d零一二三四五六七八九十百千萬億]+萬元)'
pattern2 = r'([\d零一二三四五六七八九十百]+年)'
pattern3 = r'([\d零一二三四五六七八九十百]+坪)'

# 找出所有符合條件的詞，並存入 set（除去重複詞）
# .sum() -> 把apply()的結果展開
custom_words = set(sent_df["sentence"]
                   .apply(lambda x: re.findall(pattern, x))
                   .sum()).union(set(sent_df["sentence"].apply(lambda x: re.findall(pattern2, x)).sum())).union(set(sent_df["sentence"].apply(lambda x: re.findall(pattern3, x)).sum()))

# 存入斷詞字典
for word in custom_words:
    jieba.add_word(word)

# 斷詞
word_df = sent_df.assign(word = sent_df['sentence'].apply(jieba.lcut)).explode('word').drop(['sentence'], axis=1)

# 去除空值
word_df = word_df.dropna(subset=['word'])

# 把長度小於1的word刪掉
word_df = word_df.loc[word_df['word'].str.len() > 1]

word_df.head()

Building prefix dict from e:\中山資管碩\碩一下學期\社媒\2025-SMA-Study-Group-8\dict\dict.txt ...
Loading model from cache C:\Users\paul\AppData\Local\Temp\jieba.u3c145057bb1ef3916e65da93bba4808e.cache
Loading model cost 0.797 seconds.
Prefix dict has been built successfully.


Unnamed: 0,artUrl,artTitle,artDate,artContent,word
0,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,連結
1,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,駁斥
1,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,養地
1,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,炒房
2,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,媒體


移除停用字

In [46]:
# 以檔案形式匯入通用的停用字檔案
# 將stopwords存成list
stopwords = [line.strip() for line in open('dict/stopwords.txt', 'r', encoding='utf-8').readlines()] 

# 手動加入停用字
stopwords_manual = ["今年", "去年", "近年", "數年", "每年", "半年", "連結", "現在", "目前", "認為", "持續", "一直", "時間", "這種", "表示", "看到", 
                    "知道", "字頭", "根本", "地方", "平均", "區域", "價格", "一堆", "指出", "直接", "萬坪"] 
stopwords.extend(stopwords_manual)

# 去除停用字
noStop_df = word_df[~word_df['word'].isin(stopwords)]

noStop_df.head()

Unnamed: 0,artUrl,artTitle,artDate,artContent,word
1,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,駁斥
1,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,養地
1,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,炒房
2,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,媒體
2,https://www.ptt.cc/bbs/home-sale/M.1709249161....,[新聞]台灣金聯：土地未運用有原因駁斥養地炒,2024-03-01,連結：\nhttps://money.udn.com/money/story/5621/78...,報導


數字格式統一：阿拉伯數字轉成中文數字

In [47]:
noStop_df["word"] = noStop_df["word"].apply(lambda x: cn2an.transform(x, "an2cn"))

noStop_df.describe()

  warn(str(e))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noStop_df["word"] = noStop_df["word"].apply(lambda x: cn2an.transform(x, "an2cn"))


Unnamed: 0,artUrl,artTitle,artDate,artContent,word
count,633224,633224,633224,633224,633224
unique,4993,4130,364,4993,59554
top,https://www.ptt.cc/bbs/home-sale/M.1730888537....,Re:[轉錄]發生戰事你各位房貸都不用還了還不,2024-09-21,作者: monnom (桂) 看板: Gossiping\n標題: [新聞] 鳥籠小宅時代來...,房價
freq,2303,2701,7295,2303,9355
