# Technical Paper:
# Text Mining and Document Classification Workflows for Chinese Administrative Documents

## File 3 - Vectorization (fastText)

This file tokenizes the data with Meta's fastText embeddings.

## 1. Load packages and data

In [1]:
import os # connection to OS
import pandas as pd # data manipulation
import numpy as np
from ast import literal_eval # 

# Fasttext model
import gensim
from gensim.models import FastText

Python version: 3.8.18 (in Windows Subsystem for Linux)

In [2]:
# versions
np.__version__, pd.__version__

('1.24.3', '2.0.3')

In [4]:
print("Gensim version:", gensim.__version__)

Gensim version: 4.3.2


In [None]:
# set working directory (for Windows Subsystem for Linux)
os.chdir("working_directory_path")

In [6]:
# load the data
X_toksen_train = pd.read_csv('./y_broad/X_toksen_train.csv', converters={'tokenized_sen_filtered': literal_eval})
X_toksen_test = pd.read_csv('./y_broad/X_toksen_test.csv', converters={'tokenized_sen_filtered': literal_eval})
data_unlabelled = pd.read_csv('./data_unlabelled_tok_fil.csv', converters={'tokenized_sen_filtered': literal_eval})

## 2. Set up fastText model

Load fastText Model:

In [7]:
# set the path
embeddings_path1 = "/mnt/d/13b_language_models/cc.zh.300.bin"

fasttext_model = FastText.load_fasttext_format(embeddings_path1)
DeprecationWarning: Call to deprecated `load_fasttext_format` (use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model (to continue training with the loaded full model, more RAM) instead).

In [8]:
# load the model / the vectors
fasttext_model = FastText.load_fasttext_format(embeddings_path1)

  fasttext_model = FastText.load_fasttext_format(embeddings_path1)


In [12]:
type(fasttext_model)

gensim.models.fasttext.FastText

In [13]:
type(X_toksen_train['tokenized_sen_filtered'])

pandas.core.series.Series

In [9]:
# Define a function to calculate the mean embedding for a list of words
def calculate_mean_embedding(word_list, model):
    return np.mean([model.wv[word] for word in word_list if word in model.wv], axis=0)

## 3. Convert data to fastText embeddings
### 3.1 Training data

In [15]:
X_toksen_train['tokenized_sen_filtered']

0        [加强, 部门, 间, 工作, 协同, 全面, 对接, 社会, 救助, 经办, 服务, 各地...
1        [按规定, 定期, 社会, 公布, 基金, 收支, 情况, 参合, 人员, 待遇, 享受, ...
2        [事实, 无人, 抚养, 儿童, 监护人, 受, 监护人, 委托, 近亲属, 填写, 事实,...
3                    [慢性病, 种, 补偿, 名录, 呼吸系统, 慢性, 支气管炎, 肺气肿]
4        [市, 外, 省内, 定点, 医疗机构, 住院, 医疗, 待遇, 起付, 标准, 支付, 比...
                               ...                        
13571                              [人民团体, 民主党派, 石嘴山市, 委员会]
13572                                   [国家, 医疗, 保障局, 办公室]
13573                                      [七年, 十一月, 二十九日]
13574                               [分类管理, 提升, 供应, 保障, 水平]
13575    [坚持, 完善, 覆盖, 全民, 依法, 参加, 基本, 医疗保险, 制度, 政策, 体系,...
Name: tokenized_sen_filtered, Length: 13576, dtype: object

In [21]:
# Apply the function to each element in the 'tokenized_sen_filtered' series
train_embeddedFT1 = X_toksen_train['tokenized_sen_filtered'].apply(lambda x: calculate_mean_embedding(x, fasttext_model))

In [None]:
# check the data
train_embeddedFT1

0        [-0.08964916, 0.03979323, 0.42967716, -0.04902...
1        [-0.03411653, 0.046872012, 0.41202742, -0.0887...
2        [-0.07017996, 0.0112573635, 0.3273037, -0.0663...
3        [-0.020430032, -0.022946509, 0.28497145, 0.008...
4        [0.13345155, 0.05627731, 0.3490851, -0.0545071...
                               ...                        
13571    [-0.042736597, -0.024878126, 0.10776475, -0.05...
13572    [-0.023080138, 0.040626887, 0.38122424, -0.120...
13573    [0.013475258, 0.06671035, 0.3204395, 0.0968477...
13574    [-0.032232482, 0.0055650724, 0.36209345, 0.012...
13575    [0.01714691, 0.045198016, 0.3901582, -0.094556...
Name: tokenized_sen_filtered, Length: 13576, dtype: object

In [17]:
# Convert the result to a NumPy array
train_embeddedFT1 = np.array(train_embeddedFT1.tolist())

In [None]:
# check the shape
train_embeddedFT1.shape

(13576, 300)

In [19]:
# save the data
np.savetxt("./y_broad/train_embeddedFT_300.csv", train_embeddedFT1, delimiter=",")

### 3.2 Test data

In [None]:
X_toksen_test['tokenized_sen_filtered']

In [20]:
# Vectorize Test data
test_embeddedFT1 = X_toksen_test['tokenized_sen_filtered'].apply(lambda x: calculate_mean_embedding(x, fasttext_model))

In [None]:
# check the data
test_embeddedFT1

0       [0.0419377, 0.05744974, 0.4677836, -0.02952056...
1       [-0.005158877, 0.021575233, 0.2659846, -0.0599...
2       [-0.015081197, -0.0019222625, 0.38544732, 0.04...
3       [-0.026768051, 0.06382229, 0.42341638, -0.1173...
4       [0.08470531, 0.049810894, 0.40017432, -0.03587...
                              ...                        
6683    [0.06798019, 0.060740087, 0.42204273, 0.000957...
6684    [-0.085426144, 0.07721641, 0.468047, 0.0058804...
6685    [0.0069559556, 0.07863893, 0.3840597, -0.03239...
6686    [-0.07040116, 0.07836889, 0.39346752, -0.06034...
6687    [-0.021006811, 0.0028569205, 0.24264129, -0.05...
Name: tokenized_sen_filtered, Length: 6688, dtype: object

In [24]:
# Convert the result to a NumPy array
test_embeddedFT1 = np.array(test_embeddedFT1.tolist())

In [25]:
# check the dimensions
test_embeddedFT1.shape

(6688, 300)

In [26]:
# save the data
np.savetxt("./y_broad/test_embeddedFT_300.csv", test_embeddedFT1, delimiter=",")

### 3.3 Unlabelled data

In [28]:
data_unlabelled['tokenized_sen_filtered']

0         [中外合资, 合作医疗, 机构, 管理, 暂行办法, 补充规定, 中华人民共和国, 卫生部,...
1         [卫生部, 部长, 陈竺, 商务部, 部长, 陈德铭, ○, ○, 七年, 十二月, 三十日...
2         [规定, 香港, 澳门, 服务提供者, 应, 符合, 内地, 香港, 建立, 紧密, 经贸关...
3         [香港, 澳门, 服务提供者, 内地, 设立, 合资, 合作医疗, 机构, 规定, 参照, ...
4                                              [规定, 日起, 施行]
                                ...                        
993521                                             [不予, 报销]
993522    [第四章, 附则, 第二十四条, 技术, 方案, 八年, 六月, 一日, 统一, 执行, 原...
993523    [第二十五条, 方案, 龙胜各族自治县, 新型农村, 合作医疗, 管理, 办公室, 负责, 解释]
993524    [龙胜各族自治县, 人民政府, 成立, 自治县, 健康, 扶贫, ·, 医疗, 救助, 公益...
993525                                 [龙胜各族自治县, 人民政府, 办公室]
Name: tokenized_sen_filtered, Length: 993526, dtype: object

In [10]:
# Vectorize unlabelled data
unlabelled_embeddedFT1 = data_unlabelled['tokenized_sen_filtered'].apply(lambda x: calculate_mean_embedding(x, fasttext_model))

In [11]:
# check the data
unlabelled_embeddedFT1

0         [-0.0444134, 0.018246418, 0.22672877, -0.06710...
1         [0.023003206, -0.014322361, 0.27057192, -0.079...
2         [0.053714447, -0.015653312, 0.3240408, -0.0817...
3         [0.0039846506, -0.0053630774, 0.29727083, -0.1...
4         [0.02381768, 0.052716304, 0.45353666, -0.23150...
                                ...                        
993521    [-0.0084002465, 0.05405068, 0.40219447, -0.030...
993522    [0.0015540083, 0.095434986, 0.39776835, -0.087...
993523    [-0.014256788, 0.030602433, 0.2537891, 0.00376...
993524    [-0.014659241, 0.021233687, 0.3038526, -0.0527...
993525    [-0.020915411, -0.0040786113, 0.102900915, -0....
Name: tokenized_sen_filtered, Length: 993526, dtype: object

In [13]:
# Convert the result to a NumPy array
unlabelled_embeddedFT1 = np.array(unlabelled_embeddedFT1.tolist())

In [12]:
# check dimensions
unlabelled_embeddedFT1.shape

(993526,)

In [34]:
# save the data
np.savetxt("./unlabelled_embeddedFT1.csv", unlabelled_embeddedFT1, delimiter=",")