# Vectorization

In this file, the tokenized and filtered sentences are vectorized via multihot-encoding and term-frequency-inverse-document-frequency (TF-IDF). 

### 1. Load packages and data

In [33]:
import pandas as pd # data manipulation
import numpy as np
import joblib
import os
import random
from ast import literal_eval # ensures that the tokenized sentences are read as a list.

# for encoding with sparse matrices
from scipy.sparse import vstack, csr_matrix

# TF-IDF encoding
from sklearn.feature_extraction.text import TfidfVectorizer


Python Version: 3.10.13

In [2]:
# versions
np.__version__, pd.__version__, joblib.__version__

('1.26.4', '2.2.3', '1.3.2')

In [None]:
# set a seed for reproducibility
random.seed(421)           # Python random module
np.random.seed(421)        # NumPy random generator

In [None]:
# set working directory
os.chdir("working_directory_path") 

In [None]:
# load the tokenized sentences of the training data
X_toksen_train = pd.read_csv('./y_broad/X_toksen_train.csv', converters={'tokenized_sen_filtered': literal_eval})
X_toksen_train.head()

Unnamed: 0,tokenized_sen_filtered
0,"[加强, 部门, 间, 工作, 协同, 全面, 对接, 社会, 救助, 经办, 服务, 各地..."
1,"[按规定, 定期, 社会, 公布, 基金, 收支, 情况, 参合, 人员, 待遇, 享受, ..."
2,"[事实, 无人, 抚养, 儿童, 监护人, 受, 监护人, 委托, 近亲属, 填写, 事实,..."
3,"[慢性病, 种, 补偿, 名录, 呼吸系统, 慢性, 支气管炎, 肺气肿]"
4,"[市, 外, 省内, 定点, 医疗机构, 住院, 医疗, 待遇, 起付, 标准, 支付, 比..."


In [None]:
# load the tokenized sentences of the test data
X_toksen_test = pd.read_csv('./y_broad/X_toksen_test.csv', converters={'tokenized_sen_filtered': literal_eval})
X_toksen_test.head()

Unnamed: 0,tokenized_sen_filtered
0,"[市, 外, 转诊, 有效, 期满, 参保, 人员, 如需, 再次, 市, 外, 转诊, 应..."
1,"[第四十五, 条, 城乡居民, 医保, 执行, 统一, 基金, 财务制度, 会计制度, 基金..."
2,"[试点, 运行, 切实, 提高, 失能, 人员, 家属, 获得, 感, 幸福感]"
3,"[大力, 发挥, 网络, 报刊, 广播, 电视, 媒体, 宣传, 优势, 开设, 专版, 公..."
4,"[全省, 一体化, 平台, 建设, 方案, 力争, 全省, 医保, 信息系统, 灾备, 中心..."


In [26]:
# load unlabelled data
data_unlabelled = pd.read_csv('./data_unlabelled_tok_fil.csv', converters={'tokenized_sen_filtered': literal_eval})
data_unlabelled

Unnamed: 0,doc_index,ran200doc,sentences,ran20sen,sen_index,tokenized_sen,tokenized_sen_filtered
0,1,137,《中外合资、合作医疗机构管理暂行办法》的补充规定 中华...,16,1,"[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...","[中外合资, 合作医疗, 机构, 管理, 暂行办法, 补充规定, 中华人民共和国, 卫生部,..."
1,1,137,卫生部部长：陈竺商务部部长：陈德铭二○○七年十二月三十日 《中外合资、合作医疗机构管理暂行...,18,2,"['卫生部', '部长', '：', '陈竺', '商务部', '部长', '：', '陈德...","[卫生部, 部长, 陈竺, 商务部, 部长, 陈德铭, ○, ○, 七年, 十二月, 三十日..."
2,1,137,二、本规定中香港、澳门服务提供者应分别符合《内地与香港关于建立更紧密经贸关系的安排》及《...,18,3,"['\u3000', '\u3000', '二', '、', '本', '规定', '中',...","[规定, 香港, 澳门, 服务提供者, 应, 符合, 内地, 香港, 建立, 紧密, 经贸关..."
3,1,137,三、香港、澳门服务提供者在内地设立合资、合作医疗机构的其他规定，仍参照《中外合资、合作医...,7,4,"['\u3000', '\u3000', '三', '、', '香港', '、', '澳门'...","[香港, 澳门, 服务提供者, 内地, 设立, 合资, 合作医疗, 机构, 规定, 参照, ..."
4,1,137,四、本规定自2008年1月1日起施行,16,5,"['\u3000', '\u3000', '四', '、', '本', '规定', '自',...","[规定, 日起, 施行]"
...,...,...,...,...,...,...,...
993521,31138,35,否则不予报销,7,1003131,"['否则', '不予', '报销']","[不予, 报销]"
993522,31138,35,第四章　附则 第二十四条　本技术方案从二00八年六月一日起统一执行，同时原制定的补偿方案作废,8,1003132,"['\u3000', '第四章', '\u3000', '附则', '\u3000', '\...","[第四章, 附则, 第二十四条, 技术, 方案, 八年, 六月, 一日, 统一, 执行, 原..."
993523,31138,35,第二十五条　本方案由龙胜各族自治县新型农村合作医疗管理办公室负责解释,9,1003133,"['\u3000', '\u3000', '第二十五条', '\u3000', '本', '...","[第二十五条, 方案, 龙胜各族自治县, 新型农村, 合作医疗, 管理, 办公室, 负责, 解释]"
993524,31139,91,龙胜各族自治县人民政府关于成立自治县“健康扶贫·医疗救助”公益基金管理工作领导小组的通知龙胜...,14,1003135,"['龙胜各族自治县', '人民政府', '关于', '成立', '自治县', '“', '健...","[龙胜各族自治县, 人民政府, 成立, 自治县, 健康, 扶贫, ·, 医疗, 救助, 公益..."


### 2. Multihot Encoding

#### 2.1 Prepare vocabulary and index

Multihot encoding creates a dummy variable for each word in the corpus.
The first step is to create a vocabulary from the training data:

Step 1: create a list of all tokens, looping over the sentences and the tokens in the sentences.

In [75]:
# Create a list of all tokens
all_tokens = [token for sentence in X_toksen_train['tokenized_sen_filtered'] for token in sentence]
len(all_tokens)

330751

In [76]:
# check observations
print(all_tokens[20000:20020])

['例', '内科', '治疗', '翼状', '胬肉', '单侧', '胬肉', '切除', '手术', '例', '手术', '治疗', '翼状', '胬肉', '双侧', '胬肉', '切除', '手术', '手术', '治疗']


In [77]:
# create vocabulary through set
vocabulary = list(set(all_tokens))
len(vocabulary)

18559

The vocabulary consists of 18,559 tokens. We save it for later to name the features in the multihot datasets.

In [78]:
# save vocabulary for later data
pd.DataFrame(vocabulary).to_csv('vocabulary.csv')

In [79]:
# recover vocabulary, get column and turn to list
vocabulary = pd.read_csv('vocabulary.csv').iloc[:,1].tolist()
type(vocabulary)

list

In [80]:
# check vocabulary entries
vocabulary[100:110]

['物价水平', 'X光', '实事', '从重处理', '分装', '跑腿', '重在', '考量', '支撑', '会商']

In [81]:
# check data type
type(vocabulary)

list

Now we create a dictionary, which includes the vocabulary and an index that enumerates the vocabulary, so that each word is linked to a number.

In [82]:
# Create a dictionary to map of tokens to indices
token_to_index = {token: index for index, token in enumerate(vocabulary)}
type(token_to_index)

dict

In [83]:
token_to_index

{'入市': 0,
 '助学金': 1,
 '停保': 2,
 '征询': 3,
 '二十五年': 4,
 '党校': 5,
 '及下': 6,
 '送市': 7,
 '氢溴酸': 8,
 '中轴': 9,
 '盼': 10,
 '气管': 11,
 '刑事案件': 12,
 '未戴': 13,
 '免收': 14,
 '杜舟': 15,
 '事中': 16,
 '名非': 17,
 '聚集': 18,
 '土地': 19,
 '四中全会': 20,
 '正位': 21,
 '中重': 22,
 '坚固': 23,
 '短缺': 24,
 '第四十条': 25,
 '承办人': 26,
 '医疗卫生': 27,
 '实质性': 28,
 '规范': 29,
 '变通': 30,
 '肇庆市': 31,
 '问答': 32,
 '生活用品': 33,
 '高港区': 34,
 '20%': 35,
 '列账': 36,
 '翻': 37,
 '五中': 38,
 '雇佣': 39,
 '骶': 40,
 '周秋明': 41,
 '镇级': 42,
 '蝶呤': 43,
 '政协提案': 44,
 '无望': 45,
 '即可': 46,
 '挪用': 47,
 '湖里区': 48,
 'Ｍ': 49,
 '托管': 50,
 '申请理由': 51,
 '保险': 52,
 '治疗仪': 53,
 '告示': 54,
 '85.5%': 55,
 '二年': 56,
 '纳入': 57,
 '维平': 58,
 '仍为': 59,
 '机关': 60,
 '七月': 61,
 '传播': 62,
 '李沧区': 63,
 '沿用': 64,
 '工作汇报': 65,
 '慈善机构': 66,
 '特快专递': 67,
 '异常': 68,
 '恶劣': 69,
 '内审': 70,
 '全年': 71,
 '姜': 72,
 '采样': 73,
 '榭': 74,
 '预支': 75,
 '指非': 76,
 '销售费用': 77,
 '自信': 78,
 '幼儿园': 79,
 '脑科': 80,
 '回扣': 81,
 '四肢': 82,
 '机等': 83,
 '糖苷酶': 84,
 '选项': 85,
 '时到': 86,
 '丹参': 87,
 '情况': 8

#### 2.2 Encoding functions

Multihot encoding here relies on 3 functions and uses a memory-efficient datatype (np.unit8).

1. The multihot sentence encoder functgion proceeds in several steps:\
Step 1: Create an array of zeros with the specified vocabulary_size\
Step 2: Iterate through each token in the tokenized and filtered sentences: Check if the token is present in the vocabulary dictionary (2a), and if so set the corresponding index in the encoding array to 1 (2b).\
Step 3: Return the multi-hot encoded array.

The result is a binary encoding where each position in the array represents the presence or absence of a token in the input tokenized_sen. If a token is present, the corresponding position in the encoding is set to 1; otherwise, it remains 0.

In [84]:
## Encoding function:
def multihot_sentence_encoder(tokenized_sen, vocabulary_size, token_to_index):
    encoding = np.zeros(vocabulary_size, dtype=np.uint8) # step 1, non-demanding data type
    for token in tokenized_sen: # step 2
        if token in token_to_index: # 2a
            encoding[token_to_index[token]] = 1 # 2b
    return encoding # step 3

2. The function encode_batch applies multihot encoding to a batch of sentences, rather than the whole dataset at a time. It returns a sparse matrix (CSR format) with the encoded batch, rather than a regular numpy array. Both steps help limiting the computational resources required to process the data.

In [85]:

def encode_batch(sentences, vocabulary_size, token_to_index):
    encoded_batch = [multihot_sentence_encoder(sen, vocabulary_size, token_to_index) for sen in sentences]
    return csr_matrix(encoded_batch, dtype=np.uint8)

3. The function process_in_batches can be applied to an entire data frame. The argument batch_size determines how many sentences to process at a time, it can be adjusted to the computational resources available. It returs a sparse matrix representing the entire dataset with multihot encoding.

In [None]:

def process_in_batches(sentences, batch_size, vocabulary_size, token_to_index):
    all_encoded_data = []
    for start_idx in range(0, len(sentences), batch_size):
        batch_sentences = sentences[start_idx:start_idx+batch_size]
        encoded_batch = encode_batch(batch_sentences, vocabulary_size, token_to_index)
        all_encoded_data.append(encoded_batch)
        
    # Concatenate all batches and create to sparse matrix
    return vstack(all_encoded_data).astype(np.uint8) 

#### 2.3 Encode the data


First, encode the training data, save it as a sparse matrix, and then recover the data and load it with the vocabulary as column names.

In [87]:
# Process the training data in batches
X_multihot_train_sparse = process_in_batches(X_toksen_train['tokenized_sen_filtered'], 
                                         1000, # batch size
                                         len(vocabulary), token_to_index)
X_multihot_train_sparse

<Compressed Sparse Row sparse matrix of dtype 'uint8'
	with 251730 stored elements and shape (13576, 18559)>

In [88]:
# Save sparse matrix using joblib
joblib.dump(X_multihot_train_sparse, './y_broad/X_multihot_train_sparse.pkl')


['./y_broad/X_multihot_train_sparse.pkl']

In [89]:
# Load data
loaded_X_multihot_train_sparse = joblib.load('./y_broad/X_multihot_train_sparse.pkl')

In [90]:
# Create a DataFrame with the vocabulary as column names
X_multihot_train = pd.DataFrame(loaded_X_multihot_train_sparse.toarray(), columns=vocabulary)
X_multihot_train

Unnamed: 0,入市,助学金,停保,征询,二十五年,党校,及下,送市,氢溴酸,中轴,...,人性化,参合,法制观念,拔牙,元春,庞秀萍,1.3%,托,栏须,八村
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13571,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13572,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13573,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13574,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Then, process the test data. Recovery works as above.

In [91]:
# Process the training data in batches
X_multihot_test_sparse = process_in_batches(X_toksen_test['tokenized_sen_filtered'], 
                                         1000, # batch size
                                         len(vocabulary), token_to_index)
X_multihot_test_sparse

<Compressed Sparse Row sparse matrix of dtype 'uint8'
	with 120535 stored elements and shape (6688, 18559)>

In [92]:
# Save sparse matrix using joblib
joblib.dump(X_multihot_test_sparse, './y_broad/X_multihot_test_sparse.pkl')

['./y_broad/X_multihot_test_sparse.pkl']

Finally, process the unlabelled data.


In [93]:
# Process the data in batches
X_unlabelled_multihot_sparse = process_in_batches(data_unlabelled['tokenized_sen_filtered'], 
                                         1000, # batch size
                                         len(vocabulary), token_to_index)

In [94]:
# Save sparse matrix using joblib
joblib.dump(X_unlabelled_multihot_sparse, './X_unlabelled_multihot_sparse.pkl')

['./X_unlabelled_multihot_sparse.pkl']

Try to convert the sparse matrix into a regular dataframe with the 

In [73]:
# try to load the data with feature names
X_unlabelled_multihot = pd.DataFrame(X_unlabelled_multihot_sparse.toarray(), columns=vocabulary)

### 2.4 Visual inspection

Visual inspection 
You can reverse the encoding process to see if you can reconstruct the original sentence from the generated multi-hot encoding. However, this might not always be straightforward due to the loss of sequence information in multi-hot encoding.

In [28]:
# Load data
loaded_X_multihot_train_sparse = joblib.load('./y_broad/X_multihot_train_sparse.pkl')
# Create a DataFrame with the vocabulary as column names
X_multihot_train = pd.DataFrame(loaded_X_multihot_train_sparse.toarray(), columns=vocabulary)
X_multihot_train

Unnamed: 0,入市,助学金,停保,征询,二十五年,党校,及下,送市,氢溴酸,中轴,...,人性化,参合,法制观念,拔牙,元春,庞秀萍,1.3%,托,栏须,八村
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
print(f"Shape of the dense DataFrame: {X_multihot_train.shape}")
print(f"Vocabulary size: {len(vocabulary)}")
print(f"Shape of the sparse matrix: {loaded_X_multihot_train_sparse.shape}")

Shape of the dense DataFrame: (13576, 18559)
Vocabulary size: 18559
Shape of the sparse matrix: (13576, 18559)


In [30]:
# Reconstruct sentences from the dense DataFrame
reconstructed_sentences = []

for _, row in X_multihot_train.iterrows():
    reconstructed = row[row == 1].index.tolist() # Select columns where the value is 1
    reconstructed_sentences.append(reconstructed)

In [31]:
# Print reconstructed sentences
print(reconstructed_sentences[154])

['椎弓', '金牛', '肝铜', '一系列', '开单', '热情']


In [32]:
X_toksen_train['tokenized_sen_filtered'][154]

['健全', '统一', '规范', '医疗', '救助', '制度']

### 3. TF-IDF encoding

#### 3.1 Corpus creation

To generate TF-IDF (Term Frequency-Inverse-Document-Frequency) values from Chinese tokenized sentences stored as a column of lists in a Pandas DataFrame, we use the TfidfVectorizer class from the sklearn.

First, we create a corpus of space-separated tokens from the tokenized and filtered sentences in list form:

In [None]:
# Convert tokenized sentences to space-separated strings (training data)
corpus_train = [' '.join(sentence) for sentence in X_toksen_train['tokenized_sen_filtered']]

In [36]:
type(corpus_train)

list

In [41]:
# check dimensions
len(corpus_train), len(X_toksen_train)

(13576, 13576)

In [37]:
corpus_train[10:100]

['第二十八条 各级 人力 社保 行政部门 应 会同 部门 积极 推进 居民 医保 结算 制度 改革 实施 总额 控制 多种 结算 方式 有效 控制 医疗 费用 过快 增长 减轻 参保 人员 负担',
 '建立 动态 调整 高效 运行 多元 复合式 医保 支付 体系 充分发挥 基本 医疗保险 激励 约束 控制 医疗 费用 不合理 增长 作用',
 '做好 网站 安全 管理 维护 省 医保 局 委托 山西 云 时代 有限公司 为局 网站 安全 管理 提供 技术 人员 硬件 设备 保障',
 '已婚 女职工 放置 取出 节育环 医疗费 元 皮下 埋置 医疗费 元 结扎 医疗费 元 已婚 男 职工 结扎 医疗费 元',
 '银行 柜台 缴费 持 身份证 医保 本到 市县 工商银行 建设银行 农业银行 中国银行 交通银行 邮储 银行 广西 农信社 柳州 银行 桂林 银行 光大银行 兴业银行 北部湾 银行 家 签约 银行 储蓄 网点 柜台 缴纳 保费 利用 银行 提供 自助 缴费 线上 缴费 缴费 方式 缴纳 保费',
 '工作 要求 各级 新农 合 主管部门 高度重视 切实 落实 农村居民 重大 疾病 医疗保障 工作 深化 医改 重要 举措 积极争取 政府 部门 重大 疾病 医疗保障 工作 重视 支持',
 '保障 范围',
 '清洁 照料 会阴 清洁 次日 护理 对象 清洁 习惯 原则 次 会阴部 有无 伤口 有无 大小便 失禁 留置 尿管 帮助 护理 对象 完成 会阴部 擦洗 冲洗 水温 适宜 动作 轻柔 保护 隐私',
 '诊疗 设备 医用 材料 类 应用 眼科 准分子 激光 治疗仪 糖尿病 决策支持系统 人体 信息 诊断 电脑 选择 最佳 妊娠期 检查 治疗 费用 眼镜 义齿 义眼 义肢 助听器 康复 性器具 自用 保健 按摩 检查和 治疗 器械 按摩器 轮椅 拐杖 家用 检测 治疗 仪器 皮钢 背甲 腰围 钢 头颈 胃托 肾托 子宫 托 疝气 带 护膝 带 提睾带 健脑 器 药枕 药垫 热敷 袋 神功 元气 袋 费用 省 市 物价部门 规定 不可 单独 收费 一次性 医用 材料',
 '第二条 持有 本市 常住 户籍 符合 办法 规定 救助 条件 困难 居民 均 当地政府 获得 医疗 救助',
 '办公室 决策 执行 部门 落实 情况 实施 督查',
 '实行 一

In [38]:
# test corpus
corpus_test = [' '.join(sentence) for sentence in X_toksen_test['tokenized_sen_filtered']]

In [42]:
# check dimensions
len(corpus_test), len(X_toksen_test)

(6688, 6688)

In [43]:
# labelled corpus
corpus_unlabelled = [' '.join(sentence) for sentence in data_unlabelled['tokenized_sen_filtered']]

In [44]:
# check dimensions
len(corpus_unlabelled), len(data_unlabelled['tokenized_sen_filtered'])

(993526, 993526)

### 3.2 Transform corpus to TF-IDF vectors

Train the vectorizer on the training data, then vectorize all data sets with that vectorizer to ensure consistent vocabulary. To obtain a manageable number of features, only unigrams and bigrams are considered that appear (1) in at least 3 sentences, and (b) less than 80% of sentences.

In [45]:
# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), # unigrams and bigrams
                                   min_df=3, # appears at least in 3 documents 
                                   max_df=0.8, # appears in at most 80% of the documents
                                   dtype = np.float32) # less demanding for memory

Train TF-IDF vectorizer on the training data:

In [47]:
# Fit and transform the corpus to TF-IDF vectors
tfidf_matrix_train = tfidf_vectorizer.fit_transform(corpus_train)
tfidf_matrix_train

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 351821 stored elements and shape (13576, 21960)>

In [None]:
# Check vocabulary and TF-IDF scores
print("Vocabulary:", tfidf_vectorizer.get_feature_names_out())
print("TF-IDF Matrix:\n", tfidf_matrix_train.toarray())


Vocabulary: ['10' '10 二级' '10 以内' ... '龙岩' '龙岩市' '龙岩市 人民政府']
TF-IDF Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
# How many features were created?
len(tfidf_vectorizer.get_feature_names_out())

21960

In [51]:
# Extract the feature names
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
type(tfidf_feature_names)

numpy.ndarray

In [77]:
# save the feature names
pd.DataFrame(tfidf_feature_names).to_csv('tfidf_feature_names.csv')

In [52]:
# recover features, get column and turn to list
tfidf_feature_names = pd.read_csv('tfidf_feature_names.csv').iloc[:,1].tolist()
len(tfidf_feature_names)

21960

In [None]:
# save the data as a sparse matrix
joblib.dump(tfidf_matrix_train, './y_broad/tfidf_matrix_train.pkl')

['./y_broad/tfidf_matrix_train.pkl']

In [105]:
# Load data and create a data frame with feature names
loaded_X_tfidf_train_sparse = joblib.load('./y_broad/tfidf_matrix_train.pkl')
# Create a DataFrame with the vocabulary as column names
#X_tfidf_train = pd.DataFrame(loaded_X_tfidf_train_sparse.toarray(), columns=tfidf_feature_names)
#X_tfidf_train

Vectorize the test data:

In [57]:
# Transform the test data using the same vectorizer
tfidf_matrix_test = tfidf_vectorizer.transform(corpus_test)
tfidf_matrix_test

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 164329 stored elements and shape (6688, 21960)>

In [83]:
# save the data
joblib.dump(tfidf_matrix_test, './y_broad/tfidf_matrix_test.pkl')

['./y_broad/tfidf_matrix_test.pkl']

Vectorize the unlabelled data:

In [58]:
# Transform the unlabelled data using the same vectorizer
tfidf_matrix_unlabelled = tfidf_vectorizer.transform(corpus_unlabelled)
tfidf_matrix_unlabelled

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 24295333 stored elements and shape (993526, 21960)>

In [88]:
# save the data
joblib.dump(tfidf_matrix_unlabelled, './tfidf_matrix_unlabelled_broadvoc.pkl')

['./tfidf_matrix_unlabelled_broadvoc.pkl']

With the unlabelled data, we might encounter memory problems ...

In [None]:
# try to load the data with feature names
X_tfidf_unlabelled = pd.DataFrame(tfidf_matrix_unlabelled.toarray(), columns=tfidf_feature_names)

MemoryError: Unable to allocate 81.3 GiB for an array with shape (993526, 21960) and data type float32