In [12]:
import string
import scipy
import nltk
from nltk import PorterStemmer, word_tokenize
from nltk.corpus import stopwords
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from ordered_set import OrderedSet
import pandas as pd
import numpy as np
import re

In [3]:
def get_and_clean_data(path='resources/JDT.xlsx'):
    """Load issue summaries from an Excel workbook and normalise their text.

    Every summary has ASCII punctuation and non-breaking spaces removed, is
    lower-cased, and has each whitespace character replaced by a plain
    space. Rows without a summary are skipped and duplicate cleaned
    summaries are dropped (the first occurrence is kept).

    Parameters
    ----------
    path : str, optional
        Location of the workbook; it must contain a ``summary`` column.
        Defaults to the path the notebook has always read.

    Returns
    -------
    pandas.Series
        The cleaned, de-duplicated summaries, keeping their original row
        index.
    """
    data = pd.read_excel(path)
    # Blank cells come back as NaN (a float), which has no ``translate``;
    # drop them and coerce the rest to ``str`` so cleaning cannot crash.
    description = data['summary'].dropna().astype(str)

    # Build both translation tables once instead of once per row.
    strip_table = str.maketrans('', '', string.punctuation + u'\xa0')
    space_table = str.maketrans(string.whitespace, ' ' * len(string.whitespace))

    def _clean(text):
        # Same order as before: strip punctuation, lower-case, flatten whitespace.
        return text.translate(strip_table).lower().translate(space_table)

    return description.apply(_clean).drop_duplicates()

In [4]:
def create_stem_cache(cleaned_description):
    """Pre-compute the Porter stem of every distinct token in the corpus.

    Parameters
    ----------
    cleaned_description : pandas.Series
        Cleaned document strings to tokenize.

    Returns
    -------
    dict
        Maps each distinct token (inserted in sorted order) to its stem.
        Empty when the corpus contains no tokens.
    """
    tokenized_description = cleaned_description.apply(word_tokenize)
    # De-duplicate with a set rather than np.unique(np.concatenate(...)):
    # concatenate raises on an empty corpus and materialises every token
    # occurrence as a fixed-width unicode array just to throw most away.
    vocabulary = sorted({token for tokens in tokenized_description for token in tokens})
    ps = PorterStemmer()
    return {token: ps.stem(token) for token in vocabulary}

In [5]:
def create_custom_preprocessor(stop_dict, stem_cache):
    """Build the text preprocessor handed to the scikit-learn vectorizers.

    Parameters
    ----------
    stop_dict : set
        Tokens to discard.
    stem_cache : dict
        Known token -> stem mappings; tokens missing from it are stemmed
        on the fly.

    Returns
    -------
    callable
        ``custom_preprocessor(s)``: takes one raw document string and
        returns its surviving stems joined by single spaces.
    """
    # One stemmer shared by every call instead of a new one per document.
    ps = PorterStemmer()

    def custom_preprocessor(s):
        # Every run of non-letters (digits, punctuation, whitespace)
        # collapses to a single space.
        s = re.sub(r'[^A-Za-z]+', ' ', s)
        tokens = word_tokenize(s)
        # OrderedSet keeps first-seen order but also collapses repeats, so
        # each surviving word appears at most once per document.
        tokens = [w for w in OrderedSet(tokens) - stop_dict if len(w) > 2]
        stems = [stem_cache[w] if w in stem_cache else ps.stem(w) for w in tokens]
        return ' '.join(stems)

    return custom_preprocessor

In [6]:
def sk_vectorize(texts, cleaned_description, stop_dict, stem_cache):
    """Vectorize query texts as term counts over the corpus vocabulary.

    Parameters
    ----------
    texts : iterable of str
        Query strings to vectorize.
    cleaned_description : iterable of str
        Corpus used to fit the vectorizer's vocabulary.
    stop_dict : set
        Tokens the preprocessor discards.
    stem_cache : dict
        Token -> stem mappings used by the preprocessor.

    Returns
    -------
    The result of ``CountVectorizer.transform(texts)``: one row of term
    counts per query text.
    """
    my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache)
    vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor)
    vectorizer.fit(cleaned_description)
    # Previously this matrix was computed and then dropped, so the function
    # always returned None; hand it back to the caller.
    return vectorizer.transform(texts)

# tfidf

In [3]:
import nltk

In [4]:
# Fetch the tokenizer models and stopword lists NLTK needs.
nltk.download('punkt')
nltk.download('stopwords')

# Build the cleaned corpus and the preprocessing pieces for the vectorizer.
cleaned_description = get_and_clean_data()
stem_cache = create_stem_cache(cleaned_description)
# NLTK stores its stopword lists under lowercase file ids; 'English' only
# resolves on case-insensitive filesystems (e.g. Windows), so use 'english'.
stop_dict = set(stopwords.words('english'))
my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache)

# Raw unigram counts for every document.
vectorizer_unigram = CountVectorizer(preprocessor=my_custom_preprocessor)
vectorizer_unigram.fit(cleaned_description)
X_unigram = vectorizer_unigram.transform(cleaned_description)
N_unigram = len(cleaned_description)

# Densify once and reuse it for both the document frequencies and the
# log-scaled term frequencies (it used to be densified twice).
X_unigram_dense = X_unigram.todense()
df_unigram = np.array((X_unigram_dense > 0).sum(0))[0]
idf_unigram = np.log10(1 + (N_unigram / df_unigram))
tf_unigram = np.log10(X_unigram_dense + 1)
tf_idf_unigram = np.multiply(tf_unigram, idf_unigram)

# From here on X_unigram holds tf-idf weights rather than raw counts.
X_unigram = scipy.sparse.csr_matrix(tf_idf_unigram)
X_df_unigram = pd.DataFrame(X_unigram.toarray(), columns=vectorizer_unigram.get_feature_names_out())
# The 20 terms with the largest tf-idf weight summed over all documents.
max_term_unigram = X_df_unigram.sum().sort_values(ascending=False)[:20].index

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\YC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\YC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


NameError: name 'get_and_clean_data' is not defined

BM25

In [None]:
class BM25(object):
    """BM25 document scorer layered on top of a ``TfidfVectorizer``.

    The wrapped vectorizer supplies the vocabulary, the raw term counts
    (through its parent class's ``transform``) and the IDF weights
    (through ``_tfidf.idf_``).
    """

    def __init__(self, vectorizer, b=0.75, k1=1.6):
        """Store the vectorizer and the two BM25 constants.

        ``vectorizer`` has to be a ``TfidfVectorizer`` instance because
        ``fit`` and ``transform`` call ``super(TfidfVectorizer, vectorizer)``.
        ``b`` weights the document-length term and ``k1`` scales both the
        numerator and the denominator in ``transform``.
        """
        self.vectorizer = vectorizer
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """Fit the vectorizer on documents X and cache corpus statistics.

        Sets ``self.y`` (the term-count matrix of X) and ``self.avdl``
        (the mean of the per-document count totals). Returns ``None``.
        """
        self.vectorizer.fit(X)
        # Skip TfidfVectorizer.transform and call the parent class's version
        # (CountVectorizer in scikit-learn) to get counts, not tf-idf weights.
        self.y = super(TfidfVectorizer, self.vectorizer).transform(X)
        # Average document length: mean over documents of the summed counts.
        self.avdl = self.y.sum(1).mean()

    def transform(self, q):
        """Score every fitted document against the single query string q.

        Requires ``fit`` to have been called. Returns a 1-D array with one
        score per row of ``self.y``. Only which vocabulary terms occur in
        the query matters: ``q.data`` (the query's term counts) is never
        read.
        """
        b, k1, avdl = self.b, self.k1, self.avdl

        # Per-document total term count, flattened to 1-D.
        len_y = self.y.sum(1).A1
        # Count-vectorize the query (parent-class transform again) and unpack
        # the only row of the resulting one-row matrix.
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        # q.indices is used below as a list of column ids, which relies on
        # the CSR layout.
        assert sparse.isspmatrix_csr(q)

        # Keep only the count columns of terms that occur in the query.
        y = self.y.tocsc()[:, q.indices]
        # Count plus the length-normalised k1 term, broadcast across each row.
        denom = y + (k1 * (1 - b + b * len_y / avdl))[:, None]
        # NOTE(review): the "- 1." presumably strips the +1 offset
        # scikit-learn adds to its IDF values -- confirm against the
        # TfidfTransformer docs.
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = y.multiply(np.broadcast_to(idf, y.shape)) * (k1 + 1)
        # Sum the per-term contributions for each document.
        return (numer / denom).sum(1).A1

In [None]:
# Rebuild the corpus and preprocessing pieces, then fit BM25 on the corpus.
cleaned_description = get_and_clean_data()
stem_cache = create_stem_cache(cleaned_description)
# NLTK stores its stopword lists under lowercase file ids; 'English' only
# resolves on case-insensitive filesystems (e.g. Windows), so use 'english'.
stop_dict = set(stopwords.words('english'))
my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache)
tf_idf_vectorizer = TfidfVectorizer(preprocessor=my_custom_preprocessor, use_idf=True)
# BM25.fit fits the wrapped vectorizer itself, so a separate
# tf_idf_vectorizer.fit(cleaned_description) here only repeated that work.
bm25 = BM25(tf_idf_vectorizer)
bm25.fit(cleaned_description)

tf-idf+lsa+lda

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Unigram counts are the input to LDA; the vocabulary comes from titles and bodies.
count_vectorizer = CountVectorizer(ngram_range=(1,1))
count_vectorizer.fit(cleaned_title + cleaned_body)
X_tf_fit = count_vectorizer.transform(data_fit['title'])
X_tf_blindtest = count_vectorizer.transform(data_blindtest['title'])
lda = LatentDirichletAllocation(n_components=500, random_state=0)
lda.fit(X_tf_fit)
X_lda_fit = lda.transform(X_tf_fit)
gbm_model_with_lda = lgb.LGBMClassifier()

# One cross_validate call scores all three metrics from the same five fits;
# three separate cross_val_score calls refit the model 15 times for the same
# numbers. (Assumes `model_selection` is sklearn.model_selection.)
cv_scoring = ['precision_macro', 'recall_macro', 'f1_macro']

# LDA topic features on their own.
cv_results = model_selection.cross_validate(gbm_model_with_lda, X_lda_fit, y_fit, cv=5, n_jobs=-2, scoring=cv_scoring)
precision_cv_score = cv_results['test_precision_macro'].mean()
recall_cv_score = cv_results['test_recall_macro'].mean()
f1_cv_score = cv_results['test_f1_macro'].mean()

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

# TF-IDF features stacked side by side with the LDA topic features.
X_fit_with_lda = hstack([X_tfidf_fit, X_lda_fit]).tocsr()

cv_results = model_selection.cross_validate(gbm_model_with_lda, X_fit_with_lda, y_fit, cv=5, n_jobs=-2, scoring=cv_scoring)
precision_cv_score = cv_results['test_precision_macro'].mean()
recall_cv_score = cv_results['test_recall_macro'].mean()
f1_cv_score = cv_results['test_f1_macro'].mean()

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

In [None]:
# Evaluate a classifier on TF-IDF, LSA and LDA features stacked side by side.
gbm_model_with_lsa_lda = lgb.LGBMClassifier()
X_fit_with_lsa_lda = hstack([X_tfidf_fit, X_lsa_fit,X_lda_fit]).tocsr()
# One cross_validate call scores all three metrics from the same five fits;
# three separate cross_val_score calls refit the model 15 times for the same
# numbers. (Assumes `model_selection` is sklearn.model_selection.)
cv_results = model_selection.cross_validate(
    gbm_model_with_lsa_lda, X_fit_with_lsa_lda, y_fit, cv=5, n_jobs=-2,
    scoring=['precision_macro', 'recall_macro', 'f1_macro'])
precision_cv_score = cv_results['test_precision_macro'].mean()
recall_cv_score = cv_results['test_recall_macro'].mean()
f1_cv_score = cv_results['test_f1_macro'].mean()
print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

tfidf+lsa 降维处理

In [None]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import hstack
# Reduce the TF-IDF features to 500 dimensions with truncated SVD (LSA).
lsa = TruncatedSVD(n_components=500, n_iter=100, random_state=0)
lsa.fit(X_tfidf_fit)
X_lsa_fit = lsa.transform(X_tfidf_fit)

gbm_model_with_lsa = lgb.LGBMClassifier()

# One cross_validate call scores all three metrics from the same five fits;
# three separate cross_val_score calls refit the model 15 times for the same
# numbers. (Assumes `model_selection` is sklearn.model_selection.)
cv_scoring = ['precision_macro', 'recall_macro', 'f1_macro']

# LSA features on their own.
cv_results = model_selection.cross_validate(gbm_model_with_lsa, X_lsa_fit, y_fit, cv=5, n_jobs=-2, scoring=cv_scoring)
precision_cv_score = cv_results['test_precision_macro'].mean()
recall_cv_score = cv_results['test_recall_macro'].mean()
f1_cv_score = cv_results['test_f1_macro'].mean()

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

# TF-IDF features stacked side by side with the LSA features.
X_fit_with_lsa = hstack([X_tfidf_fit, X_lsa_fit]).tocsr()

cv_results = model_selection.cross_validate(gbm_model_with_lsa, X_fit_with_lsa, y_fit, cv=5, n_jobs=-2, scoring=cv_scoring)
precision_cv_score = cv_results['test_precision_macro'].mean()
recall_cv_score = cv_results['test_recall_macro'].mean()
f1_cv_score = cv_results['test_f1_macro'].mean()

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

In [None]:
tfidf+LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Unigram counts are the input to LDA; the vocabulary comes from titles and bodies.
count_vectorizer = CountVectorizer(ngram_range=(1,1))
count_vectorizer.fit(cleaned_title + cleaned_body)
X_tf_fit = count_vectorizer.transform(data_fit['title'])
# X_tf_blindtest = count_vectorizer.transform(data_blindtest['title']) 
lda = LatentDirichletAllocation(n_components=500, random_state=0)
lda.fit(X_tf_fit)
X_lda_fit = lda.transform(X_tf_fit)
gbm_model_with_lda = gbm_model

# TF-IDF features stacked side by side with the LDA topic features.
X_fit_with_lda = hstack([X_tfidf_fit, X_lda_fit]).tocsr()

# One cross_validate call scores all three metrics from the same five fits;
# three separate cross_val_score calls refit the model 15 times for the same
# numbers. (Assumes `model_selection` is sklearn.model_selection.)
cv_results = model_selection.cross_validate(
    gbm_model_with_lda, X_fit_with_lda, y_fit, cv=5, n_jobs=-2,
    scoring=['precision_macro', 'recall_macro', 'f1_macro'])
precision_cv_score = cv_results['test_precision_macro'].mean()
recall_cv_score = cv_results['test_recall_macro'].mean()
f1_cv_score = cv_results['test_f1_macro'].mean()

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

In [None]:
lsa

In [None]:
# Just to perform dimension reduction and build the model
from sklearn.decomposition import TruncatedSVD

# Create a TruncatedSVD (truncated singular value decomposition) instance;
# n_components=500 sets the target dimensionality to 500.
lsa = TruncatedSVD(n_components=500, n_iter=100, random_state=0)
# Fit the SVD on the training-set TF-IDF features X_tfidf_fit.
lsa.fit(X_tfidf_fit)
# Project the training TF-IDF features down to 500 dimensions.
X_lsa_fit = lsa.transform(X_tfidf_fit)

# Create a fresh LightGBM classifier.
gbm_model_with_lsa = lgb.LGBMClassifier()

# One cross_validate call scores precision, recall and F1 from the same five
# fits; three separate cross_val_score calls refit the model 15 times for the
# same numbers. (Assumes `model_selection` is sklearn.model_selection.)
cv_scoring = ['precision_macro', 'recall_macro', 'f1_macro']

# Cross-validate on the LSA features alone.
cv_results = model_selection.cross_validate(gbm_model_with_lsa, X_lsa_fit, y_fit, cv=5, n_jobs=-2, scoring=cv_scoring)
precision_cv_score = cv_results['test_precision_macro'].mean()
recall_cv_score = cv_results['test_recall_macro'].mean()
f1_cv_score = cv_results['test_f1_macro'].mean()

# Print the results.
print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

# Stack the TF-IDF features and the reduced SVD features horizontally to
# form a new feature matrix.
X_fit_with_lsa = hstack([X_tfidf_fit, X_lsa_fit]).tocsr()

# Cross-validate again on the combined feature matrix.
cv_results = model_selection.cross_validate(gbm_model_with_lsa, X_fit_with_lsa, y_fit, cv=5, n_jobs=-2, scoring=cv_scoring)
precision_cv_score = cv_results['test_precision_macro'].mean()
recall_cv_score = cv_results['test_recall_macro'].mean()
f1_cv_score = cv_results['test_f1_macro'].mean()

# Print the cross-validated precision, recall and F1.
print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))


In [None]:
lda

In [None]:
# Fit an LDA model as a preprocessor
from sklearn.decomposition import LatentDirichletAllocation

# Create a CountVectorizer that turns text into term-count vectors;
# ngram_range=(1,1) counts single words (unigrams) only.
count_vectorizer = CountVectorizer(ngram_range=(1,1))
# Fit the vocabulary on the title and body text.
count_vectorizer.fit(cleaned_title + cleaned_body)
# Convert the training and blind-test titles to term-count vectors.
X_tf_fit = count_vectorizer.transform(data_fit['title'])
X_tf_blindtest = count_vectorizer.transform(data_blindtest['title'])
# Fit LDA on the training counts; n_components=500 learns 500 topics.
lda = LatentDirichletAllocation(n_components=500, random_state=0)
lda.fit(X_tf_fit)
# Convert the training counts to LDA topic vectors.
X_lda_fit = lda.transform(X_tf_fit)
# Create a LightGBM classifier.
gbm_model_with_lda = lgb.LGBMClassifier()

# One cross_validate call scores precision, recall and F1 from the same five
# fits; three separate cross_val_score calls refit the model 15 times for the
# same numbers. (Assumes `model_selection` is sklearn.model_selection.)
cv_scoring = ['precision_macro', 'recall_macro', 'f1_macro']

# Cross-validate on the LDA topic features alone.
cv_results = model_selection.cross_validate(gbm_model_with_lda, X_lda_fit, y_fit, cv=5, n_jobs=-2, scoring=cv_scoring)
precision_cv_score = cv_results['test_precision_macro'].mean()
recall_cv_score = cv_results['test_recall_macro'].mean()
f1_cv_score = cv_results['test_f1_macro'].mean()
# Print the results.
print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

# Stack the TF-IDF features and the LDA topic vectors horizontally to form
# a new feature matrix.
X_fit_with_lda = hstack([X_tfidf_fit, X_lda_fit]).tocsr()

# Cross-validate again on the combined feature matrix.
cv_results = model_selection.cross_validate(gbm_model_with_lda, X_fit_with_lda, y_fit, cv=5, n_jobs=-2, scoring=cv_scoring)
precision_cv_score = cv_results['test_precision_macro'].mean()
recall_cv_score = cv_results['test_recall_macro'].mean()
f1_cv_score = cv_results['test_f1_macro'].mean()
# Print the results.
print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))
