#### 1.加载数据集

In [1]:
import os as os
import numpy as np
from sklearn.model_selection import train_test_split

datapath = r'../IMDB数据集'
save_dir = r'../imbd'

def get_data(datapath):
    pos_files = os.listdir(datapath + '/pos')
    neg_files = os.listdir(datapath + '/neg')
    print(len(pos_files))
    print(len(neg_files))

    pos_all = []
    neg_all = []
    for pf, nf in zip(pos_files, neg_files):
        with open(datapath + '/pos' + '/' + pf, encoding='utf-8') as f:
            s = f.read()
            pos_all.append(s)
        with open(datapath + '/neg' + '/' + nf, encoding='utf-8') as f:
            s = f.read()
            neg_all.append(s)

    X_orig= np.array(pos_all + neg_all)
    Y_orig = np.array([1 for _ in range(len(pos_all))] + [0 for _ in range(len(neg_all))])
    print("X_orig:", X_orig.shape)
    print("Y_orig:", Y_orig.shape)

    return X_orig, Y_orig

def generate_train_data():
    X_orig, Y_orig = get_data(datapath+r'/train')
    X_test, Y__test = get_data(datapath+r'/test')
    X = np.concatenate([X_orig, X_test])
    Y = np.concatenate([Y_orig, Y__test])
    np.random.seed = 1
    random_indexs = np.random.permutation(len(X))
    X = X[random_indexs]
    Y = Y[random_indexs]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1)
    print("X_train:", X_train.shape)
    print("y_train:", y_train.shape)
    print("X_test:", X_test.shape)
    print("y_test:", y_test.shape)
    print("x_val:", X_val.shape)
    print("y_val:", y_val.shape)
    np.savez(save_dir + '/imdb_train', x=X_train, y=y_train)
    np.savez(save_dir + '/imdb_test', x=X_test, y=y_test)
    np.savez(save_dir + '/imdb_val', x=X_val, y=y_val)

if __name__ == '__main__':
    generate_train_data()


12500
12500
X_orig: (25000,)
Y_orig: (25000,)
12500
12500
X_orig: (25000,)
Y_orig: (25000,)
X_train: (36000,)
y_train: (36000,)
X_test: (10000,)
y_test: (10000,)
x_val: (4000,)
y_val: (4000,)


FileNotFoundError: [Errno 2] No such file or directory: '../imbd/imdb_train.npz'

In [4]:
import tensorflow as tf

# 加载IMDB数据集
(train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.imdb.load_data(num_words=10000)

# 将整数标签转换为单词文本
word_index = tf.keras.datasets.imdb.get_word_index()

# 创建一个反向映射词典
index_word = {index: word for word, index in word_index.items()}

# 将电影评论的数字转为单词
train_texts = [' '.join([index_word.get(i - 3, '?') for i in review]) for review in train_data]
test_texts = [' '.join([index_word.get(i - 3, '?') for i in review]) for review in test_data]

# 合并训练和测试数据
texts = train_texts + test_texts

In [None]:
import nltk
import re
from nltk.corpus import stopwords
from tqdm import tqdm  # 导入 tqdm

# 下载 NLTK 停用词库
nltk.download('stopwords')

# 获取英文停用词
stop_words = set(stopwords.words('english'))

# 定义文本清理函数
def clean_text(text):
    # 转小写
    text = text.lower()

    # 去除标点符号和非字母字符
    text = re.sub(r'[^a-z\s]', '', text)

    # 按空格分词
    words = text.split()

    # 去除停用词
    cleaned_words = [word for word in words if word not in stop_words]

    # 重新组合为字符串
    cleaned_text = ' '.join(cleaned_words)
    return cleaned_text

# 使用 tqdm 显示进度条，对所有文本应用清理函数
cleaned_texts = []
for text in tqdm(texts, desc="Cleaning Texts", unit="text"):
    cleaned_texts.append(clean_text(text))


In [9]:
import pandas as pd

df1 = pd.DataFrame({
    'Text': texts[:10],  
})
# 显示表格
df1[0]

Unnamed: 0,Text
0,? this film was just brilliant casting locatio...
1,? big hair big boobs bad music and a giant saf...
2,? this has to be one of the worst films of the...
3,? the ? ? at storytelling the traditional sort...
4,? worst mistake of my life br br i picked this...


In [8]:
df2 = pd.DataFrame({
    'Text': cleaned_texts[:10],  
})
# 显示表格
df2.head()

Unnamed: 0,Text
0,film brilliant casting location scenery story ...
1,big hair big boobs bad music giant safety pin ...
2,one worst films friends watching film target a...
3,storytelling traditional sort many years event...
4,worst mistake life br br picked movie target f...


In [10]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# 训练词向量
model = Word2Vec(LineSentence("cleaned_texts.txt"), vector_size=50, window=5, min_count=1, workers=5, sg=0)

# 保存模型
model.save("word2vec_50dim_large.model")

# 将词向量保存为 txt 文件
with open("txt/word2vec_vectors_50d.txt", "w", encoding="utf-8") as f:
    # 写入词汇表的大小和维度
    f.write(f"{len(model.wv.index_to_key)} {model.vector_size}\n")
    
    # 遍历词向量并写入文件
    for word in model.wv.index_to_key:
        vector = model.wv[word]
        vector_str = " ".join(map(str, vector))  # 将向量转换为字符串
        f.write(f"{word} {vector_str}\n")


In [12]:
# 加载已经训练好的模型
model = Word2Vec.load("word2vec_50dim_large.model")

# 获取词向量
vector = model.wv['learning']
print(f"Vector for 'learning':\n", vector)

# 查找与指定词相似的词
similar = model.wv.most_similar('learning', topn=5)
print(f"Words most similar to 'learning':\n", similar)


Vector for 'machine':
 [-1.25276685e+00 -4.51283097e-01 -3.36615205e-01 -6.01255715e-01
 -4.40078348e-01  5.31646550e-01 -1.84708416e-01 -1.30145311e-01
  1.09507535e-02  3.73301119e-01 -9.03674662e-02 -1.12248409e+00
  5.69665492e-01 -7.67398715e-01 -5.39114833e-01  3.85416210e-01
 -1.18252695e-01 -1.13752866e+00  6.59926757e-02 -1.28904128e+00
 -7.75832683e-02 -1.23484731e-01 -2.81328976e-01  5.52245140e-01
 -5.69858193e-01  1.26973704e-01 -2.01457962e-01 -4.28240836e-01
  6.98231459e-01 -5.16424537e-01  1.03311682e+00 -4.56775427e-01
  6.63869262e-01  6.42004251e-01 -8.02595913e-01  1.35936129e+00
  5.69961250e-01  1.01413047e+00 -1.15683176e-01 -8.61742126e-04
  3.73278886e-01 -4.19324607e-01 -7.40183443e-02  1.19746542e+00
  4.65165675e-01  1.39669582e-01  1.05144703e+00 -7.96834946e-01
 -2.02922747e-01  2.02567413e-01]
Words most similar to 'machine':
 [('taught', 0.7358053922653198), ('teaching', 0.7238261699676514), ('education', 0.701050877571106), ('accepted', 0.6917518973350