In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [8]:
# read data
train_data = pd.read_csv('data/simplified_dataset/simplified_train_data_tokenized.csv')
test_data = pd.read_csv('data/simplified_dataset/simplified_test_data_tokenized.csv')

train_data = train_data.dropna(subset=['tokenized_review'])
test_data = test_data.dropna(subset=['tokenized_review'])

## EDA

In [3]:
raw_data = pd.read_csv("data/raw_data/online_shopping_10_cats.csv")

In [4]:
raw_data

Unnamed: 0,cat,label,review
0,书籍,1,﻿做父母一定要有刘墉这样的心态，不断地学习，不断地进步，不断地给自己补充新鲜血液，让自己保持...
1,书籍,1,作者真有英国人严谨的风格，提出观点、进行论述论证，尽管本人对物理学了解不深，但是仍然能感受到...
2,书籍,1,作者长篇大论借用详细报告数据处理工作和计算结果支持其新观点。为什么荷兰曾经县有欧洲最高的生产...
3,书籍,1,作者在战几时之前用了＂拥抱＂令人叫绝．日本如果没有战败，就有会有美军的占领，没胡官僚主义的延...
4,书籍,1,作者在少年时即喜阅读，能看出他精读了无数经典，因而他有一个庞大的内心世界。他的作品最难能可贵...
...,...,...,...
62769,酒店,0,我们去盐城的时候那里的最低气温只有4度，晚上冷得要死，居然还不开空调，投诉到酒店客房部，得到...
62770,酒店,0,房间很小，整体设施老化，和四星的差距很大。毛巾太破旧了。早餐很简陋。房间隔音很差，隔两间房间...
62771,酒店,0,我感觉不行。。。性价比很差。不知道是银川都这样还是怎么的！
62772,酒店,0,房间时间长，进去有点异味！服务员是不是不够用啊！我在一楼找了半个小时以上才找到自己房间，想找...


In [5]:
result = raw_data.groupby('cat').size().reset_index(name='count')
result

Unnamed: 0,cat,count
0,书籍,3851
1,平板,10000
2,手机,2323
3,水果,10000
4,洗发水,10000
5,热水器,575
6,蒙牛,2033
7,衣服,10000
8,计算机,3992
9,酒店,10000


## word frequency

In [22]:
# string to vector
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data['tokenized_review'])
X_test = vectorizer.transform(test_data['tokenized_review'])
y_train = train_data['cat']
y_test = test_data['cat']

In [26]:
classifier = MultinomialNB()

# 訓練模型
classifier.fit(X_train, y_train)

# 預測
y_pred = classifier.predict(X_test)

# 評估
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy: ", accuracy)

Accuracy:  0.8680417297125109


## TF-IDF

### Bayes

In [35]:
def bayes_classifier(X_train_tfidf, y_train):
    classifier = MultinomialNB()

    # 訓練模型
    classifier.fit(X_train_tfidf, y_train)

    return classifier

def DecisionTree_classifier(X_train_tfidf, y_train):
    classifier = DecisionTreeClassifier()

    # 訓練模型
    classifier.fit(X_train_tfidf, y_train)

    return classifier

def evauation(classifier, X_test_tfidf):
    # 預測
    y_pred = classifier.predict(X_test_tfidf)

    # 評估
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

In [10]:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=1)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['tokenized_review'])

In [15]:
X_train_tfidf

<50216x30618 sparse matrix of type '<class 'numpy.float64'>'
	with 30618 stored elements in Compressed Sparse Row format>

In [17]:
print(X_train_tfidf)

  (0, 6429)	0.7071067811865476
  (0, 11752)	0.7071067811865476
  (1, 12432)	1.0
  (3, 8009)	1.0
  (4, 30563)	0.40824829046386296
  (4, 14100)	0.40824829046386296
  (4, 12810)	0.40824829046386296
  (4, 14129)	0.40824829046386296
  (4, 13669)	0.40824829046386296
  (4, 14691)	0.40824829046386296
  (5, 10846)	1.0
  (6, 4546)	1.0
  (7, 15152)	0.26052362572959153
  (7, 29597)	0.26052362572959153
  (7, 27062)	0.26052362572959153
  (7, 19436)	0.4411048423733123
  (7, 7300)	0.26052362572959153
  (7, 14366)	0.26052362572959153
  (7, 7869)	0.26052362572959153
  (7, 2741)	0.26052362572959153
  (7, 4009)	0.26052362572959153
  (7, 2720)	0.26052362572959153
  (7, 10150)	0.4411048423733123
  (9, 21335)	0.2647122261374479
  (9, 14391)	0.2647122261374479
  :	:
  (50195, 18313)	0.7071067811865476
  (50195, 23499)	0.7071067811865476
  (50197, 3799)	0.4472135954999579
  (50197, 7103)	0.4472135954999579
  (50197, 3920)	0.4472135954999579
  (50197, 10569)	0.4472135954999579
  (50197, 4185)	0.4472135954999579

In [16]:
from scipy.sparse import find

# 假设要获取第一个文档（第一行）中词语 "做" 的 TF-IDF 分数
document_index = 0  # 选择文档的索引
word_to_find = "做"

# 查找指定文档中特定词语的非零元素的行索引和列索引
row_indices, col_indices, tfidf_scores = find(X_train_tfidf)

word_index = tfidf_vectorizer.vocabulary_.get(word_to_find)  # 获取词语的索引

if word_index is not None:
    indices = (row_indices == document_index) & (col_indices == word_index)
    if indices.any():
        tfidf_score = tfidf_scores[indices][0]
        print(f"TF-IDF 分数为 {tfidf_score}.")
    else:
        print(f"文档 {document_index} 中未找到词语 '{word_to_find}' 的 TF-IDF 分数。")
else:
    print(f"词语 '{word_to_find}' 未在词汇表中找到。")

词语 '做' 未在词汇表中找到。


In [36]:
def main(classifier):
    for df in range(1,10):
        df = df/10
        tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=df)
        X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['tokenized_review'])
        X_test_tfidf = tfidf_vectorizer.transform(test_data['tokenized_review'])

        classifier = bayes_classifier(X_train_tfidf, y_train)
        
        
        accuracy = evauation(classifier, X_test_tfidf)

        print(f"************ max_df = {df} ************")    
        print("Accuracy: ", accuracy)

In [37]:
main()

************ max_df = 0.1 ************
Accuracy:  0.8202596161503544
************ max_df = 0.2 ************
Accuracy:  0.8199410687266067
************ max_df = 0.3 ************
Accuracy:  0.8199410687266067
************ max_df = 0.4 ************
Accuracy:  0.8199410687266067
************ max_df = 0.5 ************
Accuracy:  0.8199410687266067
************ max_df = 0.6 ************
Accuracy:  0.8199410687266067
************ max_df = 0.7 ************
Accuracy:  0.8199410687266067
************ max_df = 0.8 ************
Accuracy:  0.8199410687266067
************ max_df = 0.9 ************
Accuracy:  0.8199410687266067


### Decision Tree

In [41]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [42]:
# 切出訓練子集
X_train_subset, _, y_train_subset, _ = train_test_split(X_train_tfidf, y_train, test_size=0.8, random_state=42)

In [43]:
# 訓練
classifier = DecisionTreeClassifier(random_state=42)

# parameters grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40],  # 这里添加不同的max_depth值
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# GridSearchCV
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train_subset, y_train_subset)


In [44]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

In [45]:
print(best_params)
print(best_score)

{'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.7141295445396116


In [46]:
best_classifier = DecisionTreeClassifier(criterion=best_params['criterion'],
                                         max_depth=best_params['max_depth'],
                                         min_samples_split=best_params['min_samples_split'],
                                         min_samples_leaf=best_params['min_samples_leaf'],
                                         max_features=best_params['max_features'],
                                         random_state=42)

In [47]:
best_classifier.fit(X_train_tfidf, y_train)

In [48]:
y_pred = best_classifier.predict(X_test_tfidf)

In [50]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Decision Tree: ", accuracy)

Accuracy of DecisionTree:  0.7637970852910727


### Random Forest

In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

In [53]:
classifier = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [54]:
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

In [None]:
best_classifier = RandomForestClassifier(n_estimators=best_params['n_estimators'],
                                         max_depth=best_params['max_depth'],
                                         min_samples_split=best_params['min_samples_split'],
                                         min_samples_leaf=best_params['min_samples_leaf'],
                                         max_features=best_params['max_features'],
                                         random_state=42)

In [None]:
best_classifier.fit(X_train_tfidf, y_train)
y_pred = best_classifier.predict(X_test_tfidf)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Random Forest", accuracy)