In [22]:
from datasets import load_dataset
import sentencepiece as spm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

## Loading the Datasets

In [2]:
# Load the dataset
dataset = load_dataset("t1annnnn/Chinese_sentimentAnalyze")
train_dataset = dataset["train"]
test_dataset = dataset["test"]

## Tokenizing and Vectorizing

In [3]:
# Create and write to chinese_data.txt for SnetencePiece training
with open("chinese_data.txt", "w", encoding="utf-8") as file:
    for example in train_dataset:
        file.write(example["text"] + "\n")

#Training the data set model:
spm.SentencePieceTrainer.Train('--input=chinese_data.txt --model_prefix=chinese_model --vocab_size=8000 --character_coverage=0.9995 --model_type=bpe')

# Load the trained SentencePiece model
sp = spm.SentencePieceProcessor()
sp.Load("chinese_model.model")

# Tokenize all texts in the training dataset
tokenized_texts = [sp.EncodeAsPieces(text) for text in train_dataset['text']]
#for testing
tokenized_test_texts = [sp.EncodeAsPieces(text) for text in test_dataset['text']]

# Converting token lists back to strings
joined_texts = [' '.join(tokens) for tokens in tokenized_texts]
# Joining the tokens into strings for each document in the test set
joined_test_texts = [' '.join(tokens) for tokens in tokenized_test_texts]

# Creating a TfidfVectorizer object



IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [17]:
vectorizer = TfidfVectorizer(ngram_range = (1,3))

## Creating Feature variables

In [18]:
# Fitting the model and transforming the text data into TF-IDF vectors
X_train = vectorizer.fit_transform(joined_texts)
X_test = vectorizer.transform(joined_test_texts)

In [5]:
'''

# Calculate the sum of TF-IDF scores for each feature (word)
sum_tfidf = np.array(X.sum(axis=0)).flatten()
#print(sum_tfidf)
# Calculate the average by dividing by the number of documents
average_tfidf = sum_tfidf / X.shape[0]

# Get feature names to map indices to words
feature_names = vectorizer.get_feature_names_out()

# Map from feature name to average tf-idf score
word_to_avg_score = {feature_names[i]: average_tfidf[i] for i in range(len(feature_names))}
'''

'\n\n# Calculate the sum of TF-IDF scores for each feature (word)\nsum_tfidf = np.array(X.sum(axis=0)).flatten()\n#print(sum_tfidf)\n# Calculate the average by dividing by the number of documents\naverage_tfidf = sum_tfidf / X.shape[0]\n\n# Get feature names to map indices to words\nfeature_names = vectorizer.get_feature_names_out()\n\n# Map from feature name to average tf-idf score\nword_to_avg_score = {feature_names[i]: average_tfidf[i] for i in range(len(feature_names))}\n'

## Creating Target Variables

In [19]:
y_train = [data['label'] for data in train_dataset]
y_test = [data['label'] for data in test_dataset]

## Creating and Testing LogisticRegression model

In [20]:
from sklearn.linear_model import LogisticRegression

logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print(classification_report(y_test, y_pred))

Accuracy: 0.7650599113640094
              precision    recall  f1-score   support

           0       0.76      0.78      0.77      9089
           1       0.78      0.75      0.76      9188

    accuracy                           0.77     18277
   macro avg       0.77      0.77      0.77     18277
weighted avg       0.77      0.77      0.77     18277



In [37]:
incorrect_texts = []
correct_texts = []
incorrect_predictions = []
correct_labels = []

for i in range(len(y_test)):
    if y_pred[i] != y_test[i]:
        incorrect_texts.append(joined_test_texts[i])
        incorrect_predictions.append(y_pred[i])
        correct_labels.append(y_test[i])
    else:
        correct_texts.append(joined_test_texts[i])

average_incorrect_length = []
sum_incorrect_length = 0
for j in range(len(incorrect_texts)):
    sum_incorrect_length = sum_incorrect_length + len(incorrect_texts[j])

average_incorrect_length = sum_incorrect_length/j

average_correct_length = []
sum_correct_length = 0
for h in range(len(correct_texts)):
    sum_correct_length = sum_correct_length + len(correct_texts[h])

average_correct_length = sum_correct_length/h

print(f"Average correct length: {average_correct_length}")
print(f"Average incorrect length: {average_incorrect_length}")
print(incorrect_texts[5])
print(len(incorrect_texts[5]))

Average correct length: 76.6537824803858
Average incorrect length: 74.46622546782291
▁我 现在 ▁ 之 树 今天 也 阳光 明 媚 ▁我在这里
29


In [23]:
sgd_classifier = SGDClassifier(loss='hinge',  # Use 'hinge' for linear SVM, 'log' for logistic regression
                               max_iter=1000,
                               tol=1e-3,
                               random_state=42)
sgd_classifier.fit(X_train, y_train)

y_pred = sgd_classifier.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7601904032390436
              precision    recall  f1-score   support

           0       0.74      0.81      0.77      9089
           1       0.79      0.71      0.75      9188

    accuracy                           0.76     18277
   macro avg       0.76      0.76      0.76     18277
weighted avg       0.76      0.76      0.76     18277



In [8]:
'''svm_classifier = SVC(kernel='linear') 
svm_classifier.fit(X_train, y_train)

y_pred = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print(classification_report(y_test, y_pred))'''

linear_svc_model = LinearSVC(max_iter=1000) 
linear_svc_model.fit(X_train, y_train)
y_pred = linear_svc_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



Accuracy: 0.7617223833233026
              precision    recall  f1-score   support

           0       0.75      0.78      0.76      9089
           1       0.77      0.74      0.76      9188

    accuracy                           0.76     18277
   macro avg       0.76      0.76      0.76     18277
weighted avg       0.76      0.76      0.76     18277



## Naive Bayes

In [9]:
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)
y_pred_NB = naive_bayes_model.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report

# Calculate accuracy
print("Accuracy:", accuracy_score(y_test, y_pred_NB))

print(classification_report(y_test, y_pred_NB))


Accuracy: 0.7607375389834218
              precision    recall  f1-score   support

           0       0.76      0.76      0.76      9089
           1       0.76      0.76      0.76      9188

    accuracy                           0.76     18277
   macro avg       0.76      0.76      0.76     18277
weighted avg       0.76      0.76      0.76     18277



## Random Forest

In [10]:
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)  
random_forest_model.fit(X_train, y_train)
y_pred_RF = random_forest_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

KeyboardInterrupt: 