In [1]:
from datasets import load_dataset
import sentencepiece as spm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import nltk
from nltk.tokenize import sent_tokenize
import random
from datasets import concatenate_datasets
from datasets import Dataset
import requests
from bs4 import BeautifulSoup

## Function For Scraping

In [2]:
def neutral_sentiment_function(urls, container_class="mw-page-container-inner",
                               encoding="UTF-8", timeout=10):
    """Scrape the given pages and return their text as one sentence per row.

    Parameters
    ----------
    urls : iterable of str
        Pages to download.
    container_class : str, optional
        CSS class of the ``<div>`` that holds the article body. The default
        is the selector this notebook uses for its Wikipedia pages; change it
        for sites with a different layout.
    encoding : str, optional
        Encoding forced onto each response before its text is read.
    timeout : float, optional
        Seconds to wait for each request before ``requests`` raises, so a
        stalled server cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        A single ``'text'`` column with one sentence per row. The frame is
        empty when nothing could be scraped.

    Notes
    -----
    The first rows of the result are printed as a quick sanity check.
    NOTE(review): ``sent_tokenize`` is called with its default language
    setting; confirm that it splits Chinese text adequately.
    """
    articles_content = []
    for url in urls:
        response = requests.get(url, timeout=timeout)
        if not response.ok:
            # An error page must not be mined for "neutral" sentences.
            print(f"Skipping {url}: HTTP {response.status_code}")
            continue
        response.encoding = encoding
        soup = BeautifulSoup(response.text, "html.parser")
        article_content = soup.find("div", class_=container_class)
        if article_content is not None:
            articles_content.append(article_content.get_text())

    # Flatten every article into one list of sentences.
    all_sentences = []
    for content in articles_content:
        all_sentences.extend(sent_tokenize(content))

    df = pd.DataFrame(all_sentences, columns=['text'])
    print(df.head())
    return df

## Function For Combining

In [3]:
def combine_data(df,sentiment_type,train_dataset,test_dataset,train_fraction=0.75):
    """Append the sentences in ``df`` to the train/test datasets under one label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'text'`` column. It is only read, never modified.
    sentiment_type : str
        One of ``'negative'`` (label 0), ``'positive'`` (label 1) or
        ``'neutral'`` (label 2).
    train_dataset, test_dataset : datasets.Dataset
        Existing splits to extend; they need ``'label'`` and ``'text'`` columns.
    train_fraction : float, optional
        Share of the new rows, taken from the start of ``df`` in its existing
        order, that goes to the training split. The remainder goes to the
        test split.

    Returns
    -------
    tuple
        ``(train_dataset, test_dataset)``, each extended with its share of
        the new rows and then shuffled with seed 42.

    Raises
    ------
    ValueError
        If ``sentiment_type`` is not a recognised sentiment name. Raising
        here avoids silently labelling such rows as negative.
    """
    label_by_sentiment = {'negative': 0, 'positive': 1, 'neutral': 2}
    if sentiment_type not in label_by_sentiment:
        raise ValueError(
            f"Unknown sentiment_type {sentiment_type!r}; "
            f"expected one of {sorted(label_by_sentiment)}"
        )
    sentiment_number = label_by_sentiment[sentiment_type]

    texts = df['text'].tolist()
    split_index = int(len(texts) * train_fraction)

    def _extend(base_dataset, new_texts):
        # Skip the concatenation when there is nothing to add, so an empty or
        # one-row frame no longer fails; the split is still shuffled.
        if new_texts:
            added = Dataset.from_dict({
                'label': [sentiment_number] * len(new_texts),
                'text': new_texts,
            })
            base_dataset = concatenate_datasets([base_dataset, added])
        return base_dataset.shuffle(seed=42)

    train_dataset = _extend(train_dataset, texts[:split_index])
    test_dataset = _extend(test_dataset, texts[split_index:])

    return train_dataset, test_dataset

## Loading the Datasets

In [None]:
# Primary corpus: supplies the train/test splits used for fitting and the main evaluation.
dataset = load_dataset("sepidmnorozy/Chinese_sentiment")
train_dataset = dataset["train"]
test_dataset = dataset["test"]

# Second corpus, used further down only as an extra test set.
dataset_2 = load_dataset("t1annnnn/Chinese_sentimentAnalyze")
# Read the split from dataset_2; taking it from `dataset` would just duplicate
# test_dataset and leave dataset_2 unused.
# NOTE(review): assumes this corpus exposes a "test" split with 'text' and
# 'label' columns — confirm against the dataset card.
test_dataset_2 = dataset_2["test"]

In [None]:
#print(test_dataset_2[1])

## Including a Neutral Sentiment

In [None]:
# Wikipedia articles scraped as a source of neutral sentences.
# Sentence splitting goes through NLTK's sent_tokenize; if its tokenizer data
# is missing, run nltk.download('punkt') once beforehand.
urls = [
    "https://zh.wikipedia.org/wiki/%E4%B8%AD%E5%9C%8B",
    "https://zh.wikipedia.org/wiki/%E4%B8%AD%E5%8D%8E%E4%BA%BA%E6%B0%91%E5%85%B1%E5%92%8C%E5%9B%BD",
]
df = neutral_sentiment_function(urls)
sentiment_type = 'neutral'
# Always start from the untouched splits in `dataset`, so re-running this cell
# does not append the neutral sentences a second time.
train_dataset,test_dataset = combine_data(df,sentiment_type,dataset["train"],dataset["test"])

## Tokenizing and Vectorizing

In [None]:
# Write the training texts, one per line, to chinese_data.txt for SentencePiece training
with open("chinese_data.txt", "w", encoding="utf-8") as file:
    for example in train_dataset:
        file.write(example["text"] + "\n")

# Train a BPE SentencePiece model on the training texts only
spm.SentencePieceTrainer.Train('--input=chinese_data.txt --model_prefix=chinese_model --vocab_size=8000 --character_coverage=0.9995 --model_type=bpe')

# Load the trained SentencePiece model
sp = spm.SentencePieceProcessor()
sp.Load("chinese_model.model")

# Segment every split into pieces, then join the pieces with spaces so the
# vectorizer can recover them by splitting on whitespace.
tokenized_texts = [sp.EncodeAsPieces(text) for text in train_dataset['text']]
tokenized_test_texts = [sp.EncodeAsPieces(text) for text in test_dataset['text']]
tokenized_test_texts_2 = [sp.EncodeAsPieces(text) for text in test_dataset_2['text']]
joined_texts = [' '.join(tokens) for tokens in tokenized_texts]
joined_test_texts = [' '.join(tokens) for tokens in tokenized_test_texts]
joined_test_texts_2 = [' '.join(tokens) for tokens in tokenized_test_texts_2]

# The default token_pattern keeps only tokens of two or more word characters,
# which would drop every single-character piece. Treat each
# whitespace-separated piece as one token instead.
vectorizer = TfidfVectorizer(ngram_range = (1,3), token_pattern=r"(?u)\S+")

## Creating Feature variables

In [None]:
# Fitting the model and transforming the text data into TF-IDF vectors
X_train = vectorizer.fit_transform(joined_texts)
X_test = vectorizer.transform(joined_test_texts)
X_test_2 = vectorizer.transform(joined_test_texts_2)

## Creating Target Variables

In [None]:
# Labels read as whole columns, in the same row order as the 'text' column
# that was tokenised for each split.
y_train = list(train_dataset['label'])
y_test = list(test_dataset['label'])
y_test_2 = list(test_dataset_2['label'])

## Creating and Testing LogisticRegression model

In [None]:
# Baseline: logistic regression with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and training are chained.
default_model = LogisticRegression().fit(X_train, y_train)

# Score the baseline on the primary test split.
default_predictions = default_model.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=default_predictions))
print(classification_report(y_true=y_test, y_pred=default_predictions))

In [None]:
# Disabled experiment, wrapped in a string so it does not execute:
# l1-penalised logistic regression with C=0.01 and the saga solver.
# The closing quotes were missing, which made this cell a syntax error.
'''model_1 = LogisticRegression(C=0.01, penalty = 'l1', solver = 'saga') #c=0.01

model_1.fit(X_train, y_train)
# Evaluate the model
model_1_predictions = model_1.predict(X_test)
print("Accuracy:", accuracy_score(y_test, model_1_predictions))
print(classification_report(y_test, model_1_predictions))'''

In [None]:
# Disabled experiment, wrapped in a string so it does not execute:
# l1-penalised logistic regression with C=0.1 and the saga solver.
'''model_1 = LogisticRegression(C=0.1, penalty = 'l1', solver = 'saga')#c=0.1

model_1.fit(X_train, y_train)
# Evaluate the model
model_1_predictions = model_1.predict(X_test)
print("Accuracy:", accuracy_score(y_test, model_1_predictions))
print(classification_report(y_test, model_1_predictions))'''

In [None]:
# l2-penalised logistic regression (C=1) fitted with the saga solver.
# saga shuffles the data while fitting, so the seed is pinned to make the
# reported metrics reproducible across reruns.
model_1 = LogisticRegression(C=1, penalty = 'l2', solver = 'saga', random_state=42) #c=1

model_1.fit(X_train, y_train)
# Evaluate the model on the primary test split
model_1_predictions = model_1.predict(X_test)
print("Accuracy:", accuracy_score(y_test, model_1_predictions))
print(classification_report(y_test, model_1_predictions))

In [None]:
# l1-penalised logistic regression (C=1) fitted with the liblinear solver.
# liblinear also shuffles the data, so the seed is pinned for reproducible metrics.
# Note: this rebinds model_1 / model_1_predictions from the previous cell.
model_1 = LogisticRegression(C=1, penalty = 'l1', solver = 'liblinear', random_state=42) #c=1

model_1.fit(X_train, y_train)
# Evaluate the model on the primary test split
model_1_predictions = model_1.predict(X_test)
print("Accuracy:", accuracy_score(y_test, model_1_predictions))
print(classification_report(y_test, model_1_predictions))

In [None]:
# Logistic regression with default settings; this is also the estimator the
# disabled grid search below would receive.
logistic_regression_model = LogisticRegression()

# Disabled hyperparameter search, wrapped in a string so it does not execute:
# 5-fold cross-validated accuracy over C, penalty and solver.
'''param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']  # solvers that support l1 penalty
}

print("gridsearch")
# Setup the grid search
grid_search = GridSearchCV(logistic_regression_model, param_grid, cv=5, scoring='accuracy')

print("fit")
# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)'''

In [None]:
# Train the default-configured logistic regression and predict the primary
# test split. fit() returns the estimator itself, so the two calls are chained.
y_pred = logistic_regression_model.fit(X_train, y_train).predict(X_test)

# Overall accuracy first, then the per-class report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Disabled analysis, wrapped in a string so it does not execute: compares the
# average length of the space-joined test texts that were classified correctly
# against those that were misclassified.
# Fixed relative to the earlier draft: the averages divide by the number of
# texts (not the last loop index), empty groups no longer fail, and the sample
# print is guarded against having fewer than six misclassified texts.
'''incorrect_texts = []
correct_texts = []
incorrect_predictions = []
correct_labels = []

for i in range(len(y_test)):
    if y_pred[i] != y_test[i]:
        incorrect_texts.append(joined_test_texts[i])
        incorrect_predictions.append(y_pred[i])
        correct_labels.append(y_test[i])
    else:
        correct_texts.append(joined_test_texts[i])

sum_incorrect_length = sum(len(text) for text in incorrect_texts)
average_incorrect_length = sum_incorrect_length / len(incorrect_texts) if incorrect_texts else 0.0

sum_correct_length = sum(len(text) for text in correct_texts)
average_correct_length = sum_correct_length / len(correct_texts) if correct_texts else 0.0

print(f"Average correct length: {average_correct_length}")
print(f"Average incorrect length: {average_incorrect_length}")
if len(incorrect_texts) > 5:
    print(incorrect_texts[5])
    print(len(incorrect_texts[5]))'''

In [None]:
# Disabled experiment, wrapped in a string so it does not execute:
# SGDClassifier with hinge loss and a fixed seed, scored on the primary test split.
'''sgd_classifier = SGDClassifier(loss='hinge',  # Use 'hinge' for linear SVM, 'log' for logistic regression
                               max_iter=1000,
                               tol=1e-3,
                               random_state=42)
sgd_classifier.fit(X_train, y_train)

y_pred = sgd_classifier.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))'''

In [None]:
# Disabled experiment, wrapped in a string so it does not execute:
# support-vector classifier with a linear kernel.
'''svm_classifier = SVC(kernel='linear') 
svm_classifier.fit(X_train, y_train)

y_pred = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print(classification_report(y_test, y_pred))'''

# Disabled experiment, wrapped in a string so it does not execute:
# LinearSVC capped at 1000 iterations.
'''linear_svc_model = LinearSVC(max_iter=1000) 
linear_svc_model.fit(X_train, y_train)
y_pred = linear_svc_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))'''

## Naive Bayes

In [None]:
# Disabled experiment, wrapped in a string so it does not execute:
# multinomial Naive Bayes on the TF-IDF features.
# NOTE(review): the import inside the string repeats one already present in
# the notebook's first cell; drop it if this cell is re-enabled.
'''naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)
y_pred_NB = naive_bayes_model.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report

# Calculate accuracy
print("Accuracy:", accuracy_score(y_test, y_pred_NB))

print(classification_report(y_test, y_pred_NB))
'''

## Random Forest

In [None]:
# Disabled experiment, wrapped in a string so it does not execute:
# random forest with 100 trees and a fixed seed.
# The metrics now score y_pred_RF, the forest's own predictions; the earlier
# draft scored y_pred, which holds a different model's predictions.
'''random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)
y_pred_RF = random_forest_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_RF))

print(classification_report(y_test, y_pred_RF))'''

## SGDClassifier

In [None]:
# Linear classifier trained with stochastic gradient descent, using default
# hyperparameters apart from the seed. SGD shuffles the training data, so
# random_state is pinned to keep the reported metrics reproducible.
# Explicit configuration kept for reference:
#sgd_classifier = SGDClassifier(alpha = 0.0001, loss = 'log_loss', learning_rate = 'optimal', random_state=42, penalty = 'l2')
sgd_classifier = SGDClassifier(random_state=42)

sgd_classifier.fit(X_train, y_train)
y_pred = sgd_classifier.predict(X_test)

# accuracy_score and classification_report come from the notebook's import
# cell; no local re-import is needed.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

## Hyperparameter Searching