In [5]:
import pandas as pd
import nltk, re
from nltk.corpus import stopwords


nltk.download('punkt')
nltk.download('stopwords')
words = set(stopwords.words('english'))


def cleandata(file):
    def process_text(text):
        if not isinstance(text, str):
            return []
        text = re.sub(r'[^\w\s]', '', text.lower())
        text = re.sub(r'\d+', '', text)
        tokens = word_tokenize(text)
        return [token for token in tokens if token not in words]

    sentences = file['tweet'].apply(process_text)
    labels = file['label'].values
    mask = sentences.apply(len) > 0
    sentences = sentences[mask].tolist()
    labels = labels[mask]
    print(f"Cleaned data: {len(sentences)} sentences, {len(labels)} labels")
    return sentences, labels

file = pd.read_csv('./data/clean_COVIDSenti.csv')
sentences, labels = cleandata(file)
cleanfile = pd.DataFrame({'tweet': sentences,'label': labels})
cleanfile.to_csv("./data/cleaned.csv", index=False)

[nltk_data] Downloading package punkt to C:\Users\manafeng-
[nltk_data]     local\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\manafeng-
[nltk_data]     local\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cleaned data: 90000 sentences, 90000 labels


In [2]:
import multiprocessing
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


def extract_features(tokens, model_path, vector_size):
        model = Word2Vec.load(model_path)
        vectors = [model.wv[word] for word in tokens if word in model.wv]
        return np.mean(vectors if vectors else [np.zeros(vector_size)], axis=0)


def save_test_features(x, y, method, n_jobs=-1):
    unilabels, counts = np.unique(y, return_counts=True)
    print(dict(zip(unilabels, counts)))
    np.save(f"{method}-features.npy", x)
    np.save(f"{method}-labels.npy", y)
    print(f"{method} 's npy files have saved")

    x = np.array(x)
    y = np.array(y)

    trainx, testx, trainy, testy = train_test_split(x, y, test_size=0.2, random_state=42)

    for estimate in [100, 200, 300, 400, 500]:
        for depth in [5, 10, 20]:
            for feature in ['sqrt', 'log2', 0.3]:
                    rt = RandomForestClassifier(
                    n_estimators = estimate,
                    max_depth = depth,
                    max_features= feature,
                    random_state=42,
                    n_jobs=n_jobs,
                    class_weight='balanced'
                    )
                    rt.fit(trainx, trainy)
                    predy = rt.predict(testx)

                    print(f"Results with Random Forest:")
                    print(f"estimate: {estimate}, depth: {depth}, feature: {feature}")
                    print(f"accuracy: {accuracy_score(testy, predy):.4f}")
                    print("accuracy rate:")
                    print(classification_report(testy, predy, target_names=['Negative', 'Neutral', 'Positive']))




def word2vec(file_path, vector_size=300, window=5, min_count=5):
    file = pd.read_csv(file_path, header=0, names=['tweet', 'label'])
    all_sentences = file['tweet'].tolist()
    labels = file['label'].values

    model = Word2Vec(
        sg =1,
        epochs = 60,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=multiprocessing.cpu_count(),
    )
    model.build_vocab(all_sentences)
    model.train(all_sentences, total_examples=len(all_sentences), epochs=model.epochs)

    model_path = "word2vec.model"
    model.save(model_path)
    print("final model has saved")

    x = [extract_features(tokens, model_path, vector_size) for tokens in all_sentences]
    x = np.array(x)
    print(f"{x.shape} features has extracted")

    save_test_features(x, labels, "word2vec")
    print("word2vec process finished")
    print("now, saving features......")



file = "./data/cleaned.csv"
word2vec(file)


final model has saved
(90000, 300) features has extracted
{-1: 16335, 0: 67385, 1: 6280}
word2vec 's npy files have saved
Results with Random Forest:
estimate: 100, depth: 5, feature: sqrt
accuracy: 0.3987
accuracy rate:
              precision    recall  f1-score   support

    Negative       0.24      0.59      0.34      3257
     Neutral       0.83      0.36      0.50     13478
    Positive       0.11      0.36      0.17      1265

    accuracy                           0.40     18000
   macro avg       0.39      0.44      0.34     18000
weighted avg       0.67      0.40      0.45     18000

Results with Random Forest:
estimate: 100, depth: 5, feature: log2
accuracy: 0.4028
accuracy rate:
              precision    recall  f1-score   support

    Negative       0.24      0.59      0.34      3257
     Neutral       0.83      0.36      0.50     13478
    Positive       0.11      0.36      0.17      1265

    accuracy                           0.40     18000
   macro avg       0.39    