In [6]:
import pandas as pd
import nltk, re
from nltk.corpus import stopwords


nltk.download('punkt')
nltk.download('stopwords')
words = set(stopwords.words('english'))


def cleandata(file):
    def process_text(text):
        if not isinstance(text, str):
            return []
        text = re.sub(r'[^\w\s]', '', text.lower())
        text = re.sub(r'\d+', '', text)
        tokens = nltk.word_tokenize(text)
        return [token for token in tokens if token not in words]

    sentences = file['tweet'].apply(process_text)
    labels = file['label'].values
    mask = sentences.apply(len) > 0
    sentences = sentences[mask].tolist()
    labels = labels[mask]
    print(f"Cleaned data: {len(sentences)} sentences, {len(labels)} labels")
    return sentences, labels

file = pd.read_csv('./data/clean_COVIDSenti.csv')
sentences, labels = cleandata(file)
cleanfile = pd.DataFrame({'tweet': sentences,'label': labels})
cleanfile.to_csv("./data/cleaned.csv", index=False)

[nltk_data] Downloading package punkt to C:\Users\manafeng-
[nltk_data]     local\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\manafeng-
[nltk_data]     local\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cleaned data: 90000 sentences, 90000 labels


In [7]:
import multiprocessing
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


def extract_features(tokens, model_path, vector_size):
        model = Word2Vec.load(model_path)
        vectors = [model.wv[word] for word in tokens if word in model.wv]
        return np.mean(vectors if vectors else [np.zeros(vector_size)], axis=0)


def save_test_features(x, y, method, n_jobs=-1):
    unilabels, counts = np.unique(y, return_counts=True)
    print(dict(zip(unilabels, counts)))
    np.save(f"{method}-features.npy", x)
    np.save(f"{method}-labels.npy", y)
    print(f"{method} 's npy files have saved")

    x = np.array(x)
    y = np.array(y)

    trainx, testx, trainy, testy = train_test_split(x, y, test_size=0.2, random_state=42)

    for estimate in [100, 200, 300, 400, 500]:
        for depth in [5, 10, 20]:
            for feature in ['sqrt', 'log2', 0.3]:
                    rt = RandomForestClassifier(
                    n_estimators = estimate,
                    max_depth = depth,
                    max_features= feature,
                    random_state=42,
                    n_jobs=n_jobs
                    )
                    rt.fit(trainx, trainy)
                    predy = rt.predict(testx)

                    print(f"Results with Random Forest:")
                    print(f"estimate: {estimate}, depth: {depth}, feature: {feature}")
                    print(f"accuracy: {accuracy_score(testy, predy):.4f}")
                    print("accuracy rate:")
                    print(classification_report(testy, predy, target_names=['Negative', 'Neutral', 'Positive']))
     
model_path = "word2vec.model"
vector_size = 300

def word2vec(file_path, vector_size=300, window=5, min_count=5):
    file = pd.read_csv(file_path, header=0, names=['tweet', 'label'])
    all_sentences = file['tweet'].tolist()
    labels = file['label'].values

    model = Word2Vec(
        sg =1,
        epochs = 60,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=multiprocessing.cpu_count(),
    )
    model.build_vocab(all_sentences)
    model.train(all_sentences, total_examples=len(all_sentences), epochs=model.epochs)

    model.save(model_path)





file_path = "./data/cleaned.csv"
word2vec(file_path)

file = pd.read_csv(file_path, header=0, names=['tweet', 'label'])
all_sentences = file['tweet'].tolist()

x = [extract_features(tokens, model_path, vector_size) for tokens in all_sentences]
x = np.array(x)
print(f"{x.shape} features has extracted")
print(x)


(90000, 300) features has extracted
[[ 3.7541820e-05 -6.4236864e-02 -3.4089450e-02 ... -1.9345972e-01
   2.0742174e-02 -1.8445931e-03]
 [-6.9524585e-03 -6.0245410e-02 -2.1768725e-02 ... -1.5030876e-01
   4.1639283e-03 -1.7745428e-04]
 [ 3.5694058e-03 -5.4090854e-02 -2.7257336e-02 ... -1.6944167e-01
   2.0664169e-02 -4.4091153e-03]
 ...
 [-1.3736335e-03 -5.6794941e-02 -2.6529003e-02 ... -1.6168427e-01
   6.7730737e-03  1.5622873e-03]
 [ 2.8405252e-03 -6.1024889e-02 -2.5155567e-02 ... -1.7874061e-01
   1.3277697e-02 -1.6294901e-03]
 [ 1.2770551e-02 -4.8609763e-02 -1.9474313e-02 ... -1.7441468e-01
   1.0854853e-02  9.0264603e-03]]


In [8]:
print(len(x[0]))

def five_fold_validation(X, y):

    folds = len(X) // 5
    for i in range(5):
        start = i * folds
        end = start + folds if i < 4 else len(X)
        x_train = np.concatenate((X[:start], X[end:]))
        y_train = np.concatenate((y[:start], y[end:]))
        x_test = X[start:end]
        y_test = y[start:end]

        rf = RandomForestClassifier(
            n_estimators=500,
            max_depth=20,
            max_features=0.3,
            random_state=42,
            n_jobs=-1
        )

        rf.fit(x_train, y_train)
        predy = rf.predict(x_test)
        print(f"Fold {i + 1} results:")
        print(f"accuracy: {accuracy_score(y_test, predy):.4f}")

five_fold_validation(x, labels)

300
Fold 1 results:
accuracy: 0.7711
Fold 2 results:
accuracy: 0.7636
Fold 3 results:
accuracy: 0.7566
Fold 4 results:
accuracy: 0.7381
Fold 5 results:
accuracy: 0.7344
