# Word2Vec Training + Inference

In [23]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC

from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, RocCurveDisplay
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import StratifiedKFold

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from gensim.models import Word2Vec

Load German Word2Vec model:

In [24]:
stop_words = set(stopwords.words('german'))

def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Data Preprocessing

In [25]:
gc_news = pd.read_csv("../../data/gcnews/gc_news_unlabeled.csv")

In [26]:
labeled_data = pd.read_csv('../../data/ground_truth/labeled_data.csv', header=None, names=["text", "label"])
#X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], train_size=100, random_state=24)

print(labeled_data.groupby("label").count())

       text
label      
0       199
1       199


In [27]:
X = labeled_data['text'].apply(preprocess)
gc_news_text = gc_news.Content.apply(preprocess)

# Train Word2Vec

The Word2Vec model is trained with the entire unlabeled `gc_news` text data. 

In [28]:
sentences = [sentence.split() for sentence in gc_news_text]
w2v_model = Word2Vec(sentences, vector_size=300, window=5, min_count=5, workers=4)

# Vectorization

In [29]:
def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(300)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

# Validation

A Logistic Regression is fitted to 100 training observations and tested against the remaining labeled data. This process is repeated with five different random states. 

In [30]:
random_states = [100,200,300,400,500]
output = []
pr_auc_scores = []

X = np.array([vectorize(sentence) for sentence in labeled_data['text']])
y = labeled_data['label']

for rs in random_states:
    train_stats, test_stats, train_labels, test_labels = train_test_split(X, y, train_size=100, stratify=y, random_state=rs)
    log = LogisticRegressionCV(cv=5, max_iter=500, random_state=42)
    log.fit(train_stats, train_labels)
    output.append(classification_report(test_labels, log.predict(test_stats), output_dict=True))
    print("Random state %d: Achieved f1 = %.2f w/ LogReg in training." % (
        rs, 
        classification_report(
            train_labels, 
            log.predict(train_stats), 
            output_dict=True
        )['weighted avg']['f1-score']
    ))
    precision, recall,_= precision_recall_curve(test_labels, log.predict_proba(test_stats)[:,1]) 
    pr_auc = auc(recall, precision)
    # Print the PR-AUC Score 
    print("PR-AUC Score:", pr_auc)
    pr_auc_scores.append(pr_auc)

    
scores = pd.json_normalize(output)

accuracy = (np.mean(scores.accuracy), np.std(scores.accuracy))
precision = np.mean(scores['weighted avg.precision']), np.std(scores['weighted avg.precision'])
recall = np.mean(scores['weighted avg.recall']), np.std(scores['weighted avg.recall'])
f1 = np.mean(scores['weighted avg.f1-score']), np.std(scores['weighted avg.f1-score'])
pr_auc = np.mean(pr_auc_scores), np.std(pr_auc_scores)

print(
"accuracy: \t %.4f +- %.4f \n"
"precision: \t %.4f +- %.4f \n"
"recall: \t %.4f +- %.4f \n"
"f1-score: \t %.4f +- %.4f \n"
"PR-AUC: \t %.4f +- %.4f \n" % (accuracy[0], accuracy[1], 
                                precision[0], precision[1],
                                recall[0], recall[1],
                                f1[0], f1[1],
                                pr_auc[0], pr_auc[1],
                                  ))

Random state 100: Achieved f1 = 0.76 w/ LogReg in training.
PR-AUC Score: 0.6066236040969879
Random state 200: Achieved f1 = 0.65 w/ LogReg in training.
PR-AUC Score: 0.5792685108354013
Random state 300: Achieved f1 = 0.80 w/ LogReg in training.
PR-AUC Score: 0.6228374216330359
Random state 400: Achieved f1 = 0.99 w/ LogReg in training.
PR-AUC Score: 0.5183072412417644
Random state 500: Achieved f1 = 0.92 w/ LogReg in training.
PR-AUC Score: 0.5723897922228874
accuracy: 	 0.5477 +- 0.0287 
precision: 	 0.5501 +- 0.0305 
recall: 	 0.5477 +- 0.0287 
f1-score: 	 0.5431 +- 0.0275 
PR-AUC: 	 0.5799 +- 0.0358 



# Inference

A Logistic Regression is fitted to the entire labeled data. The fitted Regression is used to predict labels for the `gc_news` data. 

In [31]:
X = np.array([vectorize(sentence) for sentence in labeled_data['text']])
y = labeled_data['label']

log = LogisticRegressionCV(cv=5, max_iter=10000, random_state=42)
log.fit(X, y)

print(classification_report(y, log.predict(X)))

              precision    recall  f1-score   support

           0       0.79      0.82      0.81       199
           1       0.82      0.78      0.80       199

    accuracy                           0.80       398
   macro avg       0.80      0.80      0.80       398
weighted avg       0.80      0.80      0.80       398



In [32]:
X_infer = np.array([vectorize(sentence) for sentence in gc_news_text])
y_infer = log.predict(X_infer)
y_proba = log.predict_proba(X_infer)

In [33]:
gc_news["label"] = y_infer
gc_news["proba"] = y_proba[:,1]

Uncomment to export labeled data:

In [34]:
# gc_news.to_csv("gc_news_w2v_labels.csv", index=False)