In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, lil_matrix
#from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from nltk import word_tokenize
import nltk
from nlp import load_dataset
import os
# Fetch NLTK's 'punkt' resource up front; presumably needed by the
# word_tokenize calls in the cells below — confirm for the installed NLTK version.
nltk.download('punkt')

In [None]:
# Root folder holding both the dataset loading script and its cache.
DATA_FOLDER = 'data_preprocessed/combined'

# Load the dataset through its loading script; the cache lives in a hidden
# sub-folder next to the script.
dataset = load_dataset(
    os.path.join(DATA_FOLDER, 'de_politik_news.py'),
    cache_dir=os.path.join(DATA_FOLDER, '.de-politic-news'),
)

Create vocabulary

In [None]:
# Collect every distinct lower-cased token that appears in any split.
# NOTE: the vocabulary also covers validation and test texts; tokens that never
# occur in the training split end up as all-zero columns in the train matrix.
word_set = set()
for split in ('train', 'validation', 'test'):
    for text in dataset[split]['text']:
        word_set.update(word_tokenize(text.lower()))

# Map token -> column index. The vocabulary is sorted first because the
# iteration order of a set of strings changes between interpreter runs
# (string hash randomisation), which would shuffle the feature columns on
# every kernel restart and make downstream results non-reproducible.
word_dict = {word: i for i, word in enumerate(sorted(word_set))}

Create bag-of-words (BOW) train vectors

In [None]:
# Bag-of-words count matrix for the training split: one row per document,
# one column per vocabulary entry.
train_texts = dataset['train']['text']

# int32 instead of int8: an 8-bit counter wraps around as soon as a token
# occurs more than 127 times in a single document, corrupting the counts.
# LIL format is used because the matrix is filled one element at a time.
embeddings = lil_matrix((len(train_texts), len(word_dict)), dtype=np.int32)
for i, text in enumerate(train_texts):
    for word in word_tokenize(text.lower()):
        col = word_dict.get(word)
        if col is not None:  # skip tokens that are not in the vocabulary
            embeddings[i, col] += 1

Create BOW validation vectors

In [None]:
# Bag-of-words count matrix for the validation split: one row per document,
# one column per vocabulary entry.
valid_texts = dataset['validation']['text']

# int32 instead of int8: an 8-bit counter wraps around as soon as a token
# occurs more than 127 times in a single document, corrupting the counts.
embeddings_valid = lil_matrix((len(valid_texts), len(word_dict)), dtype=np.int32)
for i, text in enumerate(valid_texts):
    for word in word_tokenize(text.lower()):
        col = word_dict.get(word)
        if col is not None:  # skip tokens that are not in the vocabulary
            embeddings_valid[i, col] += 1

Create BOW test vectors

In [None]:
# Bag-of-words count matrix for the test split: one row per document,
# one column per vocabulary entry.
test_texts = dataset['test']['text']

# int32 instead of int8: an 8-bit counter wraps around as soon as a token
# occurs more than 127 times in a single document, corrupting the counts.
embeddings_test = lil_matrix((len(test_texts), len(word_dict)), dtype=np.int32)
for i, text in enumerate(test_texts):
    for word in word_tokenize(text.lower()):
        col = word_dict.get(word)
        if col is not None:  # skip tokens that are not in the vocabulary
            embeddings_test[i, col] += 1

Initialize and train Random Forest

In [None]:
# Number of trees in the forest.
n_estimators = 20
# Fixed seed so that bootstrap sampling and feature sub-sampling — and hence
# the reported accuracy — are reproducible across runs.
RANDOM_STATE = 42

model = RandomForestClassifier(n_estimators=n_estimators, random_state=RANDOM_STATE)
# Fit on the sparse bag-of-words counts against the training labels.
model = model.fit(embeddings, dataset['train']['class'])

Print accuracy on the test dataset

In [None]:
# Ground-truth labels and model predictions for the test split.
class_test = dataset['test']['class']
model_test = model.predict(embeddings_test)

accuracy = accuracy_score(class_test, model_test)
print(f'accuracy: {accuracy}')

# Number of test samples whose predicted class differs from the true label.
num_non_equal = sum(
    1
    for label, pred in zip(class_test, model_test.tolist())
    if label != pred
)