# Setup

In [1]:
import asyncio
from collections import Counter
import json
import re
from pathlib import Path

from googletrans import Translator
from langdetect import detect, LangDetectException
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import polars as pl
from sklearn.preprocessing import MinMaxScaler
import spacy
from stopwordsiso import stopwords
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio
from wordcloud import WordCloud

import utils
from utils import PATHS

  import pkg_resources


In [2]:
# Load the spaCy "en_core_web_sm" pipeline once at module level;
# `extract_tokens` feeds its pre-filtered words through `nlp` to get lemmas.
nlp = spacy.load("en_core_web_sm")

# Tools

In [3]:
def extract_tokens(text: str) -> list[str]:
    """Convert raw text into a list of lowercase lemmas.

    Processing steps:
      1. Lowercase the text, then replace every run of non-word characters
         and every run of digits with a single space.
      2. Tokenize the cleaned text with NLTK.
      3. Drop English stopwords and words of one or two characters.
      4. Run the remaining words through the module-level spaCy pipeline
         ``nlp`` and keep the lowercase lemma of every alphabetic token.

    Args:
        text: Raw input text.

    Returns:
        The lemmas in document order (duplicates preserved).
    """
    clean = re.sub(r"[\W]+", " ", text.lower())
    clean = re.sub(r"[\d]+", " ", clean)

    # Build the stopword collection once; calling `stopwords("en")` inside the
    # comprehension re-evaluated it for every single word.
    english_stopwords = stopwords("en")
    words = [
        w
        for w in nltk.word_tokenize(clean)
        if w not in english_stopwords and len(w) > 2
    ]

    doc = nlp(" ".join(words))
    return [token.lemma_.lower() for token in doc if token.is_alpha]

# Load

In [4]:
# Read the games/reviews JSON file from the project data directory and parse it.
games_reviews_path = PATHS.data / "games_reviews.json"
with games_reviews_path.open("rb") as f:
    data = json.loads(f.read())

In [5]:
# Use the first record of the loaded dataset as a working example.
example_data = data[0]

In [6]:
# Inspect the top-level keys of the example record.
example_data.keys()

dict_keys(['title', 'game_details', 'game_stats', 'game_reviews'])

In [7]:
# Count how often each lemma occurs in the example game's long description.
Counter(extract_tokens(example_data["game_details"]["long_description"]))

Counter({'brass': 5,
         'industry': 5,
         'action': 5,
         'birmingham': 4,
         'game': 4,
         'build': 3,
         'network': 3,
         'canal': 3,
         'rail': 3,
         'story': 2,
         'predecessor': 2,
         'develop': 2,
         'establish': 2,
         'era': 2,
         'vps': 2,
         'tile': 2,
         'original': 2,
         'sell': 2,
         'loan': 2,
         'card': 2,
         'wild': 2,
         'economic': 1,
         'strategy': 1,
         'sequel': 1,
         'martin': 1,
         'wallace': 1,
         'masterpiece': 1,
         'tell': 1,
         'compete': 1,
         'entrepreneur': 1,
         'industrial': 1,
         'revolution': 1,
         'offer': 1,
         'arc': 1,
         'experience': 1,
         'effort': 1,
         'exploit': 1,
         'market': 1,
         'demand': 1,
         'play': 1,
         'halves': 1,
         'win': 1,
         'score': 1,
         'count': 1,
         'flip': 1,
 

In [10]:
def recognize_language(text: str) -> str:
    """Detect the language of ``text`` using langdetect.

    Returns whatever ``detect`` reports for the text. If detection raises
    ``LangDetectException``, the function falls back to ``"en"``.
    """
    try:
        return detect(text)
    except LangDetectException:
        # TODO: set special flag
        return "en"

@utils.with_pickle_async()
async def translate_multiple(texts: list[str], lang_from: list[str], lang_to: list[str]) -> list[str]:
    """Translate each text from its source language to its target language.

    The three lists are walked in lockstep with ``zip(..., strict=True)``, so
    a length mismatch raises ``ValueError``. One translation coroutine is
    created per text and all of them are awaited together through
    ``tqdm_asyncio.gather`` while sharing a single ``Translator`` instance.

    NOTE(review): the notebook calls this with an extra ``pickle_path``
    keyword, which is presumably consumed by ``utils.with_pickle_async`` -
    confirm against that decorator.

    Args:
        texts: Texts to translate.
        lang_from: Source language code for each text.
        lang_to: Destination language code for each text.

    Returns:
        The ``.text`` attribute of every translation result.
    """
    async with Translator() as translator:

        async def translate_one(text, src, dest):
            result = await translator.translate(text=text, src=src, dest=dest)
            return result.text

        jobs = [
            translate_one(text, source_lang, target_lang)
            for text, source_lang, target_lang in zip(texts, lang_from, lang_to, strict=True)
        ]
        return await tqdm_asyncio.gather(*jobs)

In [14]:
example_game_reviews = [info["review"] for info in example_data["game_reviews"]]
from_langs =  [recognize_language(r) for r in example_game_reviews]
translactions = await translate_multiple(example_game_reviews, from_langs, ["en"] * len(from_langs), pickle_path=PATHS.pickles / "example_translations")

In [13]:
# Display the list returned by `translate_multiple` for the example reviews.
translactions

["{'review': '2-4 [S]', 'reviewer_country': 'Italy', 'rating': 0.0, 'date': '2025-09-24 17:47:08', 'is_owner': True}",
 "{'review': 'Great game that somehow feels simple, yet every decision you make carries weight\\n', 'reviewer_country': '', 'rating': 8.5, 'date': '2021-02-25 08:42:44', 'is_owner': False}",
 '{\'review\': "If you can play this in 60 minutes you\'re playing too fast.  With my group a a game lasts about 45 minutes per player.\\n\\nKeep pushing up my rating..  This game continues to be my favorite and the favorite of our group.", \'reviewer_country\': \'United States\', \'rating\': 9.4, \'date\': \'2024-07-10 16:55:53\', \'is_owner\': True}',
 "{'review': 'Tabletop', 'reviewer_country': 'United States', 'rating': 0.0, 'date': '2025-02-01 17:35:35', 'is_owner': True}",
 "{'review': 'I only played half a real game, but I was captivated throughout. Would play again.', 'reviewer_country': 'United States', 'rating': 8.7, 'date': '2025-11-10 19:17:57', 'is_owner': False}",
 "{

In [None]:
# Display the raw review entries of the first game record.
data[0]["game_reviews"]

[{'review': '2-4 [S]',
  'reviewer_country': 'Italy',
  'rating': 0.0,
  'date': '2025-09-24 17:47:08',
  'is_owner': True},
 {'review': 'Great game that somehow feels simple, yet every decision you make carries weight\n',
  'reviewer_country': '',
  'rating': 8.5,
  'date': '2021-02-25 08:42:44',
  'is_owner': False},
 {'review': "If you can play this in 60 minutes you're playing too fast.  With my group a a game lasts about 45 minutes per player.\n\nKeep pushing up my rating..  This game continues to be my favorite and the favorite of our group.",
  'reviewer_country': 'United States',
  'rating': 9.4,
  'date': '2024-07-10 16:55:53',
  'is_owner': True},
 {'review': 'Tabletop',
  'reviewer_country': 'United States',
  'rating': 0.0,
  'date': '2025-02-01 17:35:35',
  'is_owner': True},
 {'review': 'I only played half a real game, but I was captivated throughout. Would play again.',
  'reviewer_country': 'United States',
  'rating': 8.7,
  'date': '2025-11-10 19:17:57',
  'is_owner':

In [None]:
def extract_tokens(text: str) -> list[str]:
    """Convert raw text into a list of lowercase lemmas (ENG+PL stopwords).

    Processing steps:
      1. Lowercase the text, then replace every run of non-word characters
         and every run of digits with a single space.
      2. Tokenize the cleaned text with NLTK.
      3. Drop English and Polish stopwords and words of one or two characters.
      4. Run the remaining words through the module-level spaCy pipeline
         ``nlp`` and keep the lowercase lemma of every alphabetic token.

    Args:
        text: Raw input text.

    Returns:
        The lemmas in document order (duplicates preserved).
    """
    clean = re.sub(r"[\W]+", " ", text.lower())
    clean = re.sub(r"[\d]+", " ", clean)

    # `word_difficulty` is never imported in this notebook, so the previous
    # `word_difficulty.STOPWORDS_PL | word_difficulty.STOPWORDS_EN` lookup
    # raised NameError on a fresh kernel. Use the already-imported
    # stopwordsiso helper instead, and build the combined set once rather
    # than once per word.
    # NOTE(review): if a project-specific stopword list was intended here,
    # import that module explicitly in the setup cell.
    stopwords_en_pl = stopwords(["en", "pl"])
    words = [
        w
        for w in nltk.word_tokenize(clean)
        if w not in stopwords_en_pl and len(w) > 2
    ]

    doc = nlp(" ".join(words))
    return [token.lemma_.lower() for token in doc if token.is_alpha]

# Unsupervised Text Classification

In [None]:
# This cell builds pandas DataFrames, so it needs pandas as `pd`; the previous
# `import polars as pl` left `pd` undefined on a fresh kernel.
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

# load data (train and test subsets, without shuffling)
train = fetch_20newsgroups(subset='train', shuffle=False)
test = fetch_20newsgroups(subset='test', shuffle=False)

# parse data to pandas DataFrames
newsgroup_test = pd.DataFrame({'article': test.data, 'class_index': test.target})
newsgroup_train = pd.DataFrame({'article': train.data, 'class_index': train.target})

# load labels with keywords
# NOTE(review): this path is resolved against the current working directory,
# and the recorded run failed with FileNotFoundError - confirm where
# '20newsgroups_keywords.csv' lives before re-running.
labels = pd.read_csv('20newsgroups_keywords.csv', sep=';')

FileNotFoundError: [Errno 2] No such file or directory: '20newsgroups_keywords.csv'