# Setup

In [1]:
import asyncio
from collections import Counter
import json
import re
from pathlib import Path

from googletrans import Translator
from langdetect import detect, LangDetectException
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import polars as pl
from sklearn.preprocessing import MinMaxScaler
import spacy
from stopwordsiso import stopwords
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio
from wordcloud import WordCloud

import utils
from utils import PATHS

  import pkg_resources


In [2]:
# One-time setup: uncomment on a fresh environment to download the VADER lexicon
# (a SentimentIntensityAnalyzer from nltk.sentiment.vader is created later on).
# nltk.download('vader_lexicon')
# Small English spaCy pipeline; extract_tokens uses it to lemmatize tokens.
nlp = spacy.load("en_core_web_sm")

# Tools

In [3]:
def extract_tokens(text: str) -> list[str]:
    """Turn raw text into a list of lowercase lemmas.

    Steps:
      1. Lowercase the text, then replace runs of non-word characters and
         runs of digits with a single space (the patterns are Unicode-aware,
         so accented letters are kept).
      2. Tokenize the cleaned text with NLTK.
      3. Drop English stopwords and every word of 1-2 characters.
      4. Lemmatize the remaining words with the spaCy pipeline ``nlp`` and
         keep only alphabetic tokens.

    Args:
        text: Arbitrary input text.

    Returns:
        Lowercase lemmas in order of appearance (empty list for empty input).
    """
    clean = re.sub(r"[\W]+", " ", text.lower())
    clean = re.sub(r"[\d]+", " ", clean)
    # Fetch the stopword set once per call; the previous version called
    # stopwords("en") again for every single word in the filter below.
    stop_en = stopwords("en")
    words = [w for w in nltk.word_tokenize(clean) if len(w) > 2 and w not in stop_en]
    doc = nlp(" ".join(words))
    return [token.lemma_.lower() for token in doc if token.is_alpha]

# Load

In [4]:
# Read the reviews dump from the project data directory into memory.
reviews_file = PATHS.data / "games_reviews.json"
with reviews_file.open("rb") as f:
    data = json.load(f)

In [5]:
# Use the first record of the dump as the working example for the cells below.
example_data = data[0]

In [6]:
# Inspect the top-level fields of a single record.
example_data.keys()

dict_keys(['title', 'game_details', 'game_stats', 'game_reviews'])

In [7]:
# Lemma frequencies in the example record's long description.
Counter(extract_tokens(example_data["game_details"]["long_description"]))

Counter({'brass': 5,
         'industry': 5,
         'action': 5,
         'birmingham': 4,
         'game': 4,
         'build': 3,
         'network': 3,
         'canal': 3,
         'rail': 3,
         'story': 2,
         'predecessor': 2,
         'develop': 2,
         'establish': 2,
         'era': 2,
         'vps': 2,
         'tile': 2,
         'original': 2,
         'sell': 2,
         'loan': 2,
         'card': 2,
         'wild': 2,
         'economic': 1,
         'strategy': 1,
         'sequel': 1,
         'martin': 1,
         'wallace': 1,
         'masterpiece': 1,
         'tell': 1,
         'compete': 1,
         'entrepreneur': 1,
         'industrial': 1,
         'revolution': 1,
         'offer': 1,
         'arc': 1,
         'experience': 1,
         'effort': 1,
         'exploit': 1,
         'market': 1,
         'demand': 1,
         'play': 1,
         'halves': 1,
         'win': 1,
         'score': 1,
         'count': 1,
         'flip': 1,
 

In [8]:
def recognize_language(text: str) -> str:
    """Return the language code that ``langdetect.detect`` reports for ``text``.

    When detection raises ``LangDetectException`` the function falls back
    to ``"en"`` instead of propagating the error.
    """
    try:
        return detect(text)
    except LangDetectException:
        # TODO: set special flag instead of pretending the text is English
        return "en"

@utils.with_pickle_async()
async def translate_multiple(texts: list[str], lang_from: list[str], lang_to: list[str]) -> list[str]:
    """Translate many texts concurrently with a single ``Translator`` session.

    The three lists are consumed in lockstep (``zip(..., strict=True)``), so
    they must have equal length or a ``ValueError`` is raised.

    Args:
        texts: Texts to translate.
        lang_from: Source language code for each text.
        lang_to: Target language code for each text.

    Returns:
        The ``.text`` of each translation result, as gathered by
        ``tqdm_asyncio.gather`` (which also shows a progress bar).

    NOTE(review): the ``utils.with_pickle_async`` decorator is defined outside
    this notebook; callers pass an extra ``pickle_path`` keyword, so it
    presumably caches the result on disk — confirm in ``utils``.
    """
    async with Translator() as translator:

        async def _translate(text, src, dest):
            result = await translator.translate(text=text, src=src, dest=dest)
            return result.text

        # All requests are created up front and awaited together.
        jobs = [
            _translate(text, src, dest)
            for text, src, dest in zip(texts, lang_from, lang_to, strict=True)
        ]
        return await tqdm_asyncio.gather(*jobs)

In [9]:
example_game_reviews = [info["review"] for info in example_data["game_reviews"]]
from_langs =  [recognize_language(r) for r in example_game_reviews]
translactions = await translate_multiple(example_game_reviews, from_langs, ["en"] * len(from_langs), pickle_path=PATHS.pickles / "example_translations")

100%|██████████| 3800/3800 [00:08<00:00, 423.20it/s]


In [10]:
# Original review texts, before translation.
example_game_reviews

['2-4 [S]',
 'Great game that somehow feels simple, yet every decision you make carries weight\n',
 "If you can play this in 60 minutes you're playing too fast.  With my group a a game lasts about 45 minutes per player.\n\nKeep pushing up my rating..  This game continues to be my favorite and the favorite of our group.",
 'Tabletop',
 'I only played half a real game, but I was captivated throughout. Would play again.',
 'Ship',
 "The more I play, the more I realize it's more about playing the hand you're dealt, and less about strategy",
 'Very good game. I might not say it is the best of all times, but it is definetely great. The mechanics are simple, but I consider that it takes a little time to adapt to them. You may need several playthroughs to achieve some sort of mastering. Revise every victory point possibility to plan your strategy, otherwise, it can be easy to fall pursuing things that are not profitable in the long term.',
 "+ Artwork is great\n+ Great strategy game\n+ Lots of

In [11]:
# The same reviews after translate_multiple was asked to translate them to "en".
translactions

['2-4 [S]',
 'Great game that somehow feels simple, yet every decision you make carries weight',
 "If you can play this in 60 minutes you're playing too fast.  With my group a a game lasts about 45 minutes per player.\n\nKeep pushing up my rating..  This game continues to be my favorite and the favorite of our group.",
 'Table top',
 'I only played half a real game, but I was captivated throughout. Would play again.',
 'SGI',
 "The more I play, the more I realize it's more about playing the hand you're dealt, and less about strategy",
 'Very good game. I might not say it is the best of all times, but it is definetely great. The mechanics are simple, but I consider that it takes a little time to adapt to them. You may need several playthroughs to achieve some sort of mastering. Revise every victory point possibility to plan your strategy, otherwise, it can be easy to fall pursuing things that are not profitable in the long term.',
 "+ Artwork is great\n+ Great strategy game\n+ Lots of i

In [12]:
def extract_tokens(text: str) -> list[str]:
    """Turn raw text into lowercase lemmas, filtering English and Polish stopwords.

    This redefines the English-only ``extract_tokens`` declared earlier in
    the notebook; from this cell on, the ENG+PL variant is the one in effect.

    Steps:
      1. Lowercase the text, then replace runs of non-word characters and
         runs of digits with a single space (the patterns are Unicode-aware,
         so Polish diacritics are kept).
      2. Tokenize the cleaned text with NLTK.
      3. Drop English and Polish stopwords and every word of 1-2 characters.
      4. Lemmatize the remaining words with the spaCy pipeline ``nlp`` and
         keep only alphabetic tokens.

    Args:
        text: Arbitrary input text.

    Returns:
        Lowercase lemmas in order of appearance (empty list for empty input).
    """
    clean = re.sub(r"[\W]+", " ", text.lower())
    clean = re.sub(r"[\d]+", " ", clean)
    # The previous version read word_difficulty.STOPWORDS_PL / STOPWORDS_EN, but
    # `word_difficulty` is never imported in this notebook, so every call raised
    # NameError. Use the stopwordsiso helper that is already imported instead,
    # and build the combined set once rather than once per word.
    # NOTE(review): the stopwordsiso lists may differ from the project's own
    # word_difficulty lists — confirm this is acceptable.
    stop_words = stopwords("en") | stopwords("pl")
    words = [w for w in nltk.word_tokenize(clean) if len(w) > 2 and w not in stop_words]
    doc = nlp(" ".join(words))
    return [token.lemma_.lower() for token in doc if token.is_alpha]

### Sentiment analysis

In [13]:
# Shared VADER analyzer instance, created once and reused by get_sentiment.
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Classify ``text`` by the analyzer's ``compound`` polarity score.

    Returns:
        1 when the compound score is >= 0.05, -1 when it is <= -0.05,
        and 0 for anything in between.
    """
    compound = analyzer.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 1
    if compound <= -0.05:
        return -1
    return 0

In [14]:
# Score every translated review, then show (sentiment, text) pairs side by side.
sentiments = [get_sentiment(review) for review in translactions]
list(zip(sentiments, translactions))

[(0, '2-4 [S]'),
 (1,
  'Great game that somehow feels simple, yet every decision you make carries weight'),
 (1,
  "If you can play this in 60 minutes you're playing too fast.  With my group a a game lasts about 45 minutes per player.\n\nKeep pushing up my rating..  This game continues to be my favorite and the favorite of our group."),
 (1, 'Table top'),
 (1,
  'I only played half a real game, but I was captivated throughout. Would play again.'),
 (0, 'SGI'),
 (1,
  "The more I play, the more I realize it's more about playing the hand you're dealt, and less about strategy"),
 (1,
  'Very good game. I might not say it is the best of all times, but it is definetely great. The mechanics are simple, but I consider that it takes a little time to adapt to them. You may need several playthroughs to achieve some sort of mastering. Revise every victory point possibility to plan your strategy, otherwise, it can be easy to fall pursuing things that are not profitable in the long term.'),
 (1,
 

# Unsupervised Text Classification

In [15]:
import polars as pl
from sklearn.datasets import fetch_20newsgroups

# load data (the corpus is downloaded when it is not cached locally, so this
# step needs network access; an HTTP error here comes from that download)
train = fetch_20newsgroups(subset='train', shuffle=False)
test = fetch_20newsgroups(subset='test', shuffle=False)

# parse data to polars DataFrames
# (the previous version called `pd.DataFrame`, but `pd` is never imported in
# this notebook; polars is the DataFrame library imported here)
newsgroup_test = pl.DataFrame({'article': test.data, 'class_index': test.target})
newsgroup_train = pl.DataFrame({'article': train.data, 'class_index': train.target})

# load labels with keywords (semicolon-separated file)
labels = pl.read_csv('20newsgroups_keywords.csv', separator=';')



HTTPError: HTTP Error 403: Forbidden