In [31]:
import sqlite3
import pandas as pd

# Open the local SQLite file; `conn` is reused further down when the
# DataFrame is written back.
conn = sqlite3.connect("scraped_data.db")

# Load the complete `reviews` table into a DataFrame.
df = pd.read_sql_query("SELECT * FROM reviews", conn)
df.shape  # (rows, columns) of the loaded table

(6945, 13)

In [32]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

# Build a bare English language object and keep a handle on its tokenizer;
# `tokenize_text` below calls this module-level `tokenizer`.
nlp = English()
tokenizer = nlp.tokenizer

def tokenize_text(text):
    """Split ``text`` into its surface-form tokens.

    Uses the module-level ``tokenizer``. Any value that is not a ``str``
    yields an empty list instead of raising.

    Returns:
        list[str]: the text of every token, in order.
    """
    if not isinstance(text, str):
        return []
    return [tok.text for tok in tokenizer(text)]

# Tokenize every review body; entries that are not strings become [].
df['Tokenized_Long_Text'] = df['Long_Text'].apply(tokenize_text)


In [33]:
# Preview the first rows to check the new Tokenized_Long_Text column.
df.head()

Unnamed: 0,ID,URL,Scraped_at,Author,Posted,Modified,Title,Subtitle,Score,Verdict,Review_Text,Long_Text,Quotes,Tokenized_Long_Text
0,aad9a4c3-8a9b-4789-b239-8d6872da6b7e,https://www.ign.com/articles/arcs-board-game-r...,2025-03-30 15:20:27,Matt Thrower,2024-11-13T15:10:01.708Z,2024-11-13T15:20:56.240Z,Arcs Board Game Review,A deviously deep game of space conquest.,10.0,Arcs is not the first game to try and balance ...,Arcs is not the first game to try and balance ...,"Space conquest games are ten a penny, includin...",,"[Space, conquest, games, are, ten, a, penny, ,..."
1,3b738a45-14a4-4e9f-9c50-c37458bb5844,https://www.ign.com/articles/elden-ring-shadow...,2025-03-30 15:20:29,Mitchell Saltzman,2024-06-18T14:00:00.000Z,2024-06-25T22:46:45.861Z,Elden Ring: Shadow of the Erdtree DLC Review,Untarnished.,10.0,FromSoftware says Shadow of the Erdtree is the...,"Like the base game did before it, Elden Ring: ...","When I gave Elden Ring a 10 two years ago, I d...","According to our count, there are more than 40...","[When, I, gave, Elden, Ring, a, 10, two, years..."
2,85dfd49f-59ce-45cb-8b86-657c001455de,https://www.ign.com/articles/stardew-valley-re...,2025-03-30 15:20:30,Shailyn Cotten,2024-05-07T16:00:00.000Z,2024-07-12T17:14:15.989Z,Stardew Valley Review - 2024,Eight years of impressive updates have grown S...,10.0,Stardew Valley is not only the best farming ga...,"More than just a cozy farming sim, eight years...",Editor's Note: This review takes a fresh look ...,Multitasking efficiently is a deceptively tens...,"[Editor, 's, Note, :, This, review, takes, a, ..."
3,dc47b511-7668-4107-b7c0-8c63266ac963,https://www.ign.com/articles/asgards-wrath-2-r...,2025-03-30 15:20:31,Travis Northup,2023-12-14T14:00:00.000Z,2023-12-14T20:40:01.659Z,Asgard’s Wrath 2 Review,This open-world action RPG sets a new gold sta...,10.0,Asgard’s Wrath 2 is the full-fledged VR open-w...,Asgard’s Wrath 2 is an open-world action RPG t...,For those who have been patiently waiting on t...,This sequel improves and expands upon just abo...,"[For, those, who, have, been, patiently, waiti..."
4,003f6d65-549f-477c-822b-6f57a8a9105e,https://www.ign.com/articles/baldurs-gate-3-re...,2025-03-30 15:20:32,Leana Hafer,2023-08-18T23:32:25.376Z,2024-01-31T19:38:28.947Z,Baldur's Gate 3 Review,A new high-water mark for CRPGs with satisfyin...,10.0,I don't want to say every CRPG going forward s...,"With crunchy, tactical RPG combat, a memorable...","Every once in a long while, a game comes along...","This is a world that will rarely tell you, 'No...","[Every, once, in, a, long, while, ,, a, game, ..."


Weitere Vorverarbeitung:
- Kleinbuchstaben
- Lemmatisierung
- Stoppwörter entfernen
- Zahlen und Satzzeichen entfernen

In [34]:
import spacy
from tqdm.notebook import tqdm

# Register `progress_apply` on pandas objects so the long apply below shows a
# progress bar.
tqdm.pandas()


# Load the spaCy model used by `preprocess_text`. NOTE: this rebinds `nlp`; the
# `tokenizer` created in the earlier cell still refers to the bare English object.
nlp = spacy.load("en_core_web_sm")


def preprocess_text(text):
    """Normalise ``text`` into a list of lemmas.

    The text is lowercased and run through the module-level spaCy pipeline
    ``nlp``. Only purely alphabetic tokens that are not stop words are kept
    (this drops numbers and punctuation), and each kept token is replaced by
    its lemma. Any value that is not a ``str`` yields an empty list.

    Returns:
        list[str]: lemmas of the retained tokens, in document order.
    """
    if isinstance(text, str):
        lemmas = []
        for tok in nlp(text.lower()):  # lowercase before analysis
            # Skip numbers/punctuation (non-alphabetic) and stop words.
            if not tok.is_alpha or tok.is_stop:
                continue
            lemmas.append(tok.lemma_)  # lemmatisation
        return lemmas
    return []

# Run the preprocessing over every review body; `progress_apply` is the
# progress-bar variant of `apply` registered by `tqdm.pandas()` above.
df["Preprocessed_Long_Text"] = df["Long_Text"].progress_apply(preprocess_text)


  0%|          | 0/6945 [00:00<?, ?it/s]

In [35]:
# Preview the first rows to check the new Preprocessed_Long_Text column.
df.head()

Unnamed: 0,ID,URL,Scraped_at,Author,Posted,Modified,Title,Subtitle,Score,Verdict,Review_Text,Long_Text,Quotes,Tokenized_Long_Text,Preprocessed_Long_Text
0,aad9a4c3-8a9b-4789-b239-8d6872da6b7e,https://www.ign.com/articles/arcs-board-game-r...,2025-03-30 15:20:27,Matt Thrower,2024-11-13T15:10:01.708Z,2024-11-13T15:20:56.240Z,Arcs Board Game Review,A deviously deep game of space conquest.,10.0,Arcs is not the first game to try and balance ...,Arcs is not the first game to try and balance ...,"Space conquest games are ten a penny, includin...",,"[Space, conquest, games, are, ten, a, penny, ,...","[space, conquest, game, penny, include, good, ..."
1,3b738a45-14a4-4e9f-9c50-c37458bb5844,https://www.ign.com/articles/elden-ring-shadow...,2025-03-30 15:20:29,Mitchell Saltzman,2024-06-18T14:00:00.000Z,2024-06-25T22:46:45.861Z,Elden Ring: Shadow of the Erdtree DLC Review,Untarnished.,10.0,FromSoftware says Shadow of the Erdtree is the...,"Like the base game did before it, Elden Ring: ...","When I gave Elden Ring a 10 two years ago, I d...","According to our count, there are more than 40...","[When, I, gave, Elden, Ring, a, 10, two, years...","[give, elden, ring, year, ago, incredible, gam..."
2,85dfd49f-59ce-45cb-8b86-657c001455de,https://www.ign.com/articles/stardew-valley-re...,2025-03-30 15:20:30,Shailyn Cotten,2024-05-07T16:00:00.000Z,2024-07-12T17:14:15.989Z,Stardew Valley Review - 2024,Eight years of impressive updates have grown S...,10.0,Stardew Valley is not only the best farming ga...,"More than just a cozy farming sim, eight years...",Editor's Note: This review takes a fresh look ...,Multitasking efficiently is a deceptively tens...,"[Editor, 's, Note, :, This, review, takes, a, ...","[editor, note, review, take, fresh, look, star..."
3,dc47b511-7668-4107-b7c0-8c63266ac963,https://www.ign.com/articles/asgards-wrath-2-r...,2025-03-30 15:20:31,Travis Northup,2023-12-14T14:00:00.000Z,2023-12-14T20:40:01.659Z,Asgard’s Wrath 2 Review,This open-world action RPG sets a new gold sta...,10.0,Asgard’s Wrath 2 is the full-fledged VR open-w...,Asgard’s Wrath 2 is an open-world action RPG t...,For those who have been patiently waiting on t...,This sequel improves and expands upon just abo...,"[For, those, who, have, been, patiently, waiti...","[patiently, wait, sideline, vr, reason, couch,..."
4,003f6d65-549f-477c-822b-6f57a8a9105e,https://www.ign.com/articles/baldurs-gate-3-re...,2025-03-30 15:20:32,Leana Hafer,2023-08-18T23:32:25.376Z,2024-01-31T19:38:28.947Z,Baldur's Gate 3 Review,A new high-water mark for CRPGs with satisfyin...,10.0,I don't want to say every CRPG going forward s...,"With crunchy, tactical RPG combat, a memorable...","Every once in a long while, a game comes along...","This is a world that will rarely tell you, 'No...","[Every, once, in, a, long, while, ,, a, game, ...","[long, game, come, memorable, exciting, fresh,..."


In [None]:
# SQLite cannot store Python lists, so each token list is flattened into one
# space-separated string. Values that are already strings are kept unchanged so
# that re-running this cell does not blank the column; anything else becomes "".
df["Preprocessed_Long_Text"] = df["Preprocessed_Long_Text"].apply(
    lambda tokens: " ".join(tokens)
    if isinstance(tokens, list)
    else (tokens if isinstance(tokens, str) else "")
)

In [39]:
# Flatten each token list into one space-separated string so the column can be
# written to SQLite. Values that are already strings are kept unchanged so that
# re-running this cell does not blank the column; anything else becomes "".
df["Tokenized_Long_Text"] = df["Tokenized_Long_Text"].apply(
    lambda tokens: " ".join(tokens)
    if isinstance(tokens, list)
    else (tokens if isinstance(tokens, str) else "")
)


In [40]:
# Preview the first rows: both token columns should now hold plain strings.
df.head()

Unnamed: 0,ID,URL,Scraped_at,Author,Posted,Modified,Title,Subtitle,Score,Verdict,Review_Text,Long_Text,Quotes,Tokenized_Long_Text,Preprocessed_Long_Text
0,aad9a4c3-8a9b-4789-b239-8d6872da6b7e,https://www.ign.com/articles/arcs-board-game-r...,2025-03-30 15:20:27,Matt Thrower,2024-11-13T15:10:01.708Z,2024-11-13T15:20:56.240Z,Arcs Board Game Review,A deviously deep game of space conquest.,10.0,Arcs is not the first game to try and balance ...,Arcs is not the first game to try and balance ...,"Space conquest games are ten a penny, includin...",,"Space conquest games are ten a penny , includi...",space conquest game penny include good war boa...
1,3b738a45-14a4-4e9f-9c50-c37458bb5844,https://www.ign.com/articles/elden-ring-shadow...,2025-03-30 15:20:29,Mitchell Saltzman,2024-06-18T14:00:00.000Z,2024-06-25T22:46:45.861Z,Elden Ring: Shadow of the Erdtree DLC Review,Untarnished.,10.0,FromSoftware says Shadow of the Erdtree is the...,"Like the base game did before it, Elden Ring: ...","When I gave Elden Ring a 10 two years ago, I d...","According to our count, there are more than 40...","When I gave Elden Ring a 10 two years ago , I ...",give elden ring year ago incredible game raise...
2,85dfd49f-59ce-45cb-8b86-657c001455de,https://www.ign.com/articles/stardew-valley-re...,2025-03-30 15:20:30,Shailyn Cotten,2024-05-07T16:00:00.000Z,2024-07-12T17:14:15.989Z,Stardew Valley Review - 2024,Eight years of impressive updates have grown S...,10.0,Stardew Valley is not only the best farming ga...,"More than just a cozy farming sim, eight years...",Editor's Note: This review takes a fresh look ...,Multitasking efficiently is a deceptively tens...,Editor 's Note : This review takes a fresh loo...,editor note review take fresh look stardew val...
3,dc47b511-7668-4107-b7c0-8c63266ac963,https://www.ign.com/articles/asgards-wrath-2-r...,2025-03-30 15:20:31,Travis Northup,2023-12-14T14:00:00.000Z,2023-12-14T20:40:01.659Z,Asgard’s Wrath 2 Review,This open-world action RPG sets a new gold sta...,10.0,Asgard’s Wrath 2 is the full-fledged VR open-w...,Asgard’s Wrath 2 is an open-world action RPG t...,For those who have been patiently waiting on t...,This sequel improves and expands upon just abo...,For those who have been patiently waiting on t...,patiently wait sideline vr reason couch flail ...
4,003f6d65-549f-477c-822b-6f57a8a9105e,https://www.ign.com/articles/baldurs-gate-3-re...,2025-03-30 15:20:32,Leana Hafer,2023-08-18T23:32:25.376Z,2024-01-31T19:38:28.947Z,Baldur's Gate 3 Review,A new high-water mark for CRPGs with satisfyin...,10.0,I don't want to say every CRPG going forward s...,"With crunchy, tactical RPG combat, a memorable...","Every once in a long while, a game comes along...","This is a world that will rarely tell you, 'No...","Every once in a long while , a game comes alon...",long game come memorable exciting fresh write ...


In [41]:
# Write the enriched frame back to SQLite. NOTE: if_exists="replace" overwrites
# the very `reviews` table this notebook loaded from; the DataFrame index is
# not stored.
df.to_sql(
    name="reviews",
    con=conn,
    if_exists="replace",
    index=False,
)

6945