# Fine-tuned ALBERT Model for Constructiveness Detection in Steam Reviews
## *Sentiment-Analysis of Videogame Reviews on the Platform "Steam" with a Focus on the Detection and Classification of <b>Constructiveness</b>*
---
### <u>NOTEBOOK **1**/5</u>: This notebook handles the initial filtering of the raw Steam review data by removing unneeded columns and reducing the overall dataset size.

In [None]:
# Package Installations
!pip install pandas
!pip install numpy
!pip install kaggle
!pip install tqdm
!pip install psutil

In [None]:
# Imports
import pandas as pd
import numpy as np
import json
import os
from tqdm import tqdm
import psutil
import gc

# from google.colab import files
# Uploading the kaggle.json to configure the Kaggle API
# kaggle_json = files.upload()

In [None]:
# Local Runtime with PC at home
# Loads kaggle.json from local Downloads folder
if os.path.exists("./Downloads/kaggle.json"):
  kaggle_json_path = "./Downloads/kaggle.json"
  with open(kaggle_json_path, "r") as f:
      kaggle_json = json.load(f)
  # The parsed content is deliberately not printed: it is the Kaggle API
  # credential file, and printing it would store it in the notebook output.
  print("Kaggle.json loaded from Downloads Folder")
else:
  print("Kaggle.json not found in Downloads Folder")

In [None]:
# Google Colab Runtime Attempt (Ran out of Memory)
# Creates a kaggle directory and moves the kaggle.json file there
#!mkdir -p /content/kaggle/kaggle_api_config
#!mv kaggle.json /content/kaggle/kaggle_api_config

# Sets permissions for the kaggle.json file
#!chmod 600 /content/kaggle/kaggle_api_config/kaggle.json

In [None]:
# Creates a dataset directory
#!mkdir -p /content/kaggle/downloaded_dataset/

# Downloads the Steam Review Dataset using the Kaggle API
#!kaggle datasets download -d kieranpoc/steam-reviews -p /content/kaggle/downloaded_dataset/

In [None]:
# Unzipping the dataset
#!unzip /content/kaggle/downloaded_dataset/steam_reviews.zip -d /content/kaggle/downloaded_dataset/steam_reviews
# Deleting the zipped dataset
#!rm /content/kaggle/downloaded_dataset/steam_reviews.zip

In [None]:
# Data Preprocessing
# df = pandas.read_csv("/content/kaggle/downloaded_dataset/steam_reviews/all_reviews/all_reviews.csv")

In [None]:
# Local
# Location of the downloaded kaggle.json file, with "~" expanded to the home directory
downloaded_credentials = "~/Downloads/kaggle.json"
kaggle_json_path = os.path.expanduser(downloaded_credentials)

In [None]:
# Makes sure the ~/.kaggle directory exists, creating it when it is missing
kaggle_dir = os.path.expanduser("~/.kaggle")
if os.path.exists(kaggle_dir):
    print("Found ~/.kaggle directory")
else:
    os.makedirs(kaggle_dir)
    print("Created ~/.kaggle directory")

Found ~/.kaggle directory


In [None]:
# Places kaggle.json inside ~/.kaggle/ unless a copy is already there
kaggle_json_dest = os.path.join(kaggle_dir, "kaggle.json")
if os.path.exists(kaggle_json_dest):
    print("kaggle.json already exists in ~/.kaggle/ directory")
else:
    os.rename(kaggle_json_path, kaggle_json_dest)
    print("Moved kaggle.json to ~/.kaggle/ directory")

kaggle.json already exists in ~/.kaggle/ directory


In [None]:
# Sets the permissions of kaggle.json to 0o600 (read/write for the owner only)
owner_read_write_mode = 0o600
os.chmod(kaggle_json_dest, owner_read_write_mode)

In [None]:
# Reads kaggle.json from its final location to confirm that it is valid JSON
with open(kaggle_json_dest, "r") as f:
    kaggle_json = json.load(f)
# Only success is reported: printing kaggle_json would write the API
# credentials into the notebook output.
print("Kaggle.json loaded successfully")

In [None]:
# Directory (relative to the working directory) that receives the downloaded dataset
dataset_dir = "./kaggle/downloaded_dataset/"
os.makedirs(name=dataset_dir, exist_ok=True)

In [None]:
# Downloads the Steam review dataset through the Kaggle CLI unless the zip is already present
if not os.path.exists(f"{dataset_dir}/steam-reviews.zip"):
  download_status = os.system(f"kaggle datasets download -d kieranpoc/steam-reviews -p {dataset_dir}")
  # os.system only returns the exit status, so a failed download has to be detected explicitly
  if download_status != 0:
    raise RuntimeError(f"Kaggle download failed with exit status {download_status}")
  print("Dataset downloaded successfully")
else:
  print(f"Dataset already present in {dataset_dir} directory")

Dataset already present in ~/.kaggle/downloaded_dataset/ directory


In [None]:
# Extracts the downloaded archive once; skipped when the target directory already exists
if not os.path.exists(f"{dataset_dir}/unzipped_steam_reviews"):
  unzip_status = os.system(f"unzip {dataset_dir}/steam-reviews.zip -d {dataset_dir}/unzipped_steam_reviews")
  # A non-zero exit status (e.g. unzip not installed or a broken archive) would otherwise go unnoticed
  if unzip_status != 0:
    raise RuntimeError(f"Unzipping the dataset failed with exit status {unzip_status}")

In [None]:
# os.remove(f"{dataset_dir}/steam-reviews.zip")

In [None]:
# Path of the extracted review CSV; reports whether the file is present
csv_file_path = f"{dataset_dir}/unzipped_steam_reviews/all_reviews/all_reviews.csv"
csv_status_message = "CSV file found" if os.path.exists(csv_file_path) else "CSV file not found"
print(csv_status_message)

CSV file found


In [None]:
# Column name -> dtype mapping handed to pd.read_csv when the raw CSV is parsed
dtype_dict = dict(
    recommendationid='object',
    appid='int32',
    game='object',
    author_steamid='object',
    author_num_games_owned='int16',
    author_num_reviews='int16',
    author_playtime_forever='float32',
    author_playtime_last_two_weeks='float32',
    author_playtime_at_review='float32',
    author_last_played='float32',
    language='object',
    review='object',
    timestamp_created='int64',
    timestamp_updated='int64',
    voted_up='bool',
    votes_up='int32',
    votes_funny='int32',
    weighted_vote_score='float32',
    comment_count='int32',
    steam_purchase='bool',
    received_for_free='bool',
    written_during_early_access='bool',
    hidden_in_steam_china='bool',
)

In [None]:
# Rows read per chunk, and the list that collects the filtered chunks
chunk_size = 10_000
dfs = list()

In [None]:
# Reads the raw CSV in chunks, keeps only the English reviews and collects them in dfs
gc.collect()
columns_to_use = ["game", "author_playtime_at_review", "review", "voted_up", "votes_up", "votes_funny", "language"]
# "language" is only read for the filter below and is not kept in the result
columns_to_keep = [column for column in columns_to_use if column != "language"]

# Start from an empty list so that re-running this cell does not append every chunk a second time
dfs = []

# The first non-blocking call only sets psutil's reference point; the calls inside the
# loop then report the CPU usage since the previous call without sleeping
psutil.cpu_percent(interval=None)

# The file is opened here (instead of handing the path to pandas) so that the read
# position can be queried; the progress bar total is the file size in bytes.
with open(csv_file_path, "rb") as csv_file:
    with tqdm(total=os.path.getsize(csv_file_path), unit='B', unit_scale=True, desc='Processing Steam Reviews') as pbar:
        for chunk in pd.read_csv(csv_file, chunksize=chunk_size, dtype=dtype_dict, usecols=columns_to_use):
            is_english = chunk["language"] == "english"
            dfs.append(chunk.loc[is_english, columns_to_keep])

            # Advance by the bytes consumed from the file so far. The in-memory size of a
            # chunk is not comparable to the on-disk total and made the bar overshoot it.
            pbar.update(csv_file.tell() - pbar.n)

            # interval=None returns immediately; a blocking interval would pause on every chunk
            cpu_usage = psutil.cpu_percent(interval=None)  # CPU percentage
            memory_usage = psutil.virtual_memory().percent  # Memory percentage

            pbar.set_description(f"Processing | CPU: {cpu_usage}% | RAM: {memory_usage}%")

Processing | CPU: 9.6% | RAM: 76.6%: : 53.3GB [26:51, 33.1MB/s]


In [None]:
# Stacks all filtered chunks into one DataFrame
filtered_reviews_df = pd.concat(objs=dfs)

In [None]:
# Shape (rows, columns) before rows containing NaN values are dropped
filtered_reviews_df.shape

(51544612, 6)

In [None]:
# Drops rows from the filtered df where a column is NaN and shows how many were dropped
rows_before_dropna = len(filtered_reviews_df)
filtered_reviews_df = filtered_reviews_df.dropna()
print(f"Dropped {rows_before_dropna - len(filtered_reviews_df)} rows containing NaN values")
filtered_reviews_df.shape

(51541645, 6)

In [None]:
# Shape (rows, columns) after rows containing NaN values were dropped
filtered_reviews_df.shape

(51541645, 6)

In [None]:
# Keeps only the reviews of the 10 games that have the most rows
reviews_per_game = filtered_reviews_df["game"].value_counts()
popular_games = reviews_per_game.head(10).index
print(popular_games)
belongs_to_popular_game = filtered_reviews_df["game"].isin(popular_games)
filtered_reviews_df = filtered_reviews_df[belongs_to_popular_game]

Index(['Counter-Strike 2', 'Terraria', 'Team Fortress 2',
       'Tom Clancy's Rainbow Six Siege', 'Grand Theft Auto V', 'Rust',
       'Garry's Mod', 'Among Us', 'PUBG: BATTLEGROUNDS', 'ELDEN RING'],
      dtype='object', name='game')


In [None]:
# Keeps the first occurrence of every review text and drops the repeats
filtered_reviews_df = filtered_reviews_df.drop_duplicates(subset=["review"])

In [None]:
# Drops reviews whose text is not longer than the threshold (in characters)
review_length_minimum_threshold = 2
review_lengths = filtered_reviews_df["review"].str.len()
is_long_enough = review_lengths > review_length_minimum_threshold
filtered_reviews_df = filtered_reviews_df[is_long_enough]

In [None]:
# Removing rows with suspiciously high playtime
# playtime_maximum_threshold = 40000
# filtered_reviews_df = filtered_reviews_df[filtered_reviews_df["author_playtime_at_review"] < playtime_maximum_threshold]

In [None]:
# author_playtime_at_review is in minutes: divide by 60 and cast to int to get whole hours
playtime_in_hours = filtered_reviews_df["author_playtime_at_review"].div(60)
filtered_reviews_df["author_playtime_at_review"] = playtime_in_hours.astype(int)

In [None]:
# Randomly reduces the csv to 150 rows from each game, so 1500 rows in total.
# Each game is sampled on its own with the same seed, exactly as the former
# groupby(...).apply(...) call did, but without apply operating on the grouping
# column "game" (which triggered a pandas warning).
sampled_reviews_per_game = [
    game_reviews.sample(n=150, random_state=42)
    for _, game_reviews in filtered_reviews_df.groupby("game")
]
filtered_reviews_df = pd.concat(sampled_reviews_per_game).reset_index(drop=True)

  filtered_reviews_df = filtered_reviews_df.groupby("game").apply(lambda x: x.sample(n=150, random_state=42)).reset_index(drop=True)


In [None]:
# Renumbers the rows 0..n-1 and discards the old index
filtered_reviews_df = filtered_reviews_df.reset_index(drop=True)

In [None]:
# Preview of the first rows of the sampled DataFrame
filtered_reviews_df.head()

Unnamed: 0,game,author_playtime_at_review,review,voted_up,votes_up,votes_funny
0,Among Us,6,This game can suck my balls before I play it a...,False,1,0
1,Among Us,11,Very fun little party game! Even better with f...,True,0,0
2,Among Us,40,if you're lonely don't bother but if you're no...,True,2,1
3,Among Us,80,fun and anoyying,True,0,0
4,Among Us,51,when impostor is sus...,True,0,0


In [None]:
# Summary statistics of the numeric columns; describe has to be called,
# otherwise only the bound-method repr is displayed
filtered_reviews_df.describe()

<bound method NDFrame.describe of                                 game  author_playtime_at_review  \
0                           Among Us                          6   
1                           Among Us                         11   
2                           Among Us                         40   
3                           Among Us                         80   
4                           Among Us                         51   
...                              ...                        ...   
1495  Tom Clancy's Rainbow Six Siege                        857   
1496  Tom Clancy's Rainbow Six Siege                          2   
1497  Tom Clancy's Rainbow Six Siege                         71   
1498  Tom Clancy's Rainbow Six Siege                         78   
1499  Tom Clancy's Rainbow Six Siege                        253   

                                                 review  voted_up  votes_up  \
0     This game can suck my balls before I play it a...     False         1   
1  

In [None]:
# Prints the column dtypes, non-null counts and memory usage of the sampled DataFrame
filtered_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   game                       1500 non-null   object
 1   author_playtime_at_review  1500 non-null   int64 
 2   review                     1500 non-null   object
 3   voted_up                   1500 non-null   bool  
 4   votes_up                   1500 non-null   int32 
 5   votes_funny                1500 non-null   int32 
dtypes: bool(1), int32(2), int64(1), object(2)
memory usage: 48.5+ KB


In [None]:
# Number of distinct values per column
filtered_reviews_df.nunique()

game                           10
author_playtime_at_review     543
review                       1500
voted_up                        2
votes_up                       19
votes_funny                    15
dtype: int64

In [None]:
# Column labels of the DataFrame that gets saved below
filtered_reviews_df.columns

Index(['game', 'author_playtime_at_review', 'review', 'voted_up', 'votes_up',
       'votes_funny'],
      dtype='object')

In [None]:
# Writes the filtered reviews to disk once; an existing file is left untouched
filtered_csv_path = f"{dataset_dir}/filtered_reviews.csv"
if not os.path.exists(filtered_csv_path):
  filtered_reviews_df.to_csv(filtered_csv_path, index=False)