# Imports

In [1]:
import pandas as pd
import numpy as np
from recommender import load_data

# Loading & Inspecting Data

In [23]:
# Load the dataset through the project's `load_data` helper (imported from `recommender`).
df = load_data()

# Show every column when a frame is printed, so the preview below is not elided.
pd.set_option('display.max_columns', None)

# First five rows.
print("df.head()=================================================")
print(df.head())

# Column dtypes.
print("df.dtypes=================================================")
print(df.dtypes)

# Number of nulls per column (the label now spells `isnull` the same way as the call it describes).
print("df.isnull().sum()=================================================")
print(df.isnull().sum())

# Number of distinct values per column.
print("df.nunique()=================================================")
print(df.nunique())

[INFO] CSV already exists at c:\Users\mensu\Documents\Last Sem\CS506\projects\CS506_FinalProject\new-game-plus-backend\newgameplus\data\games.csv
    appid                             name release_date  required_age  price  \
0     730                 Counter-Strike 2   2012-08-21             0   0.00   
1  578080              PUBG: BATTLEGROUNDS   2017-12-21             0   0.00   
2     570                           Dota 2   2013-07-09             0   0.00   
3  271590        Grand Theft Auto V Legacy   2015-04-13            17   0.00   
4  488824  Tom Clancy's Rainbow Six® Siege   2015-12-01            17  19.99   

   dlc_count                               detailed_description  \
0          1  For over two decades, Counter-Strike has offer...   
1          0  LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...   
2          2  The most-played game on Steam. Every day, mill...   
3          0  When a young street hustler, a retired bank ro...   
4          9  Edition Comparison Ultim

# Preprocessing

In [24]:
# Rows with a missing `name` are removed: gather their index labels first, then drop by label.
nameless_rows = df[df['name'].isna()].index
df = df.drop(nameless_rows)

# Columns the author marked as useless for the rest of the notebook.
# NOTE (edw): undrop reviews and sentiment thing
useless_columns = [
    'name', 'tags', 'reviews', 'appid',
    'detailed_description', 'about_the_game', 'short_description',
    'header_image', 'website', 'support_url', 'support_email',
    'metacritic_url', 'notes', 'packages',
    'developers', 'publishers', 'screenshots', 'movies',
    'user_score', 'score_rank', 'estimated_owners',
    'positive', 'negative',
]
df = df.drop(columns=useless_columns)

In [25]:
# Parse `release_date` into datetimes (errors='coerce', so unparseable values do not raise),
# store the result as `release_date_cleaned`, and remove the raw column.
df = (
    df
    .assign(release_date_cleaned=pd.to_datetime(df['release_date'], errors='coerce'))
    .drop(columns=['release_date'])
)




In [26]:
# Identifying the unique supported_languages
import ast 
import re

#print(type(df['supported_languages'].iloc[0]))

def clean_language(lang):
    """Return `lang` with surrounding whitespace and `[b]` / `[/b]` markers removed.

    The string is stripped, every `[b]` and `[/b]` marker is deleted, and the
    result is stripped once more so whitespace exposed by removing a marker
    does not survive.
    """
    without_markers = re.sub(r'\[/?b\]', '', lang.strip())
    return without_markers.strip()


# Turn each stringified list into a real Python object; non-string values pass through as-is.
df['supported_languages'] = df['supported_languages'].apply(
    lambda raw: raw if not isinstance(raw, str) else ast.literal_eval(raw)
)

# Vocabulary of languages: every list entry is split on commas (some entries hold
# several languages joined together) and each piece is cleaned before being collected.
# Rows that are not lists contribute nothing.
unique_supported_languages = list({
    clean_language(piece)
    for entry in df['supported_languages']
    if isinstance(entry, list)
    for item in entry
    for piece in item.split(',')
})
#print("Unique supported languages", unique_supported_languages)


In [27]:
#One hot encoding the languages

# The vocabulary in `unique_supported_languages` holds comma-split, cleaned tokens, so each
# row is normalised the same way before membership is tested. Comparing against the raw list
# elements would miss rows whose entries carry [b] markers, padding, or several languages
# joined by commas. Rows that are not lists yield an empty set (all indicators 0).
row_language_sets = df['supported_languages'].apply(
    lambda entry: {
        clean_language(piece)
        for item in entry
        for piece in item.split(',')
    } if isinstance(entry, list) else set()
)

# One 0/1 indicator Series per language, keyed by "<language>_supported_languages".
lang_columns = {
    lang + "_supported_languages": row_language_sets.apply(
        lambda tokens, lang=lang: 1 if lang in tokens else 0
    )
    for lang in unique_supported_languages
}

# Build all indicator columns at once and attach them in a single concat.
lang_df = pd.DataFrame(lang_columns)

df = pd.concat([df, lang_df], axis=1)

# The list-valued source column is no longer needed once it is encoded.
df = df.drop(columns="supported_languages")

#print(df.head())

In [28]:
#identify the unique supported audios
# Turn each stringified list into a real Python object; non-string values pass through as-is.
df['full_audio_languages'] = df['full_audio_languages'].apply(
    lambda raw: raw if not isinstance(raw, str) else ast.literal_eval(raw)
)

unique_full_audio_languages = set()

for entry in df['full_audio_languages']:
    # Only list-valued rows contribute to the vocabulary.
    if not isinstance(entry, list):
        continue
    for lang in entry:
        # An entry may hold several languages joined by commas; clean each piece.
        unique_full_audio_languages.update(
            clean_language(piece) for piece in lang.split(',')
        )

unique_full_audio_languages = list(unique_full_audio_languages)
#print("Unique Full Audio Languages:", unique_full_audio_languages)

In [29]:
#one hot encoding full audio languages

# Normalise each row exactly as the vocabulary was built (comma-split, then cleaned), so a
# vocabulary token is found even when the raw entry carries [b] markers, padding, or several
# languages joined by commas. Rows that are not lists yield an empty set (all indicators 0).
audio_token_sets = df['full_audio_languages'].apply(
    lambda entry: {
        clean_language(piece)
        for item in entry
        for piece in item.split(',')
    } if isinstance(entry, list) else set()
)

# One 0/1 indicator Series per audio language, keyed by "<language>_full_audio_languages".
# The membership test must use this loop's own `audio_lang`; any other name would silently
# read a leftover global from an earlier cell and give every column identical values.
audio_lang_columns = {
    audio_lang + "_full_audio_languages": audio_token_sets.apply(
        lambda tokens, audio_lang=audio_lang: 1 if audio_lang in tokens else 0
    )
    for audio_lang in unique_full_audio_languages
}

# Build all indicator columns at once and attach them in a single concat.
audio_lang_df = pd.DataFrame(audio_lang_columns)

df = pd.concat([df, audio_lang_df], axis=1)

# The list-valued source column is no longer needed once it is encoded.
df = df.drop(columns="full_audio_languages")


#print(df.head())

In [30]:
#Identifying the unique genres
import ast
# Turn each stringified list into a real Python object; non-string values pass through as-is.
df['genres'] = df['genres'].apply(
    lambda raw: raw if not isinstance(raw, str) else ast.literal_eval(raw)
)

# Every distinct genre that appears in a list-valued row.
unique_genres = {
    genre_name
    for row_genres in df['genres']
    if isinstance(row_genres, list)
    for genre_name in row_genres
}

unique_genre_list = list(unique_genres)
#print("Unique genres:", unique_genre_list)

In [31]:
#One hot encoding the genres
# One 0/1 indicator column per genre, named after the genre itself.
for genre in unique_genres:
    # Rows whose `genres` value is not a list (for example a missing value) get 0 rather
    # than raising on the `in` test; the vocabulary was built from list-valued rows only.
    df[genre] = df['genres'].apply(
        lambda row_genres: 1 if isinstance(row_genres, list) and genre in row_genres else 0
    )

# The list-valued source column is no longer needed once it is encoded.
df = df.drop(columns="genres")


#print(df.head())

In [32]:
#identifying unique categories

import ast
# Turn each stringified list into a real Python object; non-string values pass through as-is.
df['categories'] = df['categories'].apply(
    lambda raw: raw if not isinstance(raw, str) else ast.literal_eval(raw)
)

# Every distinct category that appears in a list-valued row.
unique_categories = {
    category_name
    for row_categories in df['categories']
    if isinstance(row_categories, list)
    for category_name in row_categories
}

unique_categories_list = list(unique_categories)
#print("Unique categories:", unique_categories_list)

In [33]:
# One-hot encoding the categories

# One 0/1 indicator column per category, named after the category itself.
for category in unique_categories:
    # Rows whose `categories` value is not a list (for example a missing value) get 0 rather
    # than raising on the `in` test; the vocabulary was built from list-valued rows only.
    df[category] = df['categories'].apply(
        lambda row_categories: 1 if isinstance(row_categories, list) and category in row_categories else 0
    )

# The list-valued source column is no longer needed once it is encoded.
df = df.drop(columns="categories")

#print(df.head())

In [34]:
# Cast the platform flag columns (booleans, per the original note) to integers.
for platform in ('windows', 'mac', 'linux'):
    df[platform] = df[platform].astype(int)


In [35]:
# Lift the row-display cap so the dtype listing below is shown in full.
pd.options.display.max_rows = None
df.dtypes

required_age                                             int64
price                                                  float64
dlc_count                                                int64
windows                                                  int64
mac                                                      int64
linux                                                    int64
metacritic_score                                         int64
achievements                                             int64
recommendations                                          int64
average_playtime_forever                                 int64
average_playtime_2weeks                                  int64
median_playtime_forever                                  int64
median_playtime_2weeks                                   int64
discount                                                 int64
peak_ccu                                                 int64
pct_pos_total                                          