# Data Overview
## import games data

In [1]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import polars as pl

from website import Base
from website.models import Game, Genre, Theme, Rating

In [2]:
database_uri = f'mysql+pymysql://root:root@localhost:3306/db'
engine = create_engine(database_uri)
Base.metadata.bind = engine
DBSession = sessionmaker(bind=engine)
session = DBSession()
games = session.query(Game).all()

In [3]:
def extract_name(items):
    return [x.name for x in items]

data = {
        'id': [g.id for g in games],
        'name': [g.name for g in games],
        'summary': [g.summary for g in games],
        'storyline': [g.storyline for g in games],
        'total_rating': [g.total_rating for g in games],
        'total_rating_count': [g.total_rating_count for g in games],
        'genres': [extract_name(g.genres) for g in games],
        'themes': [extract_name(g.themes) for g in games],
        'keywords': [extract_name(g.keywords) for g in games],
        'modes': [extract_name(g.modes) for g in games],
        'similar_games': [g.similar_games for g in games]
}

df = pl.DataFrame(data)
df.head()

id,name,summary,storyline,total_rating,total_rating_count,genres,themes,keywords,modes,similar_games
i64,str,str,str,i64,i64,list[str],list[str],list[str],list[str],list[i64]
1,"""Thief II: The …","""The ultimate t…","""The game begin…",88,124,"[""Shooter"", ""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""thief"", … ""immersive""]","[""Single player""]","[2, 3, … 105049]"
2,"""Thief: The Dar…","""Thief is a fir…","""None""",78,149,"[""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""assassin"", … ""male protagonist""]","[""Single player""]","[1, 3, … 11118]"
3,"""Thief: Deadly …","""In the third i…","""None""",83,116,"[""Shooter"", ""Simulator"", ""Adventure""]","[""Action"", ""Fantasy"", ""Stealth""]","[""ghosts"", ""assassin"", … ""immersive""]","[""Single player""]","[1, 2, … 19441]"
4,"""Thief""","""There is a ris…","""Garrett, the M…",67,295,"[""Shooter"", ""Adventure""]","[""Action"", ""Stealth"", ""Sandbox""]","[""thief"", ""steampunk"", … ""male protagonist""]","[""Single player""]","[1, 533, … 19441]"
5,"""Baldur's Gate""","""Baldur's Gate …","""Candlekeep is …",86,295,"[""Role-playing (RPG)""]","[""Fantasy""]",[],"[""Single player"", ""Co-operative""]","[19127, 19404, … 113910]"


## clear nulls from description (storyline+summary)

In [16]:
cleared_df = df.with_columns([
    pl.col("summary").fill_null("").str.replace("no info|none|No info|None", ""),
    pl.col("storyline").fill_null("").str.replace("no info|none|No info|None", "")
])
cleared_df = cleared_df.with_columns(
    pl.concat_str(
        [
            pl.col('summary'),
            pl.col('storyline')
        ],
        separator=" "
    ).alias("description")
)
cleared_df = cleared_df.drop(["summary", "storyline"])
cleared_df = cleared_df.with_columns([
    pl.when((pl.col('description').str.len_chars() == 1) | pl.col('description').str.starts_with("None None"))
    .then(None)
    .otherwise(pl.col('description'))
    .alias("description")
])
print("before")
print(cleared_df.null_count())
cleared_df = cleared_df.drop_nulls()
print("after")
print(cleared_df.null_count())

before
shape: (1, 10)
┌─────┬──────┬──────────────┬─────────────────┬───┬──────────┬───────┬───────────────┬─────────────┐
│ id  ┆ name ┆ total_rating ┆ total_rating_co ┆ … ┆ keywords ┆ modes ┆ similar_games ┆ description │
│ --- ┆ ---  ┆ ---          ┆ unt             ┆   ┆ ---      ┆ ---   ┆ ---           ┆ ---         │
│ u32 ┆ u32  ┆ u32          ┆ ---             ┆   ┆ u32      ┆ u32   ┆ u32           ┆ u32         │
│     ┆      ┆              ┆ u32             ┆   ┆          ┆       ┆               ┆             │
╞═════╪══════╪══════════════╪═════════════════╪═══╪══════════╪═══════╪═══════════════╪═════════════╡
│ 0   ┆ 0    ┆ 0            ┆ 0               ┆ … ┆ 0        ┆ 0     ┆ 0             ┆ 397         │
└─────┴──────┴──────────────┴─────────────────┴───┴──────────┴───────┴───────────────┴─────────────┘
after
shape: (1, 10)
┌─────┬──────┬──────────────┬─────────────────┬───┬──────────┬───────┬───────────────┬─────────────┐
│ id  ┆ name ┆ total_rating ┆ total_rating_co ┆ 

In [17]:
shortest_description_row = cleared_df.filter(pl.col("description").str.len_chars() == cleared_df.select(pl.col("description").str.len_chars()).min())
print(f"Shortest Description: {shortest_description_row}")

Shortest Description: shape: (1, 10)
┌───────┬────────┬─────────────┬────────────┬───┬────────────┬───────────┬────────────┬────────────┐
│ id    ┆ name   ┆ total_ratin ┆ total_rati ┆ … ┆ keywords   ┆ modes     ┆ similar_ga ┆ descriptio │
│ ---   ┆ ---    ┆ g           ┆ ng_count   ┆   ┆ ---        ┆ ---       ┆ mes        ┆ n          │
│ i64   ┆ str    ┆ ---         ┆ ---        ┆   ┆ list[str]  ┆ list[str] ┆ ---        ┆ ---        │
│       ┆        ┆ i64         ┆ i64        ┆   ┆            ┆           ┆ list[i64]  ┆ str        │
╞═══════╪════════╪═════════════╪════════════╪═══╪════════════╪═══════════╪════════════╪════════════╡
│ 56284 ┆ Heart  ┆ 45          ┆ 1          ┆ … ┆ ["dating"] ┆ []        ┆ [25646,    ┆ 3DS game   │
│       ┆ Beaten ┆             ┆            ┆   ┆            ┆           ┆ 43367, …   ┆            │
│       ┆        ┆             ┆            ┆   ┆            ┆           ┆ 110503]    ┆            │
└───────┴────────┴─────────────┴────────────┴───┴─────

## clean and join features

In [19]:
def clean_and_join(row):
    return " ".join(sorted(["".join(filter(str.isalpha, i)) for i in row]))

features = ["genres", "themes", "keywords", "modes"]

cleared_df = cleared_df.with_columns([
        pl.col(features).map_elements(lambda row: clean_and_join(row))
])

cleared_df = cleared_df.with_columns(
     pl.concat_str(
        pl.col(features), separator=" "
    ).alias("features")
)
cleared_df

id,name,total_rating,total_rating_count,genres,themes,keywords,modes,similar_games,description,features
i64,str,i64,i64,str,str,str,str,list[i64],str,str
1,"""Thief II: The …",88,124,"""Adventure Shoo…","""Action Fantasy…","""actionadventur…","""Singleplayer""","[2, 3, … 105049]","""The ultimate t…","""Adventure Shoo…"
2,"""Thief: The Dar…",78,149,"""Adventure Simu…","""Action Fantasy…","""actionadventur…","""Singleplayer""","[1, 3, … 11118]","""Thief is a fir…","""Adventure Simu…"
3,"""Thief: Deadly …",83,116,"""Adventure Shoo…","""Action Fantasy…","""actionadventur…","""Singleplayer""","[1, 2, … 19441]","""In the third i…","""Adventure Shoo…"
4,"""Thief""",67,295,"""Adventure Shoo…","""Action Sandbox…","""achievements a…","""Singleplayer""","[1, 533, … 19441]","""There is a ris…","""Adventure Shoo…"
5,"""Baldur's Gate""",86,295,"""RoleplayingRPG…","""Fantasy""","""""","""Cooperative Si…","[19127, 19404, … 113910]","""Baldur's Gate …","""RoleplayingRPG…"
6,"""Baldur's Gate …",89,421,"""RoleplayingRPG…","""Fantasy Openwo…","""attributes bar…","""Cooperative Si…","[59, 214, … 8747]","""Every World ha…","""RoleplayingRPG…"
7,"""Jagged Allianc…",79,19,"""RoleplayingRPG…","""""","""daynightcycle …","""Singleplayer""","[7615, 8422, … 119171]","""Desperately yo…","""RoleplayingRPG…"
8,"""Jagged Allianc…",77,4,"""RoleplayingRPG…","""""","""ingameemail me…","""Multiplayer Si…","[7615, 8422, … 119171]","""The enemy is o…","""RoleplayingRPG…"
9,"""Jagged Allianc…",87,39,"""Adventure Role…","""Historical Sci…","""customization …","""Singleplayer""","[2365, 5652, … 119171]","""Jagged Allianc…","""Adventure Role…"
11,"""Vampire: The M…",79,525,"""RoleplayingRPG…","""Action Fantasy…","""ancientcurse a…","""Singleplayer""","[16, 41, … 9630]","""A first- and t…","""RoleplayingRPG…"


## calculate weighted average

In [7]:
C = cleared_df.select(pl.mean("total_rating")).item()
m = 31
print(f'C = {C}')
print(f'm = {m}')

C = 68.11497093023256
m = 31


In [8]:
#new_df = cleared_df.filter(pl.col("total_rating_count") >= m)
#print(df.select(pl.count()))
#print(new_df.select(pl.count()))

In [9]:
def calc_weighted_rating(v, R, m=m, C=C):
    return (v/(v+m) * R) + (m/(m+v) * C)

cleared_df = cleared_df.with_columns(
    calc_weighted_rating(pl.col("total_rating_count"),pl.col("total_rating")).alias("weighted_rating")
)

new_df = cleared_df.drop(["total_rating", "total_rating_count"]).sort(pl.col("weighted_rating"), descending=True)

## recommendation without AI

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

names = cleared_df["name"]
indexes = { x : i  for i, x in enumerate(names) }
print(names[0])
print(indexes["Mass Effect"])

In [11]:
def generate_feature_matrix(feature, tfidf=False):
    temp = cleared_df.select(pl.col(["name", feature])).rows_by_key(key=["name"]).items()
    features = [[x[0], x[1][0]] for x in temp]
    if tfidf:
        count = CountVectorizer(stop_words='english')
    else:
        count = TfidfVectorizer(stop_words='english')
    matrix = count.fit_transform(([x[1] for x in features]))
    if tfidf:
        return linear_kernel(matrix, matrix)
    else:
        return cosine_similarity(matrix, matrix)

sim_description = generate_feature_matrix("description", tfidf=True)
sim_genres = generate_feature_matrix("genres")
sim_themes = generate_feature_matrix("themes")
sim_keywords = generate_feature_matrix("keywords")

In [20]:
def get_recommendations(title, cosine_sim=sim_description):
    idx = indexes[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    results = [(i[0], i[1]) for i in sim_scores]
    return [(names[x], y) for x,y in results]

print(get_recommendations('Mass Effect', sim_description))
print(get_recommendations('Mass Effect', sim_genres))
print(get_recommendations('Mass Effect', sim_themes))
print(get_recommendations('Mass Effect', sim_keywords))

[('Star Wars: Republic Commando', 111.0), ('Battlefield: Bad Company 2', 110.0), ('Hyperdimension Neptunia Mk2', 110.0), ('Beyond Good & Evil', 103.0), ('Battlefield: Bad Company', 98.0), ('Sunset Overdrive', 94.0), ("Telltale Texas Hold'em", 94.0), ('Danmaku Unlimited 2', 85.0), ('The Pathless', 73.0), ('Mass Effect 2', 72.0)]
[('Fallout: New Vegas', 1.0), ('System Shock', 1.0), ('Deus Ex: Invisible War', 1.0), ('Mass Effect', 1.0), ('S.T.A.L.K.E.R.: Shadow of Chernobyl', 1.0), ('Hellgate: London', 1.0), ('Metro 2033', 1.0), ('Borderlands', 1.0), ('WildStar', 1.0), ('Warhammer 40,000: Space Marine', 1.0)]
[('Fingered', 0.9686166194514556), ('New York Mysteries: Secrets of the Mafia', 0.9686166194514556), ('Odysseus: Long Way Home', 0.9686166194514556), ('Ready 2 Rumble: Revolution', 0.9025182311151356), ('Driv3r', 0.9025182311151356), ('Crown and Council', 0.9025182311151356), ('The Sims 2: Apartment Pets', 0.9025182311151356), ('Die With Glory', 0.9025182311151356), ('Lucy Got Proble

In [21]:
short_df = cleared_df.drop(["genres", "themes", "keywords", "modes", "total_rating_count", "total_rating"])
short_df.head(5)

id,name,similar_games,description,features
i64,str,list[i64],str,str
1,"""Thief II: The …","[2, 3, … 105049]","""The ultimate t…","""Adventure Shoo…"
2,"""Thief: The Dar…","[1, 3, … 11118]","""Thief is a fir…","""Adventure Simu…"
3,"""Thief: Deadly …","[1, 2, … 19441]","""In the third i…","""Adventure Shoo…"
4,"""Thief""","[1, 533, … 19441]","""There is a ris…","""Adventure Shoo…"
5,"""Baldur's Gate""","[19127, 19404, … 113910]","""Baldur's Gate …","""RoleplayingRPG…"


# ~~connect with SI~~ 

In [13]:
genres = session.query(Genre).all()
genres = [g.name for g in genres]

themes = session.query(Theme).all()
themes = [t.name for t in themes]

data = ["".join(filter(str.isalpha, i)) for i in genres + themes]

data = {x:0 for x in data}
profile = pl.DataFrame(data)
profile

Pointandclick,Fighting,Shooter,Music,Platform,Puzzle,Racing,RealTimeStrategyRTS,RoleplayingRPG,Simulator,Sport,Strategy,TurnbasedstrategyTBS,Tactical,HackandslashBeatemup,QuizTrivia,Pinball,Adventure,Indie,Arcade,VisualNovel,CardBoardGame,MOBA,Action,Fantasy,Sciencefiction,Horror,Thriller,Survival,Historical,Stealth,Comedy,Business,Drama,Nonfiction,Sandbox,Educational,Kids,Openworld,Warfare,Party,Xexploreexpandexploitandexterminate,Erotic,Mystery,Romance
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [14]:
cleared_df = cleared_df.rename({"id": "game_id"})
cleared_df.head()

game_id,name,total_rating,total_rating_count,genres,themes,keywords,modes,similar_games,description,features,weighted_rating
i64,str,i64,i64,str,str,str,str,list[i64],str,str,f64
1,"""Thief II: The …",88,124,"""Shooter Simula…","""Action Fantasy…","""ghosts thief d…","""Singleplayer""","[2, 3, … 105049]","""The ultimate t…","""Shooter Simula…",84.022994
2,"""Thief: The Dar…",78,149,"""Simulator Adve…","""Action Fantasy…","""ghosts assassi…","""Singleplayer""","[1, 3, … 11118]","""Thief is a fir…","""Simulator Adve…",76.297578
3,"""Thief: Deadly …",83,116,"""Shooter Simula…","""Action Fantasy…","""ghosts assassi…","""Singleplayer""","[1, 2, … 19441]","""In the third i…","""Shooter Simula…",79.86098
4,"""Thief""",67,295,"""Shooter Advent…","""Action Stealth…","""thief steampun…","""Singleplayer""","[1, 533, … 19441]","""There is a ris…","""Shooter Advent…",67.106025
5,"""Baldur's Gate""",86,295,"""RoleplayingRPG…","""Fantasy""","""""","""Singleplayer C…","[19127, 19404, … 113910]","""Baldur's Gate …","""RoleplayingRPG…",84.299276


In [15]:
ratings = session.query(Rating).all()

data = {
    'id': [r.id for r in ratings],
    'rating': [r.rating for r in ratings],
    'user': [r.user_id for r in ratings],
    'game_id': [r.game.id for r in ratings],
}

ratings = pl.DataFrame(data)
ratings_df = ratings.join(cleared_df, on="game_id")
ratings_df.head()

ComputeError: datatypes of join keys don't match - `game_id`: null on left does not match `game_id`: i64 on right

In [None]:
# w_description, w_rating, w_genres, w_themes, w_keywords = 1, 1, 1, 1, 1
# 
# for genres in ratings_df["genres"]:
#     for genre in genres.split():
#         profile = profile.with_columns([pl.col(genre) + 1])
# for themes in ratings_df["themes"]:
#     for theme in themes.split():
#         profile = profile.with_columns([pl.col(theme) + 1])
# profile

In [None]:
for row in ratings_df.rows():
    rating = row[1]
    genres = row[7].split()
    themes = row[8].split()
    
    for genre in genres:
        profile = profile.with_columns([pl.col(genre) + rating])
    for theme in themes:
        profile = profile.with_columns([pl.col(theme) + rating])
        
profile