In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
!pip install transformers torch kaggle



In [4]:
!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json
!echo '{"username":"elaklunder","key":"0ef13f8c58212064b76201f6a0deb398"}' > ~/.kaggle/kaggle.json
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# download dataset
!kaggle competitions download -c fcg-2023-game-recommendation

mkdir: /Users/mihkelmariuszjezierski/.kaggle: File exists
cp: kaggle.json: No such file or directory
fcg-2023-game-recommendation.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
# create data folder
!mkdir data

# unzip dataset
!unzip -q fcg-2023-game-recommendation.zip -d data/

mkdir: data: File exists
replace data/game_metadata.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [6]:
import pandas as pd                 # pandas is a dataframe library
import numpy as np                  # numpy provides N-dim object support
import matplotlib.pyplot as plt     # matplotlib.pyplot plots data
import seaborn as sns               # seaborn is the big brother of matplotlib
import os                           # os handles directory/workspace changes
from tqdm import tqdm_notebook      # tqdm_notebook is the progress bar library

from sklearn.model_selection import train_test_split # to split out training and testing data
from sklearn.preprocessing import LabelEncoder       # to convert labels into numbers
from sklearn.metrics import mean_absolute_error      # for mean absolute error
from sklearn.preprocessing import MinMaxScaler       # for using a normalization scaler
from sklearn.metrics import fbeta_score              # for scoring the models

import tensorflow as tf             # tensorflow is the machine learning library we will be using

pd.set_option('display.max_columns', 100) # Display up to 100 columns of a dataframe
pd.set_option('display.max_rows', 100)    # Display up to 100 rows of a dataframe

In [7]:
game_data = pd.read_csv("./data/game_metadata.csv")
sample_submission = pd.read_csv("./data/sample_submission.csv")
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

In [8]:
game_data.head()

Unnamed: 0,name,appid,required_age,is_free,controller_support,detailed_description,about_the_game,short_description,supported_languages,reviews,pc_requirements,mac_requirements,linux_requirements,developers,publishers,price_overview,platforms,categories,genres,recommendations,achievements,release_date,content_descriptors,metacritic,ext_user_account_notice,drm_notice
0,Nova Drift,858210,0,False,full,<h1>JOIN THE DISCORD COMMUNITY</h1><p><a href=...,"<h2 class=""bb_tag""><strong>- SURVIVE - EVOLVE ...",Nova Drift distills the mechanical depth and s...,"English, French, Russian, Simplified Chinese, ...",“Its integration of rogue-lite unlocks and a r...,{'minimum': '<strong>Minimum:</strong><br><ul ...,{'minimum': '<strong>Minimum:</strong><br><ul ...,[],['Chimeric'],['Pixeljam'],"{'currency': 'EUR', 'initial': 1699, 'final': ...","{'windows': True, 'mac': True, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...",{'total': 9354},"{'total': 40, 'highlighted': [{'name': 'Planet...","{'coming_soon': False, 'date': '27 Mar, 2019'}","{'ids': [], 'notes': None}",,,
1,Sid Meier's Civilization® V,8930,0,False,,The Flagship Turn-Based Strategy Game Returns<...,The Flagship Turn-Based Strategy Game Returns<...,"Create, discover, and download new player-crea...","English<strong>*</strong>, French<strong>*</st...",,{'minimum': '<strong>Minimum:</strong><br>\t\t...,{'minimum': '<strong>Minimum:</strong><br>\t\t...,"{'minimum': ""<strong>Operating System:</strong...","['Firaxis Games', 'Aspyr (Mac)', 'Aspyr (Linux)']","['2K', 'Aspyr (Mac)', 'Aspyr (Linux)']","{'currency': 'EUR', 'initial': 2999, 'final': ...","{'windows': True, 'mac': True, 'linux': True}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '2', 'description': 'Strategy'}]",{'total': 118286},"{'total': 286, 'highlighted': [{'name': 'First...","{'coming_soon': False, 'date': '23 Sep, 2010'}","{'ids': [], 'notes': None}","{'score': 90, 'url': 'https://www.metacritic.c...",,
2,The Incredible Adventures of Van Helsing II,272470,0,False,full,It all seemed so simple: defeat the mad scient...,It all seemed so simple: defeat the mad scient...,In the second part of the unfolding saga that ...,"English<strong>*</strong>, French, Italian, Ge...",,{'minimum': '<strong>Minimum:</strong><br><ul ...,{'minimum': '<strong>Minimum:</strong><br><ul ...,[],['NeocoreGames'],['NeocoreGames'],"{'currency': 'EUR', 'initial': 1499, 'final': ...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...",{'total': 1748},"{'total': 63, 'highlighted': [{'name': 'Tough ...","{'coming_soon': False, 'date': '13 Jun, 2014'}","{'ids': [], 'notes': None}","{'score': 77, 'url': 'https://www.metacritic.c...",,
3,ABZU,384190,0,False,full,"From the art director of Journey® and Flower®,...","From the art director of Journey® and Flower®,...","From the art director of Journey®, ABZÛ is a b...","English, French, Italian, German, Spanish - Sp...","“When I die, I hope whatever happens next is e...",{'minimum': '<strong>Minimum:</strong><br><ul ...,[],[],['Giant Squid'],['505 Games'],"{'currency': 'EUR', 'initial': 1999, 'final': ...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...",{'total': 18914},"{'total': 12, 'highlighted': [{'name': 'Breach...","{'coming_soon': False, 'date': '2 Aug, 2016'}","{'ids': [], 'notes': None}","{'score': 83, 'url': 'https://www.metacritic.c...",,
4,Assassin's Creed® Revelations,201870,18,False,,<h1>Ubisoft Online Services Decommission Updat...,When a man has won all his battles and defeate...,Ezio Auditore walks in the footsteps of the le...,"Danish, Dutch, English, French, German, Italia...",,{'minimum': '<strong>Minimum:</strong><br>\t\t...,[],[],['Ubisoft Montreal'],['Ubisoft'],"{'currency': 'EUR', 'initial': 1499, 'final': ...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...",{'total': 13513},,"{'coming_soon': False, 'date': '30 Nov, 2011'}","{'ids': [], 'notes': None}","{'score': 80, 'url': 'https://www.metacritic.c...",Uplay (Supports Linking to Steam Account),


In [9]:
# Remove the column that we do not use for the training

train.drop(columns=[
    "unix_timestamp_created",
    "unix_timestamp_updated",
    "votes_up",
    "votes_funny"

], inplace=True)

train.head()

Unnamed: 0,id,steamid,appid,voted_up,weighted_vote_score,playtime_forever,playtime_at_review,num_games_owned,num_reviews,review
0,0,76561198176882086,667720,1,0.0,1268.0,1268.0,110.0,52.0,Guerilla will appeal to anybody who plays a sh...
1,1,76561197996568962,442780,1,0.44174,907.0,299.0,145.0,37.0,All the people leaving negative reviews on thi...
2,2,76561198799216272,535930,1,0.0,2011.0,954.0,55.0,33.0,89
3,3,76561198042523735,239030,1,0.491706,783.0,179.0,445.0,38.0,Never had this much fun with bureaucracy.
4,4,76561198346512992,489830,1,0.0,11627.0,8981.0,122.0,15.0,This is a beautiful living-breathing world tha...


In [10]:
#game_data = pd.read_csv("./data/game_metadata.csv")
useful_columns = ['name', 'appid', 'required_age', 'is_free', 'controller_support', 'short_description', 'detailed_description','supported_languages','recommendations','genres', 'reviews','price_overview', 'platforms', 'categories', 'metacritic']
game_data = game_data[useful_columns]
game_data.columns

Index(['name', 'appid', 'required_age', 'is_free', 'controller_support',
       'short_description', 'detailed_description', 'supported_languages',
       'recommendations', 'genres', 'reviews', 'price_overview', 'platforms',
       'categories', 'metacritic'],
      dtype='object')

In [11]:
# Fuckery to get right values into the right form

import ast

def extract_value(s, key_or_ids):
    try:
        # Safely evaluate the string
        parsed_data = ast.literal_eval(s)

        if isinstance(parsed_data, dict):
            # If it's a dictionary, check if the key exists
            return parsed_data.get(key_or_ids, None)
        elif isinstance(parsed_data, list) and all(isinstance(item, dict) for item in parsed_data):
            # If it's a list of dictionaries, check if each dictionary has the specified key
            values = [item.get(key_or_ids, None) for item in parsed_data]
            # If all values are None, return None; otherwise, return the list of values
            return values if any(values) else None
        else:
            return None
    except (SyntaxError, ValueError):
        return None  # Return None if the string is not a valid dictionary or list of dictionaries


# Apply the function to the DataFrame columns
game_data.metacritic = game_data.metacritic.apply(lambda x: extract_value(x, 'score'))
game_data.categories = game_data.categories.apply(lambda x: extract_value(x, 'id'))
game_data.genres = game_data.genres.apply(lambda x: extract_value(x, 'id'))
game_data.price_overview = game_data.price_overview.apply(lambda x: extract_value(x, 'final'))
game_data.recommendations = game_data.recommendations.apply(lambda x: extract_value(x, 'total'))

In [12]:
#one hot encoding shit

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
game_data['genres'] = game_data['genres'].apply(lambda x: [] if x is None else x)
game_data['categories'] = game_data['categories'].apply(lambda x: [] if x is None else x)
game_data['controller_support'] = game_data['controller_support'].apply(lambda x: 0 if pd.isna(x) else 1)
game_data['is_free'] = game_data['is_free'].map({'True': 1, 'False': 0})


game_data['required_age'] = game_data['required_age'].apply(lambda x: 0 if (x == '17+' or x == 'required_age') else x)
game_data['required_age'] = game_data['required_age'].apply(lambda x: 0 if int(x) < 18 else 1)

# Fit and transform the 'categories' column
genres_encoded = pd.DataFrame(mlb.fit_transform(game_data['genres']), columns=mlb.classes_, index=game_data.index)
categories_encoded = pd.DataFrame(mlb.fit_transform(game_data['categories']), columns=mlb.classes_, index=game_data.index)

# Add prefixes to column names
genres_encoded = genres_encoded.add_prefix('Genre')
categories_encoded = categories_encoded.add_prefix('Category')

# Concatenate the one-hot encoded DataFrame with the original DataFrame
game_data = pd.concat([game_data, categories_encoded, genres_encoded], axis=1)


game_data['platforms'] = game_data['platforms'].apply(lambda x: {} if x is None else x)

# Extract 'windows', 'mac', and 'linux' values and handle None values
game_data['windows'] = game_data['platforms'].apply(lambda x: extract_value(x, 'windows')).fillna(False).astype(int)
game_data['mac'] = game_data['platforms'].apply(lambda x: extract_value(x, 'mac')).fillna(False).astype(int)
game_data['linux'] = game_data['platforms'].apply(lambda x: extract_value(x, 'linux')).fillna(False).astype(int)

# Drop the original columns
game_data = game_data.drop('platforms', axis=1)
game_data = game_data.drop('genres', axis=1)
game_data = game_data.drop('categories', axis=1)

In [13]:
# Fixing the differnce in format of the ids in the two datasets
game_data = game_data.drop(2074) #weird row
game_data = game_data.reset_index(drop=True) # resetting the indexes
game_data = game_data[game_data['appid'].str.isnumeric()]
game_data['appid'] = game_data['appid'].astype('int64')

In [14]:
game_data = game_data[game_data['appid'] != 704230]

In [15]:
# Encode the appid's and steamid's to be from 0 to n-1

user_enc = LabelEncoder()
train["steamid"] = user_enc.fit_transform(train["steamid"].values)

app_enc = LabelEncoder()
train["appid"] = app_enc.fit_transform(train["appid"].values)

test["steamid"] = user_enc.transform(test["steamid"].values)
test["appid"] = app_enc.transform(test["appid"].values)

game_data["appid"] = app_enc.transform(game_data["appid"].values)

In [16]:
prev_size = train.shape[0]

min = 6

while True:
    # get list of users who have rated less than 10 games
    tiny_gamers = train.steamid.value_counts() < min
    tiny_gamers = tiny_gamers[tiny_gamers == True].index

    # get list of animes which have been rated less than 10 times
    tiny_games = train.appid.value_counts() < min
    tiny_games = tiny_games[tiny_games == True].index

    # remove them
    train = train[~train.steamid.isin(tiny_gamers)]
    train = train[~train.appid.isin(tiny_games)]

    if train.shape[0] == prev_size:
        break
    else:
        prev_size = train.shape[0]

now = train.shape[0]
print(f"removed {prev_size-now}")

removed 0


In [17]:
# deleting the games that only appear in the game_data
unique_game_data_appids = set(game_data['appid'].unique())
unique_train_appids = set(train['appid'].unique())
common_ids = unique_game_data_appids.intersection(unique_train_appids)

print(len(unique_train_appids -unique_game_data_appids))
print(len(common_ids))

4259
3874


In [18]:
train = train[train['appid'].isin(common_ids)]
unique_train_appids = set(train['appid'].unique())
print(len(unique_train_appids))

3874


In [19]:
# Filter out classes with only one member
counts = train['steamid'].value_counts()
valid_classes = counts[counts > 1].index
train_filtered = train[train['steamid'].isin(valid_classes)]
print(f"removed {len(train) - len(train_filtered)}")
# Perform the split on the filtered data
train, eval = train_test_split(train_filtered, test_size=0.2, random_state=42, stratify=train_filtered['steamid'])


removed 149


In [20]:
columns_to_scale = ["playtime_forever", "playtime_at_review", "num_games_owned", "num_reviews"]

# Extract columns to scale from the DataFrame
columns_scaled = train[columns_to_scale]

# Scale the extracted columns
scaler = MinMaxScaler()
scaled_columns = scaler.fit_transform(columns_scaled)

# Create a DataFrame with the scaled columns
train_scaled = pd.DataFrame(scaled_columns, columns=columns_to_scale)

# Replace the original columns with the scaled ones in the original DataFrame
train[columns_to_scale] = train_scaled

In [21]:
results = pd.DataFrame(columns=['Model', 'f0.5_score'])

def add_res(model_name, score):
    global results
    results = pd.concat([results, pd.DataFrame([[model_name, score]], columns=['Model', 'f0.5_score'])])
    display(results.sort_values(by=['f0.5_score']))

In [22]:
from transformers import BertTokenizer, BertModel
import torch

def bert_embed_parallel(texts, batch_size=128):
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Load pre-trained model
    model = BertModel.from_pretrained('bert-base-uncased')

    # Process texts in batches
    all_embeddings = []
    for i in tqdm_notebook(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]

        # Tokenize and pad the batch
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")

        # Get embeddings with no gradient computation
        with torch.no_grad():
            outputs = model(**inputs)

        # Compute the mean of the last hidden states for each input in the batch
        embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
        all_embeddings.extend(embeddings)

    return np.array(all_embeddings)

In [23]:
game_data.detailed_description.values[0]

'<h1>JOIN THE DISCORD COMMUNITY</h1><p><a href="https://steamcommunity.com/linkfilter/?u=https%3A%2F%2Fdiscord.gg%2FZp2MkPJ" target="_blank" rel=" noopener"  >Click here to join us on Discord!</a> Nova Drift is a very community-driven project.  We take all feedback seriously and a lot of the features in the works are from player requests.</p><br><h1>JOIN THE LOCALIZATION EFFORT</h1><p><a href="https://steamcommunity.com/linkfilter/?u=https%3A%2F%2Fwww.localizor.io%2Fgame%2F15%2Ftranslations" target="_blank" rel=" noopener"  >Click here to help us localize Nova Drift.</a> We\'re using <strong>Localizor</strong> to help the community translate the game into as many languages as possible.</p><br><h1>About the Game</h1><h2 class="bb_tag"><strong>- SURVIVE - EVOLVE - DOMINATE -</strong></h2><br><strong>Nova Drift</strong> is a \'rogue-lite\' space shooter that melds a classic arcade experience with modern action-RPG elements like theory-crafting and deep player choice. Your ship rapidly evo

In [24]:
from bs4 import BeautifulSoup

def clean_html(html_text):
    # Parse HTML using BeautifulSoup
    soup = BeautifulSoup(html_text, 'html.parser')

    # Insert spaces between all tags
    for tag in soup.find_all():
        if tag.string:
            tag.string += ' '

    # Get text and remove extra spaces, tabs, and newlines
    text = soup.get_text()
    text = ' '.join(text.split())  # Remove extra spaces

    return text


In [25]:
game_data['detailed_description']=game_data['detailed_description'].apply(lambda x:clean_html(x))

In [26]:
import re
contraction_dict = {"ain't": "are not", "'s":" ","´s": " ", "’s": " ", "aren't": "are not", "-": " ", "–": " ", "t's": "it is"}
# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contraction_dict.keys()))
def expand_contractions(text,contractions_dict=contraction_dict):
    def replace(match):
        return contraction_dict[match.group(0)]
    return contractions_re.sub(replace, text)
# Expanding Contractions in the reviews
game_data['detailed_description']=game_data['detailed_description'].apply(lambda x:expand_contractions(x))

In [27]:
game_data['detailed_description'] = game_data['detailed_description'].str.lower()

In [28]:
#remove punctuation
import string

game_data['detailed_description'] = game_data['detailed_description'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '' , x))

In [29]:
#remove words and digits
game_data['detailed_description'] = game_data['detailed_description'].apply(lambda x: re.sub('W*dw*','',x))

In [31]:
#stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])
game_data['detailed_description'] = game_data['detailed_description'].apply(lambda x: stem_words(x))

In [32]:
game_data['detailed_description'] = game_data['detailed_description'].apply(lambda text: re.sub(' +', ' ', text))

In [33]:
game_data.detailed_description.values[0]

'join the iscor commun click here to join us on iscor nova rift is a veri commun riven project we take all feeback serious an a lot of the featur in the work are from player requestsjoin the local effort click here to help us local nova rift were use localizor to help the commun translat the game into as mani languag as possibleabout the game surviv evolv omin nova rift is a rogu lite space shooter that mel a classic arca experi with moern action rpg element like theori craft an eep player choic your ship rapili evolv as you efeat enemi allow you to shape it abil an weaponri to your esir in a matter of minut unlik mani arpg which can absorb hunr of hour of time nova rift allow you to take a ship buil from incept to execut in a singl game sessionfeatur creat a uniqu buil everi run game are quick allow for rapi experiment iter an avanc cun an creativ are rewar as you chain upgra moul for power synergi reminisc of eck buil game access over 200 moular upgra as you progress an expan your ar

In [34]:
# Example usage
texts = list(game_data.detailed_description.values)
embedded_texts = bert_embed_parallel(texts)

# save the embeddings
np.save('bert_embeddings.npy', embedded_texts)

  0%|          | 0/31 [00:00<?, ?it/s]

In [80]:
embedded_texts

array([[-0.5446055 , -0.2098972 ,  0.4673068 , ..., -0.20100197,
         0.08008831,  0.09644759],
       [-0.4219265 , -0.17000866,  0.5201603 , ..., -0.23345667,
         0.01888255,  0.15806876],
       [-0.42767268, -0.2635079 ,  0.5124908 , ..., -0.12442493,
         0.07272997,  0.03968874],
       ...,
       [-0.17478214, -0.0884595 ,  0.26388294, ..., -0.07933448,
        -0.02315191,  0.01948774],
       [-0.41963318, -0.25054717,  0.5249109 , ...,  0.00641501,
         0.05858753, -0.03804968],
       [-0.23880696, -0.10534985,  0.3491846 , ..., -0.03476592,
        -0.01631088,  0.05275644]], dtype=float32)

In [82]:
train.head()

Unnamed: 0,id,steamid,appid,voted_up,weighted_vote_score,playtime_forever,playtime_at_review,num_games_owned,num_reviews,review
1268226,1268226,174777,1370,1,0.0,0.00077,0.000877,0.010908,0.010315,7/10
2198689,2198689,130484,3600,1,0.482375,,,,,yes
1728340,1728340,78261,1221,0,0.0,,,,,"Was going to buy all dlcs during crime fest, b..."
2242038,2242038,172812,7110,1,0.52381,,,,,i brought SOTTER Digital Croft Edition.!\nThe ...
87299,87299,145419,84,1,0.52381,0.00055,0.000262,0.01777,0.002292,ode to joy will forever be the peggle song in ...


In [84]:
# ––
# THIS IS MOST INEFFICIENT WAY TO DO THIS
# THINK MAYBE HOW TO PARALLELIZE THIS
# BY CHANGING THE FOR LOOP TO LINEAR ALGEBRA
# ––


class BertSimRating:
    def __init__(self):
        self.user_emb = {}
        self.user_mean = None
        self.default_rating = None
        self.appid_mean = None  

    def fit(self, X, y):
        print("Calculating user embeddings...")
        # Calculate default rating
        global_zeros = np.sum(y==0)
        global_ones = np.sum(y==1)
        self.default_rating = 1 if global_ones >= global_zeros else 0

        self.user_mean = X.groupby('steamid')['voted_up'].mean()
        self.appid_mean = X.groupby('appid')['voted_up'].mean()
        for steamid, group in tqdm_notebook(X.groupby('steamid')):
            selection = game_data.appid.isin(group.appid)
            if selection.sum() == 0:
                continue
            emb = embedded_texts[selection].mean(axis=0)
            self.user_emb[steamid] = emb / np.linalg.norm(emb)

    def predict(self, X):
        print("Predicting...")
        y_pred = []
        count = 0
        
        for steamid, appid in tqdm_notebook(zip(X.steamid, X.appid), total=X.shape[0]):
            if steamid in self.user_emb and appid in game_data.appid:
                user_emb = self.user_emb[steamid]
                game_emb = embedded_texts[appid]
                cos_sim = np.dot(user_emb, game_emb) / np.linalg.norm(game_emb)
                y_pred.append(cos_sim)
            else:
                y_pred.append(self.user_mean.get(steamid, self.appid_mean.get(appid, self.default_rating))) # if there is an unseen value get the avg vote for that game if cant get that either get random int
                count = count +1
            
        y_pred_binary = np.array([1 if score > 0.5 else 0 for score in y_pred])
        print(f"used the fallback {count} times")
        print(f"n of actually handled {len(X) - count}")
        print(f"% of data predicted on genres{count/len(X)}")

        return np.array(y_pred_binary)

In [85]:
bert_sim_rating = BertSimRating()
bert_sim_rating.fit(train, train.voted_up)

Calculating user embeddings...


  0%|          | 0/176241 [00:00<?, ?it/s]

In [86]:

y_pred = bert_sim_rating.predict(eval)
mae = mean_absolute_error(eval.voted_up, y_pred)
add_res('BERT Similiraty Rating', mae)

Predicting...


  0%|          | 0/357102 [00:00<?, ?it/s]

used the fallback 118095 times
n of actually handled 239007
% of data predicted on genres0.3307038325184401


Unnamed: 0,Model,f0.5_score
0,BERT Similiraty Rating,0.166734
0,BERT Similiraty Rating,0.169181
0,BERT Similiraty Rating,0.169181
0,BERT Similiraty Rating,0.169181
0,BERT Similiraty Rating,0.226274
0,BERT Similiraty Rating,0.226274


In [88]:
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [89]:
output_useravg = pd.DataFrame({'id': eval['id'], 'voted_up': y_pred})
output_useravg.to_csv('bert_model_eval.csv', index = False)

In [70]:
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [71]:
eval.voted_up

586840     1
1998563    1
1415215    1
464540     1
2242781    1
          ..
1982957    1
1451894    1
649497     1
395158     0
2513149    1
Name: voted_up, Length: 357102, dtype: int64

In [40]:
wadeva = y_pred
print(wadeva)

[0.95640105 1.         0.95964688 ... 0.97580111 0.93324137 0.85714286]


In [47]:
for entry in range(0, len(wadeva)):
    if wadeva[entry] > 0.5:
        wadeva[entry] = 1
    else:
        wadeva[entry] = 0


print(wadeva)
print(type(wadeva))

[1. 1. 1. ... 1. 1. 1.]
<class 'numpy.ndarray'>


In [46]:
mae = mean_absolute_error(eval.voted_up, wadeva)

add_res('BERT Similiraty Rating', mae)

Unnamed: 0,Model,f0.5_score
0,BERT Similiraty Rating,0.169181
0,BERT Similiraty Rating,0.226274
0,BERT Similiraty Rating,0.226274


In [68]:
y_pred_2 = bert_sim_rating.predict(test)


Predicting...


  0%|          | 0/657469 [00:00<?, ?it/s]

used the fallback 227185 times
n of actually handled 430284
% of data predicted on genres0.3455448089567721


In [73]:
output_useravg = pd.DataFrame({'id': test['id'], 'voted_up': y_pred_2})
output_useravg.to_csv('bert_model.csv', index = False)

In [69]:
y_pred

array([1, 1, 1, ..., 1, 1, 1])