In [6]:
# Mount Google Drive at /content/drive (a remount is forced via force_remount=True).
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

# Switch the working directory to the project folder on the mounted drive.
import os
os.chdir('/content/drive/MyDrive/major_project_cmpt_733')

Mounted at /content/drive


In [7]:
import pandas as pd

# Read the games table; the first CSV column is used as the row index.
games_df = pd.read_csv(
    '/content/drive/MyDrive/major_project_cmpt_733/finalData.csv',
    index_col=0,
)

# Report how many rows were read.
game_count = len(games_df)
print('Number of games loaded: %s ' % game_count, '\n')

# Display the data: every row except the last five (negative n drops from the end).
games_df.head(-5)

Number of games loaded: 1273  



Unnamed: 0,Title,Link,Genre,Developer,Publisher,Released Date,Plots,Price,Rating
0,1-2-Switch,/wiki/1-2-Switch,Party,Nintendo EPD,Nintendo,"March 3, 2017",1-2-Switch is a party game in which players d...,37.971064,8.147500
1,10 Second Ninja X,/wiki/10_Second_Ninja_X,Action platformer,Four Circle Interactive,Thalamus Digital,"July 30, 2021",10 Second Ninja X is a sidescrolling puzzle p...,23.593198,7.333574
2,13 Sentinels Aegis Rim,/wiki/13_Sentinels_Aegis_Rim,Adventure,Vanillaware,Atlus,"April 12, 2022",13 Sentinels: Aegis Rim is a video game where...,39.990000,8.785000
3,140,/wiki/140_(video_game),Action,Carlsen Games,Carlsen Games,"January 9, 2020","As described by Carlsen, 140 is ""an old schoo...",23.593198,7.333574
4,198X,/wiki/198X,Arcade,Hi-Bit Studios,JP,"January 23, 2020",In an introductory sequence entitled Beating ...,24.576316,7.439789
...,...,...,...,...,...,...,...,...,...
1263,Ys Origin,/wiki/Ys_Origin,Action role-playing,Nihon Falcom,Nihon Falcom,"October 1, 2020",Ys Origin features three characters with vari...,17.990000,8.400000
1264,Ys IX Monstrum Nox,/wiki/Ys_IX_Monstrum_Nox,Action role-playing,Nihon Falcom,JP,"July 6, 2021",Ys IX: Monstrum Nox is an action role-playing...,24.576316,7.439789
1265,Yu-Gi-Oh! Master Duel,/wiki/Yu-Gi-Oh!_Master_Duel,Card battle,Konami,Konami,"January 18, 2022",and Structure The game is a direct translatio...,23.593198,7.330000
1266,Yu-Gi-Oh! Rush Duel Dawn of the Battle Royale,/wiki/Yu-Gi-Oh!_Rush_Duel_Saikyo_Battle_Royale,Card battle,Konami,Konami,"August 12, 2021",Games,15.992500,6.745000


Question 1: Which games should be recommended to students, based on how similar their plots are to the games the students have played previously?

In [8]:
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
nltk.download('punkt')
import re
from nltk.stem.snowball import SnowballStemmer
#nltk.download('punkt')

# Create an English language SnowballStemmer object.
# It is module-level state: tokenize_and_stem() looks up this `stemmer` name when called.
stemmer = SnowballStemmer("english")

# Define a function to perform both stemming and tokenization
def tokenize_and_stem(text):
    """Tokenize ``text`` into words and return their stems.

    The text is split into sentences, each sentence into word tokens, tokens
    without at least one ASCII letter (pure numbers, punctuation) are dropped,
    and every remaining token is stemmed with the module-level ``stemmer``.

    Args:
        text: The raw document string to process.

    Returns:
        A list of stemmed tokens, in their order of appearance.
    """
    # Flatten the sentence -> word tokenization into a single token stream.
    tokens = [
        word
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    ]

    # Keep only tokens containing a letter, then stem each survivor.
    return [
        stemmer.stem(token)
        for token in tokens
        if re.search('[a-zA-Z]', token)
    ]


# Create a TfidfVectorizer object with parameters.
# min_df / max_df are document-frequency proportions here: a term is kept only
# if it occurs in at least 30% and at most 80% of the plots.
# NOTE(review): min_df=0.3 is a strict floor and likely leaves a very small
# vocabulary -- confirm this is intended.
# NOTE(review): stop_words='english' is matched against the stemmed tokens
# produced by tokenize_and_stem, so unstemmed stop words may not line up.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=5000,
                                   min_df=0.3, stop_words='english',
                                   use_idf=True, tokenizer=tokenize_and_stem,
                                   # The default token_pattern is unused once a custom
                                   # tokenizer is supplied; None avoids sklearn's
                                   # "token_pattern will not be used" warning.
                                   token_pattern=None,
                                   ngram_range=(1, 3))

# Fit and transform the tfidf_vectorizer (the Series is iterable as-is,
# no intermediate list copy is needed)
tfidf_matrix = tfidf_vectorizer.fit_transform(games_df["Plots"])


# Number of recommendations generated for every game
TOP_K = 5

# Get the game titles from the dataframe
game_titles = games_df['Title'].values
n_games = len(game_titles)

# Pairwise cosine similarity between all plots, computed once up front
# instead of one row at a time inside the loop
plot_similarities = cosine_similarity(tfidf_matrix)

# Per-game mean pairwise similarity among that game's recommendations
intra_list_similarities = np.zeros((n_games,))

# Recommendation lists (row indices), one per game
inter_list_similarities = []

# Loop through each game
for i in range(n_games):
    # Rank all games by similarity to game i, highest first. The game itself
    # is removed by index rather than by assuming it sorts into position 0:
    # under ties (for example identical or all-zero TF-IDF rows) another game
    # can sort first and the game would then recommend itself. The stable
    # sort keeps the order among ties deterministic (ascending row index).
    ranked_indices = np.argsort(-plot_similarities[i], kind='stable')
    similar_games_indices = ranked_indices[ranked_indices != i][:TOP_K]

    # Intra-list similarity: mean over the distinct pairs of recommended games
    list_similarities = plot_similarities[np.ix_(similar_games_indices, similar_games_indices)]
    pair_index = np.triu_indices(len(similar_games_indices), k=1)
    intra_list_similarities[i] = list_similarities[pair_index].mean()

    # Append the recommended games to the inter-list similarities list
    inter_list_similarities.append(similar_games_indices.tolist())

# Flatten the recommendation lists to a single list of row indices
inter_list_indices = list(itertools.chain.from_iterable(inter_list_similarities))

# Mean TF-IDF vector over every recommended game (repeated games count repeatedly)
tfidf_array = tfidf_matrix.toarray()
mean_array = tfidf_array[inter_list_indices].mean(axis=0)

# Average intra-list similarity, and average similarity of the mean
# recommended vector to every game in the dataset
avg_intra_list_similarity = intra_list_similarities.mean()
avg_inter_list_similarity = cosine_similarity(mean_array.reshape(1, -1), tfidf_array)[0].mean()

# Ratio of intra-list to inter-list similarity, reported below as "Accuracy"
# (a similarity ratio, not a classification accuracy)
accuracy = avg_intra_list_similarity / avg_inter_list_similarity

print("Intra-list similarity:", avg_intra_list_similarity)
print("Inter-list similarity:", avg_inter_list_similarity)
print("Accuracy:", accuracy)

input_title = "Ys Origin"
num_recommendations = 5

# Locate the input game; fail with a clear message instead of an opaque
# IndexError when the title is not in the dataset
matching_rows = np.flatnonzero(game_titles == input_title)
if matching_rows.size == 0:
    raise ValueError(f"Game title not found in dataset: {input_title!r}")
input_index = matching_rows[0]

# Compute the cosine similarity between the input game and all games in the dataframe
input_cos_similarities = cosine_similarity(tfidf_matrix[input_index], tfidf_matrix)[0]

# Rank by similarity (highest first) and drop the input game by index. Dropping
# "whatever sorts first" is not safe: under ties another game can take position 0
# and the input game would then be recommended to itself.
ranked_indices = np.argsort(-input_cos_similarities, kind='stable')
similar_games_indices = ranked_indices[ranked_indices != input_index][:num_recommendations]

# Print the recommended game titles
print("Recommended games for", input_title + ":\n")
for rank, game_index in enumerate(similar_games_indices, start=1):
    print(str(rank) + ". " + game_titles[game_index])



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Intra-list similarity: 0.7223064456125664
Inter-list similarity: 0.5363798361565234
Accuracy: 1.3466323618507294
Recommended games for Ys Origin:

1. Rage of the Dragons
2. BlazBlue Cross Tag Battle
3. Romancing SaGa 3
4. Triangle Strategy
5. Castle Crashers Remastered


Question 2: How can the SFU gaming society optimize its game purchasing costs by making use of the games that students are currently playing?

In [9]:

import nltk
import re
import nltk
nltk.download('punkt')
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage, fcluster, cophenet, dendrogram
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from nltk.stem.snowball import SnowballStemmer
from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity



# Create an English language SnowballStemmer object
# (binds the module-level `stemmer` name that tokenize_and_stem() reads when called).
stemmer = SnowballStemmer("english")

# Define a function to perform both stemming and tokenization
def tokenize_and_stem(text):
    """Tokenize ``text`` into words and return their stems.

    The text is split into sentences, each sentence into word tokens, tokens
    without at least one ASCII letter (pure numbers, punctuation) are dropped,
    and every remaining token is stemmed with the module-level ``stemmer``.

    Args:
        text: The raw document string to process.

    Returns:
        A list of stemmed tokens, in their order of appearance.
    """
    # Flatten the sentence -> word tokenization into a single token stream.
    tokens = [
        word
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    ]

    # Keep only tokens containing a letter, then stem each survivor.
    return [
        stemmer.stem(token)
        for token in tokens
        if re.search('[a-zA-Z]', token)
    ]


# TfidfVectorizer with hyperparameters.
# min_df / max_df are document-frequency proportions here: a term is kept only
# if it occurs in at least 49% and at most 80% of the plots.
# NOTE(review): min_df=0.49 is a very strict floor and likely leaves only a
# handful of terms for the PCA step -- confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=5010,
                                   min_df=0.49, stop_words='english',
                                   use_idf=True, tokenizer=tokenize_and_stem,
                                   # The default token_pattern is unused once a custom
                                   # tokenizer is supplied; None avoids sklearn's
                                   # "token_pattern will not be used" warning.
                                   token_pattern=None,
                                   ngram_range=(1,3))


# Fit and transform the tfidf_vectorizer
tfidf_matrix = tfidf_vectorizer.fit_transform(games_df["Plots"])

# Project the dense TF-IDF matrix onto three principal components
pca = PCA(n_components=3)
dense_tfidf = tfidf_matrix.toarray()
tfidf_matrix_pca = pca.fit_transform(dense_tfidf)

# Build the hierarchical-clustering merge tree with Ward linkage
mergings = linkage(tfidf_matrix_pca, method='ward')

# Leaf labels for the dendrogram, plus a title -> row position lookup
# (for a repeated title the last position wins)
titles = games_df['Title'].values
title_indices = {game_title: position for position, game_title in enumerate(titles)}

# Draw the merge tree with one labelled leaf per game
dendrogram_ = dendrogram(mergings,
                         labels=titles,
                         leaf_rotation=90,
                         leaf_font_size=16)

# Resize the figure
fig = plt.gcf()
fig.set_size_inches(108, 21)

# Color red every leaf label whose text is a known game title
ax = plt.gca()
for tick_label in ax.get_xticklabels():
    if tick_label.get_text() in title_indices:
        tick_label.set_color('red')

# Show the plot
plt.show()

# Cophenetic correlation between the merge tree and the pairwise distances
# of the PCA-reduced vectors
pairwise_distances = pdist(tfidf_matrix_pca)
c, coph_dists = cophenet(mergings, pairwise_distances)
print("The cophenetic correlation coefficient is:", c)

# Cut the tree into flat clusters ('maxclust' criterion with n_clusters as the limit)
n_clusters = 25
cluster_labels = fcluster(mergings, t=n_clusters, criterion='maxclust')

# Store each game's cluster label on the games dataframe
games_df['Cluster'] = cluster_labels

# Cluster-quality scores computed on the PCA-reduced vectors
silhouette_avg = silhouette_score(tfidf_matrix_pca, cluster_labels)
ch_score = calinski_harabasz_score(tfidf_matrix_pca, cluster_labels)

print("Silhouette score:", silhouette_avg)
print("Calinski-Harabasz index:", ch_score)

# Recommendation example for optimised pricing with input from user
title = '1-2-Switch'

# Find the cluster for the user's input title. Use a single .loc lookup and
# fail with a clear message instead of an opaque IndexError when the title
# is not in the dataset.
title_clusters = games_df.loc[games_df['Title'] == title, 'Cluster']
if title_clusters.empty:
    raise ValueError(f"Game title not found in dataset: {title!r}")
user_cluster = title_clusters.iloc[0]

# Games in the same cluster as the input title, cheapest first.
# The input title belongs to its own cluster, so it can appear in the list.
cluster_games = games_df[games_df['Cluster'] == user_cluster].sort_values(by='Price')

# Get the 5 games with the lowest price
recommended_games = cluster_games[['Title', 'Price']].iloc[:5].values.tolist()

# Print the recommended games
print("Recommended games:")
for game_title, game_price in recommended_games:
    print(f"Title: {game_title}\tPrice: {game_price}")

# Display the games in each cluster (labels run from 1 to n_clusters)
for cluster_id in range(1, n_clusters + 1):
    print(f"Games in Cluster {cluster_id}:")
    cluster_titles = games_df.loc[games_df['Cluster'] == cluster_id, 'Title']
    print(cluster_titles.values)



Output hidden; open in https://colab.research.google.com to view.

Question 3: How can we determine whether a game shortlisted by the SFSS society, optimised according to price, will be successful or not?

In [10]:
# How can we determine whether a game shortlisted by SFSS society which is optimised according to price, will be successful or not?
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Parse "Released Date" into datetimes. The column is overwritten in place, so
# the parsing is skipped when it is already datetime-typed: re-running this
# cell would otherwise stringify the datetimes, the pattern below would match
# nothing, and every date (and year) would silently become missing.
released = games_df["Released Date"]
if not pd.api.types.is_datetime64_any_dtype(released):
    # Convert to a string format if necessary
    if released.dtype != "object":
        released = released.astype(str)

    # Keep only the "<Month> <day>, <year>" part of each entry. A raw string
    # is used so that \w, \s and \d are not treated as (invalid) string escapes.
    released = released.str.extract(r'(\w+\s\d{1,2},\s\d{4})', expand=False)

    # Convert "Released Date" to datetime format
    games_df["Released Date"] = pd.to_datetime(released)

# Extract year from "Released Date"
games_df["Year"] = games_df["Released Date"].dt.year

# One-hot encode each categorical column; the dummy columns are prefixed
# with the name of the column they come from
genre_dummies, developer_dummies, publisher_dummies = (
    pd.get_dummies(games_df[column], prefix=column)
    for column in ("Genre", "Developer", "Publisher")
)

# Feature matrix: the numeric columns followed by the three encoded blocks
feature_blocks = [games_df[["Year", "Price"]], genre_dummies, developer_dummies, publisher_dummies]
X = pd.concat(feature_blocks, axis=1)

# Define the target variable based on the rating: 1 when Rating >= threshold, else 0
threshold = 7
y = (games_df["Rating"] >= threshold).astype(int)

# Split the data into training and testing sets BEFORE imputing, so that the
# test rows cannot influence the imputation means (avoids train/test leakage)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values using column means learned from the training rows only,
# then apply those same means to the test rows
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Keep X as a fully imputed array as well, in case it is used after this point
X = imputer.transform(X)

# Fit a depth-limited decision tree on the training split to predict the success label
model = DecisionTreeClassifier(max_depth=4, random_state=42)
model.fit(X_train, y_train)

# Predict the held-out test split
y_pred = model.predict(X_test)

# Report each evaluation score for the test split
print("Prediction Games evaluation matrices:")
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))


Prediction Games evaluation matrices:
Precision: 0.7458333333333333
Recall: 0.9675675675675676
F1 score: 0.8423529411764705
