# Database Creation

In [70]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType, DecimalType, ArrayType, BinaryType
from pyspark.sql.functions import max, min
from pyspark.sql.functions import avg
from pyspark.sql.functions import explode, split, count, collect_set
from pyspark.sql.functions import sum

# Start (or reuse) the Spark session used by the queries below
spark = (
    SparkSession.builder
    .appName("Data Processing with Spark")
    .getOrCreate()
)

# Columns of the reviews table; every column is nullable.
# DatePosted stays a string (values look like "Posted: February 9").
review_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("ReviewText", StringType()),
        ("Review", StringType()),
        ("ReviewLength", IntegerType()),
        ("PlayHours", FloatType()),
        ("DatePosted", StringType()),
        ("Game", StringType()),
    )
])

# Columns of the pricing table; every column is nullable.
pricing_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Currency", StringType()),
        ("Current Price", FloatType()),
        ("Converted Price", FloatType()),
        ("Lowest Recorded Price", FloatType()),
        ("Game", StringType()),
    )
])

# Columns of the game-information table; every column is nullable.
# Tags and Categories are kept as single comma-separated strings.
info_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Game", StringType()),
        ("Developer", StringType()),
        ("In-Game Count", IntegerType()),
        ("Tags", StringType()),
        ("Categories", StringType()),
    )
])

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/12 16:31:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [71]:
# Read the three parquet files, applying the explicit schema declared for each
df_reviews, df_pricing, df_info = (
    spark.read.schema(table_schema).parquet(parquet_path)
    for table_schema, parquet_path in (
        (review_schema, '../Data Cleaning/df_combined.parquet'),
        (pricing_schema, '../Data Cleaning/pricing_combined.parquet'),
        (info_schema, '../Data Cleaning/combined_info.parquet'),
    )
)

In [72]:
# Show the distinct game names of each DataFrame (reviews, pricing, info, in that order)
for table in (df_reviews, df_pricing, df_info):
    table.select("Game").distinct().show()

                                                                                

+----------------+
|            Game|
+----------------+
|lethal companies|
|        palworld|
|       craftopia|
+----------------+

+----------------+
|            Game|
+----------------+
|lethal companies|
|        palworld|
|       craftopia|
+----------------+

+----------------+
|            Game|
+----------------+
|lethal companies|
|        palworld|
|       craftopia|
+----------------+



In [100]:
# Show the first three rows of the df_pricing DataFrame without truncating cell values
df_pricing.show(3, truncate=False)

+------------------+-------------+---------------+---------------------+----------------+
|Currency          |Current Price|Converted Price|Lowest Recorded Price|Game            |
+------------------+-------------+---------------+---------------------+----------------+
|British Pound     |8.5          |8.5            |5.953                |Lethal Companies|
|Russian Ruble     |385.0        |3.29           |2.3                  |Lethal Companies|
|South African Rand|100.0        |4.24           |2.97                 |Lethal Companies|
+------------------+-------------+---------------+---------------------+----------------+
only showing top 3 rows



In [101]:
df_reviews.show(3)

+--------------------+-----------+------------+---------+-------------------+---------+
|          ReviewText|     Review|ReviewLength|PlayHours|         DatePosted|     Game|
+--------------------+-----------+------------+---------+-------------------+---------+
|30H on steam (cur...|Recommended|         205|     63.7| Posted: February 9|Craftopia|
|Craftopia is an a...|Recommended|        1080|     35.0|Posted: February 19|Craftopia|
|                 gud|Recommended|          20|     22.7|   Posted: March 25|Craftopia|
+--------------------+-----------+------------+---------+-------------------+---------+
only showing top 3 rows



In [103]:
df_info.show(3)

+----------------+----------+-------------+--------------------+--------------------+
|            Game| Developer|In-Game Count|                Tags|          Categories|
+----------------+----------+-------------+--------------------+--------------------+
|        Palworld|Pocketpair|        60281| Multiplayer,  Op...|Single-player, On...|
|       Craftopia|Pocketpair|          335| Open World,  Cra...|Single-player, On...|
|Lethal Companies|  Zeekerss|        22876| Online CoOp,  Ho...|Single-player, On...|
+----------------+----------+-------------+--------------------+--------------------+



In [112]:
# Count the rows of each of the three DataFrames
num_rows1, num_rows2, num_rows3 = (
    frame.count() for frame in (df_reviews, df_pricing, df_info)
)

# Report the counts, one line per DataFrame
for label, row_total in (
    ("Number of rows in df_reviews:", num_rows1),
    ("Number of rows in df_pricing:", num_rows2),
    ("Number of rows in df_info:", num_rows3),
):
    print(label, row_total)

Number of rows in df_reviews: 450
Number of rows in df_pricing: 123
Number of rows in df_info: 3


# Some Sample Queries

## Find the maximum and minimum of prices for each game in pounds

In [114]:
# Highest and lowest "Converted Price" per game.
# `max` and `min` here are the pyspark.sql.functions aggregates imported at the
# top of the notebook, not the Python builtins.
pricing_by_game = df_pricing.groupBy("Game")
price_range = pricing_by_game.agg(
    max("Converted Price").alias("Max Price"),
    min("Converted Price").alias("Min Price"),
)
price_range.show()

+----------------+---------+---------+
|            Game|Max Price|Min Price|
+----------------+---------+---------+
|       Craftopia|    14.76|     4.02|
|        Palworld|     26.5|     7.47|
|Lethal Companies|     9.66|     3.29|
+----------------+---------+---------+



## Average Play Hours per Game

In [115]:
# Calculating average play hours per game
average_play_hours = df_reviews.groupBy("Game").agg(avg("PlayHours").alias("Average Play Hours"))
average_play_hours.show()

+----------------+------------------+
|            Game|Average Play Hours|
+----------------+------------------+
|       Craftopia|52.266000073452794|
|        Palworld| 72.92000024278958|
|Lethal Companies| 42.10600012938182|
+----------------+------------------+



## Listing Developers and Their Games

In [117]:
df_info.select("Developer", "Game").distinct().groupBy("Developer").agg(
    collect_set("Game").alias("Games")
).show(truncate=False)

+----------+---------------------+
|Developer |Games                |
+----------+---------------------+
|Zeekerss  |[Lethal Companies]   |
|Pocketpair|[Palworld, Craftopia]|
+----------+---------------------+



## Total review length per game

In [122]:
# Summing review lengths per game
df_reviews.groupBy("Game").agg(
    sum("ReviewLength").alias("Total Review Words")
).show()


+----------------+------------------+
|            Game|Total Review Words|
+----------------+------------------+
|       Craftopia|             45604|
|        Palworld|             16528|
|Lethal Companies|             12616|
+----------------+------------------+



## Counting total 'Recommended' and 'Not Recommended'

In [123]:
# Grouping by 'Game' and counting total 'Recommended' and 'Not Recommended'
df_reviews.groupBy("Game").pivot("Review").count().show()

+----------------+---------------+-----------+
|            Game|Not Recommended|Recommended|
+----------------+---------------+-----------+
|       Craftopia|             60|         90|
|        Palworld|              9|        141|
|Lethal Companies|              8|        142|
+----------------+---------------+-----------+



## Queries from three tables

In [124]:
# Create temporary views for each DataFrame
df_reviews.createOrReplaceTempView("reviews")
df_pricing.createOrReplaceTempView("pricing")
df_info.createOrReplaceTempView("info")


In [127]:
# Each table is reduced to one row per game BEFORE joining.
# Joining the raw tables first pairs every pricing row with every review row of
# the same game, and FIRST(Currency) over that product returns an arbitrary
# currency instead of the one that belongs to the maximum converted price.
# MAX_BY(Currency, `Converted Price`) returns the currency of the row holding
# the maximum (if several rows tie for the maximum, one of them is returned).
query = """
WITH price_stats AS (
    SELECT
        Game,
        MAX(`Converted Price`) AS Max_Converted_Price,
        MAX_BY(Currency, `Converted Price`) AS Currency
    FROM
        pricing
    GROUP BY
        Game
),
play_stats AS (
    SELECT
        Game,
        AVG(PlayHours) AS Average_PlayHours
    FROM
        reviews
    GROUP BY
        Game
),
developers AS (
    SELECT DISTINCT
        Game,
        Developer
    FROM
        info
)
SELECT
    p.Game,
    p.Max_Converted_Price,
    p.Currency,
    r.Average_PlayHours,
    d.Developer
FROM
    price_stats p
JOIN
    play_stats r ON p.Game = r.Game
JOIN
    developers d ON p.Game = d.Game
"""

result_df = spark.sql(query)
result_df.show(truncate=False)


+----------------+-------------------+-----------+------------------+----------+
|Game            |Max_Converted_Price|Currency   |Average_PlayHours |Developer |
+----------------+-------------------+-----------+------------------+----------+
|Craftopia       |14.76              |Swiss Franc|52.266000073452794|Pocketpair|
|Lethal Companies|9.66               |Swiss Franc|42.10600012938182 |Zeekerss  |
|Palworld        |26.5               |Swiss Franc|72.92000024278958 |Pocketpair|
+----------------+-------------------+-----------+------------------+----------+



In [128]:
spark.stop()

In [67]:
# Import the files
df_reviews = pd.read_parquet('../Data Cleaning/df_combined.parquet')
df_pricing = pd.read_parquet('../Data Cleaning/pricing_combined.parquet')
df_info = pd.read_parquet('../Data Cleaning/combined_info.parquet')

In [68]:
df_info

Unnamed: 0,Game,Developer,In-Game Count,Tags,Categories
0,Lethal Companies,Zeekerss,22876,"Online CoOp, Horror, FirstPerson, Coop, S...","Single-player, Online Co-op, Family Sharing, E..."
1,Craftopia,Pocketpair,335,"Open World, Crafting, Survival, Multiplaye...","Single-player, Online Co-op, LAN Co-op, Steam ..."
2,Palworld,Pocketpair,60281,"Multiplayer, Open World, Survival, Creatur...","Single-player, Online Co-op, Steam Achievement..."


# RAG and LLM Implementation

In [1]:
import pandas as pd
import requests
import json
import numpy as np
import string
import re
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import Markdown, display

In [43]:
# Import the files
df_reviews = pd.read_parquet('../Data Cleaning/df_combined.parquet')
df_pricing = pd.read_parquet('../Data Cleaning/pricing_combined.parquet')
df_info = pd.read_parquet('../Data Cleaning/combined_info.parquet')

In [44]:
# Read the LLM gateway key from the LLM_API_KEY environment variable so the
# secret is never stored in the notebook.
# NOTE(review): the key that used to be hard-coded here has been committed in
# plain text and should be treated as leaked and rotated.
api_key = os.environ.get("LLM_API_KEY", "")
if not api_key:
    print("LLM_API_KEY is not set; requests to the LLM endpoint will be sent without a key.")

In [45]:
def get_resp_oai(input_text, model, system_prompt=None, timeout=(10, 300)):
    """Send one user message to the LLM gateway and return the reply.

    Parameters
    ----------
    input_text : str
        Content of the user message.
    model : str
        Model identifier forwarded to the gateway.
    system_prompt : str, optional
        Content of the system message. When omitted, the notebook's original
        prompt is used, so existing calls behave as before.
    timeout : float or (float, float), optional
        Passed to ``requests.post``. Without a timeout the call can block
        forever if the gateway stops responding. The default allows 10 s to
        connect and 300 s for the reply.

    Returns
    -------
    str or dict
        On HTTP 200, the markdown string built by ``extract_message_oai``.
        On any other status, a dict ``{"statusCode": ..., "body": ...}``;
        callers that concatenate the result with a string must check for this.

    Raises
    ------
    requests.exceptions.RequestException
        On connection errors or when the timeout expires.
    """
    if system_prompt is None:
        system_prompt = "You are a Financial Analyst at a Stock Market Firm. You are good at coding and solving coding errors."

    url = "https://llm.api.ai8.io/query_llm"
    data = {
        # Specify the model that you want to use
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": input_text},
        ],
    }
    headers = {'Authorization': api_key}
    response = requests.post(url, json=data, headers=headers, timeout=timeout)

    # Non-200 responses are handed back to the caller instead of raising
    if response.status_code != 200:
        return {"statusCode": response.status_code, "body": response.content}

    response_data = json.loads(response.content)
    return extract_message_oai(response_data)

def extract_message_oai(response_data):
    """Return the first choice's message content wrapped in markdown rules.

    Parameters
    ----------
    response_data : dict
        Decoded JSON response; the text is read from
        ``response_data["choices"][0]["message"]["content"]``.

    Returns
    -------
    str
        ``"---\\n\\n<content>\\n\\n---"``. If ``choices`` is missing or empty,
        or the message/content is missing or ``None``, the content is treated
        as an empty string instead of raising ``IndexError``/``TypeError``.
    """
    choices = response_data.get("choices") or []
    first_choice = choices[0] if choices else {}
    message_content = (first_choice.get("message") or {}).get("content") or ""
    # format the extracted message as markdown
    markdown_content = "---\n\n" + message_content + "\n\n---"
    return markdown_content
    

This cell shows the basic idea of how the 'context' is passed to the LLM.

In [46]:
question = """
Who is the developer of Palworld?
"""
context = '''
Game	Developer	In-Game Count	Tags	Categories
0	Lethal Companies	Zeekerss	22876	Online CoOp, Horror, FirstPerson, Coop, S...	Single-player, Online Co-op, Family Sharing, E...
1	Craftopia	Pocketpair	335	Open World, Crafting, Survival, Multiplaye...	Single-player, Online Co-op, LAN Co-op, Steam ...
2	Palworld	Pocketpair	60281	Multiplayer, Open World, Survival, Creatur...	Single-player, Online Co-op, Steam Achievement..."
'''

full = f"Based on this:'{context}', answer this: {question}, if the information is not from context, say 'I don't know'"

msg_0 = get_resp_oai(full, "gpt-4-0125-preview")

display(Markdown("<div style='color: #34568B;'>\n\n" + msg_0))

<div style='color: #34568B;'>

---

The developer of Palworld is Pocketpair.

---

In [47]:
df_info

Unnamed: 0,game,developer,in-game count,tags,categories
0,lethal companies,zeekerss,22876,"online coop, horror, firstperson, coop, s...","single-player, online co-op, family sharing, e..."
1,craftopia,pocketpair,335,"open world, crafting, survival, multiplaye...","single-player, online co-op, lan co-op, steam ..."
2,palworld,pocketpair,60281,"multiplayer, open world, survival, creatur...","single-player, online co-op, steam achievement..."


In [48]:
df_info.to_csv("df4.txt", sep='\t', index=False)

In [49]:
df_pricing

Unnamed: 0,currency,current price,converted price,lowest recorded price,game
0,british pound,22.490999,22.49,22.490999,palworld
1,south asia - usd,9.441000,7.47,7.470000,palworld
2,russian ruble,99010.000000,8.47,8.470000,palworld
3,south african rand,243.001007,10.31,10.310000,palworld
4,cis - u.s. dollar,13.041000,10.32,10.320000,palworld
...,...,...,...,...,...
118,u.s. dollar,9.990000,7.90,5.530000,lethal companies
119,norwegian krone,,8.12,5.680000,lethal companies
120,euro,,8.36,5.850000,lethal companies
121,polish zloty,,9.21,6.450000,lethal companies


In [50]:
# Save the DataFrame to a CSV file
df_pricing.to_csv("df5.txt", sep='\t', index=False)

In [51]:
df_reviews

Unnamed: 0,reviewtext,review,reviewlength,playhours,dateposted,game
0,good game,recommended,25,34.099998,posted: april 5,lethal companies
1,"very lethal, 10/10",recommended,33,19.900000,posted: april 5,lethal companies
2,pretty fun game especially whit mods and frien...,recommended,99,48.700001,posted: april 5,lethal companies
3,good,recommended,21,33.900002,posted: april 5,lethal companies
4,"i saw my friend being eaten alive, while my ot...",recommended,84,58.400002,posted: april 5,lethal companies
...,...,...,...,...,...,...
445,there's not getting around it. it's breath of ...,recommended,250,13.600000,posted: february 7,craftopia
446,in ea since more than 3 years. didnt add much ...,not recommended,112,6.600000,posted: february 1,craftopia
447,considering the developers currently have 3 ea...,not recommended,315,27.200001,posted: january 25,craftopia
448,rest well my sweet prince. you were so fun bef...,not recommended,441,8.500000,posted: january 25,craftopia


In [52]:
# Save the DataFrame to a CSV file
df_reviews.to_csv("df_reviews.txt", sep='\t', index=False)

In [53]:
# Open and read the content of the file into a string variable
with open('df4.txt', 'r', encoding='utf-8') as file:
    df4 = file.read()

In [54]:
df4

'game\tdeveloper\tin-game count\ttags\tcategories\nlethal companies\tzeekerss\t22876\t online coop,  horror,  firstperson,  coop,  survival horror,  psychological horror,  exploration,  scifi,  funny,  pve,  atmospheric,  procedural generation,  aliens,  time management,  adventure,  dungeon crawler,  action,  early access,  actionadventure,  perma death\tsingle-player, online co-op, family sharing, early access, multi-player, co-op, partial controller support\ncraftopia\tpocketpair\t335\t open world,  crafting,  survival,  multiplayer,  open world survival craft,  rpg,  sandbox,  building,  online coop,  adventure,  action,  early access,  coop,  anime,  character customization,  indie,  hack and slash,  automation,  singleplayer,  third person\tsingle-player, online co-op, lan co-op, steam achievements, steam workshop, remote play on tablet, family sharing, early access, multi-player, co-op, partial controller support\npalworld\tpocketpair\t60281\t multiplayer,  open world,  survival

In [55]:
question = """
What are the tags of Palworld that are the same with lethal companies?
"""
context = df4

full = f"Based on this:'{context}', answer this: {question}, if the information is not from context, say 'I don't know'"

msg_1 = get_resp_oai(full, "gpt-4-0125-preview")

display(Markdown("<div style='color: #34568B;'>\n\n" + msg_1))

<div style='color: #34568B;'>

---

The tags of "Palworld" that are the same as those of "Lethal Companies" are: 
- coop
- adventure
- action
- early access

---

In [56]:
# Open and read the content of the file into a string variable
with open('df5.txt', 'r', encoding='utf-8') as file:
    df5 = file.read()

In [57]:
question = """
What are the country with the highest price converted of Craftopia?
"""
context = df5

full = f"Based on this:'{context}', answer this: {question}, if the information is not from context, say 'I don't know'"

msg_2 = get_resp_oai(full, "gpt-4-0125-preview")

display(Markdown("<div style='color: #34568B;'>\n\n" + msg_2))

<div style='color: #34568B;'>

---

The country with the highest converted price of Craftopia, based on the given data, is Switzerland, with a converted price of 14.76.

---

In [59]:
question = """
what is the name of the current London mayor?
"""
context = df5

full = f"Based on this:'{context}', answer this: {question}, if the information is not from context, say 'I don't know'"

msg_3 = get_resp_oai(full, "gpt-4-0125-preview")

display(Markdown("<div style='color: #34568B;'>\n\n" + msg_3))

<div style='color: #34568B;'>

---

I don't know.

---

In [60]:
# Open and read the content of the combined file into a string variable
with open('df_reviews.txt', 'r', encoding='utf-8') as file:
    test_review = file.read()

In [63]:
question = """
What is the longest playhours in the lethal companies?
"""
context = test_review

full = f"Based on this:'{context}', answer this: {question}, if the information is not from context, say 'I don't know'"

msg_4 = get_resp_oai(full, "gpt-4-0125-preview")

msg_4

{'statusCode': 400, 'body': b'{"message": "Input too long!"}'}

The input is too long, so the LLM cannot process it.

In [64]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define a function for text preprocessing
# Stop-word set and lemmatizer are built once, on first use, and then reused;
# building them inside preprocess_text repeated that work for every review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, creating it on first call."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES["stop_words"], _PREPROCESS_RESOURCES["lemmatizer"]


def preprocess_text(text):
    """Normalise one review text for the retrieval step.

    Steps, in order: lowercase; drop the emoticons ``:)``, ``:-)``, ``:(`` and
    ``:-(``; drop non-ASCII characters; drop the punctuation ``. , ! ? *``;
    tokenize; remove English stop words; lemmatize; join with single spaces.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example a missing value
        read from the DataFrame) is treated as empty text.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer = _get_preprocess_resources()

    # Lowercasing
    text = text.lower()
    # Remove emoticons
    text = re.sub(r':\)|:-\)|:\(|:-\(', '', text)
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Remove punctuation
    text = re.sub(r'[.,!?*]', '', text)

    # Tokenize, drop stop words, then lemmatize what is left
    kept_tokens = (token for token in word_tokenize(text) if token not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)

# Apply the preprocessing function to the review text column
df_reviews['reviewtext'] = df_reviews['reviewtext'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [65]:
df_reviews

Unnamed: 0,reviewtext,review,reviewlength,playhours,dateposted,game
0,good game,recommended,25,34.099998,posted: april 5,lethal companies
1,lethal 10/10,recommended,33,19.900000,posted: april 5,lethal companies
2,pretty fun game especially whit mod friend : (...,recommended,99,48.700001,posted: april 5,lethal companies
3,good,recommended,21,33.900002,posted: april 5,lethal companies
4,saw friend eaten alive friend hit griddy next,recommended,84,58.400002,posted: april 5,lethal companies
...,...,...,...,...,...,...
445,'s getting around 's breath wild rather breath...,recommended,250,13.600000,posted: february 7,craftopia
446,ea since 3 year didnt add much game buggy rele...,not recommended,112,6.600000,posted: february 1,craftopia
447,considering developer currently 3 early access...,not recommended,315,27.200001,posted: january 25,craftopia
448,rest well sweet prince fun seamless world upda...,not recommended,441,8.500000,posted: january 25,craftopia


The table is then split so that it fits within the LLM's context limit.

In [66]:
# Per game: number of 'recommended' reviews, mean review length and number of
# 'not recommended' reviews, computed in one named aggregation and returned
# with 'game' as a regular column.
aggregated_data = (
    df_reviews
    .groupby('game')
    .agg(
        total_recommend=('review', lambda verdicts: (verdicts == 'recommended').sum()),
        average_review_length=('reviewlength', 'mean'),
        total_not_recommend=('review', lambda verdicts: (verdicts == 'not recommended').sum()),
    )
    .reset_index()
)
aggregated_data

Unnamed: 0,game,total_recommend,average_review_length,total_not_recommend
0,craftopia,90,304.026667,60
1,lethal companies,142,84.106667,8
2,palworld,141,110.186667,9


In [67]:
aggregated_data.to_csv("df6.txt", sep='\t', index=False)

Split the reviews into several chunks, one per game.

In [68]:
# Distinct game names, in order of first appearance in df_reviews
game_names = df_reviews['game'].unique()

# Map "df1", "df2", ... to the rows of one game each, keeping only the
# game, play-hours and review-text columns
chunk_columns = ['game', 'playhours', 'reviewtext']
game_dfs = {
    f"df{position}": df_reviews.loc[df_reviews['game'] == game_name, chunk_columns]
    for position, game_name in enumerate(game_names, start=1)
}

# Print every per-game DataFrame, each followed by blank lines
for df_name, df_data in game_dfs.items():
    print(f"DataFrame: {df_name}", df_data, "\n", sep="\n")

# Write every per-game DataFrame to "<name>.txt" as tab-separated text without the index
for df_name, df_data in game_dfs.items():
    file_name = f"{df_name}.txt"
    df_data.to_csv(file_name, sep='\t', index=False)
    print(f"DataFrame '{df_name}' exported to '{file_name}'")


DataFrame: df1
                 game   playhours  \
0    lethal companies   34.099998   
1    lethal companies   19.900000   
2    lethal companies   48.700001   
3    lethal companies   33.900002   
4    lethal companies   58.400002   
..                ...         ...   
145  lethal companies   16.900000   
146  lethal companies  107.199997   
147  lethal companies   15.300000   
148  lethal companies   24.900000   
149  lethal companies   38.599998   

                                            reviewtext  
0                                            good game  
1                                         lethal 10/10  
2    pretty fun game especially whit mod friend : (...  
3                                                 good  
4        saw friend eaten alive friend hit griddy next  
..                                                 ...  
145                                    game make oiled  
146  slot machine go brrrr mod giant robot go brrrr...  
147                        

In [69]:
# Read each chunk into a separate variable.
# The encoding is stated explicitly, as in the other file reads of this notebook,
# so the result does not depend on the platform's default locale encoding.
with open('df1.txt', 'r', encoding='utf-8') as file:
    df1 = file.read()

with open('df2.txt', 'r', encoding='utf-8') as file:
    df2 = file.read()

with open('df3.txt', 'r', encoding='utf-8') as file:
    df3 = file.read()

with open('df6.txt', 'r', encoding='utf-8') as file:
    df6 = file.read()


In [71]:
question = """
What is the longest playhours in the lethal companies?
"""
context = df1

full = f"Based on this:'{context}', answer this: {question}, if the information is not from context, say 'I don't know'"

msg_5 = get_resp_oai(full, "gpt-4-0125-preview")

display(Markdown("<div style='color: #34568B;'>\n\n" + msg_5))

<div style='color: #34568B;'>

---

The longest playhours in the game "Lethal Companies" is 294.7 hours.

---

Now the input is no longer too long and can be processed.

In [72]:
question = """
How many recommend and not recommend are there in lethal companies?
"""
context = df6

full = f"Based on this:'{context}', answer this: {question}, if the information is not from context, say 'I don't know'"

msg_6 = get_resp_oai(full, "gpt-4-0125-preview")

display(Markdown("<div style='color: #34568B;'>\n\n" + msg_6))

<div style='color: #34568B;'>

---

For "lethal companies," there are 142 recommendations and 8 not recommendations.

---

# Embedding to automatically pick the best sources

In [74]:
# Candidate contexts, listed in the order used for the printed "dfN" label:
#   df1 = review of lethal companies
#   df2 = review of palworld
#   df3 = review of craftopia
#   df4 = information about three games in general
#   df5 = pricing of three games
#   df6 = total of recommend and not recommend of each game
sources = [df1, df2, df3, df4, df5, df6]
questions = "What is the highest playhours for Palworld?"

# Fit TF-IDF on the sources, then express the question in the same vocabulary
vectorizer = TfidfVectorizer()
source_vectors = vectorizer.fit_transform(sources)
question_vector = vectorizer.transform([questions])

# Cosine similarity between the question and every source
cos_similarities = cosine_similarity(question_vector, source_vectors)

# The source with the highest similarity wins
best_source_index = cos_similarities.argmax()
best_source = sources[best_source_index]

# Report the winner using its 1-based "dfN" label
print(f"Selected Source: df{int(best_source_index) + 1}")

Selected Source: df2


In [76]:
# make the code above into a function
def select_best_source(question, sources):
    """Return the entry of ``sources`` whose TF-IDF vector is closest to ``question``.

    A TF-IDF vectorizer is fitted on ``sources``, the question is transformed
    with the same vocabulary, and the source with the highest cosine
    similarity is returned. The 1-based position of the winner is printed as
    ``Selected Source: dfN``.

    Parameters
    ----------
    question : str
        The question to match.
    sources : list of str
        Candidate context texts.

    Returns
    -------
    str
        The selected element of ``sources``.
    """
    tfidf = TfidfVectorizer()
    corpus_matrix = tfidf.fit_transform(sources)
    query_row = tfidf.transform([question])

    # Index of the highest-scoring source
    winner = cosine_similarity(query_row, corpus_matrix).argmax()

    print(f"Selected Source: df{int(winner) + 1}")
    return sources[winner]

In [78]:
question = """
What is the highest playhours for Palworld?
"""
context = select_best_source(question, sources)

full = f"Based on this:'{context}', answer this: {question}, if the information is not from context, say 'I don't know'"

msg_7 = get_resp_oai(full, "gpt-4-0125-preview")

display(Markdown("<div style='color: #34568B;'>\n\n" + msg_7))

Selected Source: df2


<div style='color: #34568B;'>

---

The highest playhours for Palworld is 794.0.

---

# Doing the analysis using LLM with RAG

## Find the maximum and minimum of prices for each game in pounds

In [92]:
question = """
What is the maximum and minimum of prices for lethal companies, palworld and craftopia in converted price?
"""
context = select_best_source(question, sources)

full = f"Based on this:'{context}', answer this: {question}, if the information is not from context, say 'I don't know'"

msg = get_resp_oai(full, "gpt-4-0125-preview")

display(Markdown("<div style='color: #34568B;'>\n\n" + msg))

Selected Source: df5


<div style='color: #34568B;'>

---

To determine the maximum and minimum converted prices for "Lethal Companies," "Palworld," and "Craftopia" from the provided information, we will extract the relevant data and analyze it.

### For Lethal Companies:
- **Maximum Converted Price:** 9.66 (Swiss Franc)
- **Minimum Converted Price:** 3.29 (Russian Ruble)

### For Palworld:
- **Maximum Converted Price:** 26.5 (Swiss Franc)
- **Minimum Converted Price:** 7.47 (South Asia - USD)

### For Craftopia:
- **Maximum Converted Price:** 14.76 (Swiss Franc)
- **Minimum Converted Price:** 4.02 (South Asia - USD)

This analysis is based on the "converted price" column of each game category within the provided data.

---

## Average Play Hours per Game

In [94]:
question = """
What is the average play hours lethal companies?
"""
context = select_best_source(question, sources)

full = f"Based on this:'{context}', answer this: {question}, if the information is not from context, say 'I don't know'"

msg = get_resp_oai(full, "gpt-4-0125-preview")

display(Markdown("<div style='color: #34568B;'>\n\n" + msg))

Selected Source: df1


<div style='color: #34568B;'>

---

To find the average play hours for "Lethal Companies," we need to sum all the play hours listed for the game and then divide by the number of entries. Let me calculate that for you.

First, let's aggregate the play hours provided in the data.

The sum of the provided play hours is: 

34.1 + 19.9 + 48.7 + 33.9 + 58.4 + 19.6 + 4.6 + 27.9 + 12.7 + 6.1 + 123.2 + 40.3 + 10.4 + 9.6 + 77.4 + 163.3 + 9.7 + 13.7 + 1.0 + 34.7 + 93.7 + 26.5 + 78.4 + 15.2 + 12.8 + 63.8 + 63.9 + 5.2 + 5.0 + 39.9 + 16.8 + 40.9 + 56.7 + 8.8 + 2.1 + 5.1 + 8.1 + 34.9 + 13.0 + 8.4 + 129.0 + 29.2 + 4.0 + 74.9 + 60.4 + 8.3 + 48.3 + 97.4 + 36.2 + 148.9 + 140.7 + 63.1 + 95.8 + 19.1 + 52.8 + 21.4 + 2.0 + 28.8 + 58.9 + 11.8 + 24.5 + 98.2 + 104.5 + 56.8 + 9.2 + 3.2 + 13.6 + 4.0 + 56.1 + 36.5 + 11.1 + 15.0 + 29.5 + 28.4 + 3.5 + 50.2 + 31.6 + 12.9 + 229.3 + 5.9 + 15.0 + 26.6 + 41.2 + 107.0 + 22.3 + 16.3 + 26.1 + 55.3 + 14.8 + 151.1 + 3.6 + 55.8 + 6.3 + 3.9 + 8.4 + 34.1 + 2.0 + 31.6 + 20.8 + 40.9 + 78.9 + 9.3 + 52.5 + 13.4 + 19.1 + 5.1 + 33.7 + 175.2 + 150.6 + 5.4 + 51.3 + 20.8 + 44.1 + 32.4 + 52.1 + 84.9 + 30.3 + 11.5 + 7.0 + 0.5 + 46.8 + 63.3 + 54.3 + 8.2 + 34.9 + 17.8 + 22.8 + 28.6 + 31.6 + 43.5 + 12.9 + 58.2 + 130.2 + 61.0 + 2.4 + 129.6 + 91.4 + 11.5 + 21.0 + 33.6 + 10.6 + 14.6 + 294.7 + 20.3 + 4.6 + 16.9 + 107.2 + 15.3 + 24.9 + 38.6.

After calculating the sum, we divide by the total number of game entries (all the individual data points you've provided).

Since exact calculations and summing such a long list of numbers require precision that might not be error-free when manually done or visualized, I'll provide you with a conceptual way to approach this calculation using programming (e.g., Python).

Assuming the data is in a CSV format and each play hour is a line item in the "playhours" column, you can use the following pseudo-code to calculate the average:

```python
import pandas as pd

# Assuming `data.csv` is your file and it has the structure you've provided
df = pd.read_csv('data.csv')

average_play_hours = df['playhours'].mean()
print(average_play_hours)
```

This code reads the dataset, calculates the mean (average) of the "playhours" column, and prints the result.

Given the extensive list, precise manual calculation is prone to error, and utilizing a script ensures accuracy. Running the above script [or performing a manual calculation, if preferred, taking care to sum all these numbers and then divide by their count (the number of entries)] will provide the accurate average play hours for "Lethal Companies". 

If you require the exact number, please consider executing the code with your dataset in a Python environment.

---

In [95]:
# Ask the LLM for Craftopia's average play hours. select_best_source chooses
# which of the prepared `sources` is handed to the model as context
# (presumably the frame reported in the "Selected Source" output line).
question = """
What is the average play hours craftopia? 
"""
context = select_best_source(question, sources)

# The prompt tells the model to answer only from the supplied context and to
# reply 'I don't know' otherwise.
full = f"Based on this:'{context}', answer this: {question}, if the information is not from context, say 'I don't know'"

msg = get_resp_oai(full, "gpt-4-0125-preview")

# NOTE(review): the average in the answer is worked out by the model from the
# context text, not by Spark/pandas -- cross-check it against a real
# aggregation before relying on the number.
# The wrapper <div> is closed explicitly so the colour styling stays scoped to
# this answer; the blank lines keep `msg` rendered as Markdown inside it.
display(Markdown("<div style='color: #34568B;'>\n\n" + msg + "\n\n</div>"))

Selected Source: df3


<div style='color: #34568B;'>

---

To find the average play hours for Craftopia from the provided data, we can sum up the total hours played and divide it by the number of reviews given. Let's calculate it accordingly:

- Sum of play hours: 0.9 + 63.7 + 35.0 + 22.7 + 3.3 + 79.9 + 10.8 + 57.9 + 14.0 + 1.3 + 69.0 + 14.6 + 32.4 + 0.3 + 124.4 + 135.4 + 1.0 + 8.1 + 10.2 + 7.4 + 59.0 + 0.6 + 16.6 + 52.7 + 58.1 + 8.5 + 69.2 + 50.0 + 12.1 + 24.4 + 73.7 + 17.0 + 21.5 + 30.4 + 22.2 + 6.8 + 27.5 + 43.5 + 20.3 + 5.5 + 20.3 + 55.5 + 3.4 + 23.7 + 233.4 + 62.8 + 62.0 + 221.4 + 87.6 + 0.6 + 73.8 + 0.3 + 49.0 + 3.7 + 7.0 + 53.8 + 17.9 + 77.1 + 14.9 + 35.1 + 5.6 + 183.3 + 7.4 + 16.9 + 74.2 + 1.0 + 205.0 + 21.1 + 8.6 + 73.4 + 1.1 + 57.8 + 0.3 + 9.5 + 11.0 + 8.0 + 235.0 + 1.2 + 20.8 + 108.9 + 224.2 + 17.3 + 44.1 + 45.0 + 45.7 + 19.5 + 7.1 + 112.3 + 75.7 + 65.3 + 13.6 + 6.6 + 27.2 + 8.5 + 31.8 = 4763.5 hours
- Number of entries: 157

Average play hours = Total play hours / Number of entries = 4763.5 / 157 = Approximately 30.34 hours

Therefore, the average play hours for Craftopia, based on the provided data, is approximately 30.34 hours.

---

In [96]:
# Ask the LLM for Palworld's average play hours. select_best_source chooses
# which of the prepared `sources` is handed to the model as context
# (presumably the frame reported in the "Selected Source" output line).
question = """
What is the average play hours palworld?
"""
context = select_best_source(question, sources)

# The prompt tells the model to answer only from the supplied context and to
# reply 'I don't know' otherwise.
full = f"Based on this:'{context}', answer this: {question}, if the information is not from context, say 'I don't know'"

msg = get_resp_oai(full, "gpt-4-0125-preview")

# NOTE(review): the model cannot execute code, so it may return a snippet
# instead of a number -- compute the average with Spark/pandas if an exact
# value is needed.
# The wrapper <div> is closed explicitly so the colour styling stays scoped to
# this answer; the blank lines keep `msg` rendered as Markdown inside it.
display(Markdown("<div style='color: #34568B;'>\n\n" + msg + "\n\n</div>"))

Selected Source: df2


<div style='color: #34568B;'>

---

To find the average play hours for Palworld, we'll need to calculate the total hours played and divide that by the number of entries. Let's perform this calculation. 

Given the data:
- **Total Entries**: Count of all play hour entries.
- **Sum of Play Hours**: Total play hours across all entries.


```python
# Assuming each line in the given data represents an entry and "playhours" is the second value on each line.

play_hours = [
    90.3, 98.0, 252.7, 7.0, 219.1, 19.5, 7.6, 195.1, 190.8, 33.4, 27.4, 122.1,
    14.6, 24.4, 79.7, 25.0, 47.7, 56.3, 40.6, 12.5, 72.2, 38.1, 14.1, 5.1, 0.7,
    55.3, 8.9, 5.9, 50.1, 52.1, 64.0, 5.0, 182.4, 26.2, 72.7, 36.9, 11.1, 10.6,
    261.1, 60.6, 53.0, 49.7, 79.9, 794.0, 10.1, 63.4, 37.0, 18.5, 6.5, 49.9,
    74.0, 9.4, 61.9, 17.4, 7.4, 76.4, 85.9, 26.8, 31.8, 37.3, 55.3, 309.1, 5.7,
    46.4, 49.6, 97.3, 221.0, 121.8, 57.6, 58.6, 207.6, 130.1, 14.7, 195.5, 116.1,
    37.9, 27.3, 212.3, 26.5, 69.4, 76.1, 25.9, 37.9, 4.9, 46.6, 210.4, 13.2,
    104.7, 34.5, 25.2, 10.0, 85.2, 32.1, 88.3, 334.1, 42.5, 90.0, 26.5, 10.2,
    116.0, 8.3, 31.3, 47.1, 97.0, 156.0, 4.9, 5.0, 103.2, 137.3, 108.8, 16.2, 5.5,
    109.0, 99.5, 35.6, 39.9, 129.5, 69.9, 47.9, 25.8, 71.1, 17.3, 131.8, 55.3,
    121.9, 99.1, 68.1, 82.6, 21.1, 51.5, 17.1, 21.1, 210.6, 94.9, 45.9, 18.4,
    176.3, 11.0, 64.7, 119.6, 45.6, 124.3, 12.5, 60.0, 43.7, 84.4, 13.7, 71.1,
    17.8, 26.5
]

total_play_hours = sum(play_hours)
average_play_hours = total_play_hours / len(play_hours)

average_play_hours
```

Since I can't execute the code, please run this Python code in a Python environment (like a Jupyter notebook, Python interpreter, etc.) to calculate the average play hours for Palworld based on the given dataset.

---

## Counting total 'Recommended' and 'Not Recommended' reviews and the average review length per game

In [104]:
# Ask the LLM for per-game review statistics: average review length plus the
# recommended and not-recommended counts.
# The original question listed `total_recommend` twice; the second field is
# meant to be the not-recommended count.
# NOTE(review): `total_not_recommend` is assumed to match the column name in
# the source frame -- confirm against the aggregated DataFrame.
question = """
Can you list to me the average_review_length, total_recommend, and total_not_recommend for each craftopia, palworld and lethal companies?
"""
context = select_best_source(question, sources)

# The prompt tells the model to answer only from the supplied context and to
# reply 'I don't know' otherwise.
full = f"Based on this:'{context}', answer this: {question}, if the information is not from context, say 'I don't know'"

msg = get_resp_oai(full, "gpt-4-0125-preview")

# The wrapper <div> is closed explicitly so the colour styling stays scoped to
# this answer; the blank lines keep `msg` rendered as Markdown inside it.
display(Markdown("<div style='color: #34568B;'>\n\n" + msg + "\n\n</div>"))

Selected Source: df6


<div style='color: #34568B;'>

---

Based on the provided information, here are the requested details:

For **Craftopia**:
- **Average Review Length**: 304.02666666666664
- **Total Recommend**: 90
- **Total Not Recommend**: 60

For **Palworld**:
- **Average Review Length**: 110.18666666666667
- **Total Recommend**: 141
- **Total Not Recommend**: 9

For **Lethal Companies**:
- **Average Review Length**: 84.10666666666667
- **Total Recommend**: 142
- **Total Not Recommend**: 8

---

## Listing Developers and Their Games

In [109]:
# Ask the LLM which games carry a given set of tags, who developed them, and
# what else those developers (and the other developers) have published.
question = """
What are the games with tag of 'single-player, online co-op, lan co-op, steam achievements, steam workshop' 
and what are the developers, also list the other games that developed by the same developers. 
For other developer, mention the game name and several tags
"""
context = select_best_source(question, sources)

# The prompt tells the model to answer only from the supplied context and to
# reply 'I don't know' otherwise.
full = f"Based on this:'{context}', answer this: {question}, if the information is not from context, say 'I don't know'"

msg = get_resp_oai(full, "gpt-4-0125-preview")

# The wrapper <div> is closed explicitly so the colour styling stays scoped to
# this answer; the blank lines keep `msg` rendered as Markdown inside it.
display(Markdown("<div style='color: #34568B;'>\n\n" + msg + "\n\n</div>"))

Selected Source: df4


<div style='color: #34568B;'>

---

Based on the information provided:

- The game with tags 'single-player, online co-op, lan co-op, steam achievements, steam workshop' is **Craftopia**, developed by **Pocketpair**.
- Other games developed by Pocketpair listed in the provided data are **Craftopia** and **Palworld**.

For **Palworld** by Pocketpair, some tags include 'multiplayer, open world, survival, creature collector, crafting, coop, adventure, sandbox, third-person shooter, action, early access, rpg, anime, pve, indie, hack and slash'.

For **Craftopia**, it shares developers with **Palworld**, indicating Pocketpair focuses on crafting, multiplayer, open-world, and survival genres, among others, as seen in the tags and categories for both games.

For other developers mentioned, **Zeekerss** developed **Lethal Companies**, with tags like 'online coop, horror, first-person, coop, survival horror, psychological horror, exploration, sci-fi, funny, pve, atmospheric, procedural generation, aliens, time management, adventure, dungeon crawler, action, early access, action adventure, perma death'.

---