In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import re
import nltk
import os
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error
import pickle
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LassoCV
from wordcloud import WordCloud
from myFunction import *

plt.style.use('ggplot')

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

In [8]:
# Data loading
media_df = pd.read_csv("data/myMediaReviews.csv")
user_df = pd.read_csv("data/myUserReviews.csv")
media_df.dropna(subset=["Snippet"], inplace=True)
user_df.dropna(subset=["Review"], inplace=True)
user_df.drop(columns=["Page"], inplace=True)
media_df.drop(columns=["OpenCritic URL","Description","Release Date","Review Title","Published Date","Review URL","Language"], inplace=True)
user_df.head()

Unnamed: 0,Game Name,Review,Rating
0,Farming Simulator 15,Did not make it to the stage in the game where...,70
1,Farming Simulator 15,got lost in the forest with a chainsaw and pre...,80
2,Farming Simulator 15,Screen went grey and nothing happened whenever...,10
3,Farming Simulator 15,"Farming Sim 15 is not a bad game, I just hate ...",50
4,Farming Simulator 15,I single out this Farming Simulator game for t...,80


In [9]:
# Data Cleaning -select common games
media_games = media_df["Game"].unique()
user_games = user_df["Game Name"].unique()
common_games = set(media_games) & set(user_games)
media_df_filtered = media_df[media_df["Game"].isin(user_games)]
media_df_filtered["Game"].nunique()
media_df_filtered.drop(columns=["Tier"], inplace=True)
print("Common games: ", len(common_games))
print("Media games: ", media_df_filtered["Game"].nunique())

Common games:  3732
Media games:  3732


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  media_df_filtered.drop(columns=["Tier"], inplace=True)


In [10]:
media_df_filtered.head(5)

Unnamed: 0,Game,Genres,Platforms,Reviewer (Outlet),Snippet,Score,Median Score,Top Critic Score,Percent Recommended
0,Farming Simulator 15,"Simulation, Vehicle Combat","PlayStation 4, Xbox One, PC, Xbox Series X/S, ...",Gaming Nexus,Farming Simulator 15 is a great addition to th...,85.0,60.0,57.142857,10.344828
1,Farming Simulator 15,"Simulation, Vehicle Combat","PlayStation 4, Xbox One, PC, Xbox Series X/S, ...",Game Debate,I really like Farming Simulator 2015. It's not...,85.0,60.0,57.142857,10.344828
2,Farming Simulator 15,"Simulation, Vehicle Combat","PlayStation 4, Xbox One, PC, Xbox Series X/S, ...",ZTGD,"Farming Simulator is king at what it does, so ...",80.0,60.0,57.142857,10.344828
3,Farming Simulator 15,"Simulation, Vehicle Combat","PlayStation 4, Xbox One, PC, Xbox Series X/S, ...",PlayStation LifeStyle,If you know you'd like a farming simulator gam...,70.0,60.0,57.142857,10.344828
4,Farming Simulator 15,"Simulation, Vehicle Combat","PlayStation 4, Xbox One, PC, Xbox Series X/S, ...",Digital Chumps,I'm not convinced that this game does not have...,70.0,60.0,57.142857,10.344828


# Remove short or meaningless user review (Media review is more formal that won't happen)

In [11]:
user_df_filtered = user_df[user_df["Review"].apply(is_meaningful_review)]

# Filtering based on review number

In [14]:
# Media reviews more than 50
n = 50
game_counts = media_df_filtered["Game"].value_counts()
games_with_n_or_more_reviews = game_counts[game_counts >= n].index
media_df_filtered_nplus = media_df_filtered[media_df_filtered["Game"].isin(games_with_n_or_more_reviews)]
media_df_filtered_nplus.nunique()

Game                     726
Genres                   158
Platforms                243
Reviewer (Outlet)        512
Snippet                55769
Score                     92
Median Score              56
Top Critic Score         721
Percent Recommended      639
dtype: int64

In [15]:
# User reviews more than 50
n = 100
game_counts = user_df_filtered["Game Name"].value_counts()
games_with_n_or_more_reviews = game_counts[game_counts >= n].index
user_df_filtered_nplus = user_df_filtered[user_df_filtered["Game Name"].isin(games_with_n_or_more_reviews)]
user_df_filtered_nplus.nunique()

Game Name       794
Review       365533
Rating           10
dtype: int64

In [16]:
media_games = media_df_filtered_nplus["Game"].unique()
user_games = user_df_filtered_nplus["Game Name"].unique()
common_games = set(media_games) & set(user_games)
print("Common games: ", len(common_games))

Common games:  456


In [17]:
media_games = media_df_filtered_nplus["Game"].unique()
user_games = user_df_filtered_nplus["Game Name"].unique()

media_df_filtered_nplus = media_df_filtered_nplus[media_df_filtered_nplus["Game"].isin(common_games)]
media_df_filtered_nplus["Game"].nunique()

user_df_filtered_nplus = user_df_filtered_nplus[user_df_filtered_nplus["Game Name"].isin(common_games)]
user_df_filtered_nplus["Game Name"].nunique()

print(media_df_filtered_nplus["Game"].nunique())
print(user_df_filtered_nplus["Game Name"].nunique())

456
456


In [18]:
media_df_filtered_nplus.reset_index(drop=True, inplace=True)
user_df_filtered_nplus.reset_index(drop=True, inplace=True)

In [20]:
print(media_df_filtered_nplus.shape)
print(user_df_filtered_nplus.shape)

(38342, 9)
(254399, 3)


In [19]:
media_df_filtered_nplus.to_csv("./media_df_filtered_nplus.csv")
user_df_filtered_nplus.to_csv("./user_df_filtered_nplus.csv")