In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the games table from the CSV file under Data/.
data = pd.read_csv('Data/games.csv')
# Wrap the parsed table in a separate DataFrame object that the rest of the notebook works on.
df = pd.DataFrame(data)
# List the available column names.
print(df.columns.values)

['AppID' 'Name' 'Release date' 'Estimated owners' 'Peak CCU'
 'Required age' 'Price' 'DLC count' 'About the game' 'Supported languages'
 'Full audio languages' 'Reviews' 'Header image' 'Website' 'Support url'
 'Support email' 'Windows' 'Mac' 'Linux' 'Metacritic score'
 'Metacritic url' 'User score' 'Positive' 'Negative' 'Score rank'
 'Achievements' 'Recommendations' 'Notes' 'Average playtime forever'
 'Average playtime two weeks' 'Median playtime forever'
 'Median playtime two weeks' 'Developers' 'Publishers' 'Categories'
 'Genres' 'Tags' 'Screenshots' 'Movies']


In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()

# Score every non-null entry of df['Reviews'] with VADER and store the
# "pos" / "neu" / "neg" components in three new columns.
# Rows without review text keep the empty-string placeholder.
df['Review_pos'] = ''
df['Review_neu'] = ''
df['Review_neg'] = ''

# dropna() skips the missing reviews up front; .items() yields the index label
# together with the text, so each row is looked up only once.
for i, review_text in df['Reviews'].dropna().items():
    # One analyzer call per review; all three components come from the same result.
    scores = sentiment.polarity_scores(review_text)
    df.loc[i, 'Review_pos'] = scores["pos"]
    df.loc[i, 'Review_neu'] = scores["neu"]
    df.loc[i, 'Review_neg'] = scores["neg"]

In [4]:
# Inspect the "pos" sentiment column; rows whose 'Reviews' entry is null still hold the '' placeholder.
df['Review_pos']

0         
1         
2         
3         
4         
        ..
71711     
71712     
71713     
71714     
71715     
Name: Review_pos, Length: 71716, dtype: object

In [5]:
# Inspect the "neu" sentiment column; rows whose 'Reviews' entry is null still hold the '' placeholder.
df['Review_neu']

0         
1         
2         
3         
4         
        ..
71711     
71712     
71713     
71714     
71715     
Name: Review_neu, Length: 71716, dtype: object

In [6]:
# Inspect the "neg" sentiment column; rows whose 'Reviews' entry is null still hold the '' placeholder.
df['Review_neg']

0         
1         
2         
3         
4         
        ..
71711     
71712     
71713     
71714     
71715     
Name: Review_neg, Length: 71716, dtype: object

In [7]:
# Build a candidate Metacritic URL for every game: the name is converted to
# text, spaces become hyphens, the result is lower-cased and placed between
# the site prefix and a trailing slash.
# Vectorised string operations replace the former per-row Python loop.
# NOTE(review): characters other than spaces (e.g. "™", apostrophes) are kept
# as-is, so some generated URLs may not match a real page -- confirm the
# site's slug rules before relying on these links.
df['Links'] = (
    'https://www.metacritic.com/game/'
    + df['Name'].map(str).str.replace(' ', '-', regex=False).str.lower()
    + '/'
)

df['Links']

0        https://www.metacritic.com/game/galactic-bowling/
1            https://www.metacritic.com/game/train-bandit/
2            https://www.metacritic.com/game/jolt-project/
3                https://www.metacritic.com/game/henosis™/
4        https://www.metacritic.com/game/two-weeks-in-p...
                               ...                        
71711                https://www.metacritic.com/game/sur5/
71712       https://www.metacritic.com/game/prison-life-2/
71713    https://www.metacritic.com/game/architecture-z...
71714    https://www.metacritic.com/game/girl's-way-to-...
71715        https://www.metacritic.com/game/hentai-ariel/
Name: Links, Length: 71716, dtype: object

In [8]:
import requests
from bs4 import BeautifulSoup

# Download and parse the page behind the first generated link.
url = df['Links'][0]

# A browser-like User-agent header is sent with the request
# (presumably the site rejects the library's default one -- TODO confirm).
user_agent = {'User-agent': 'Mozilla/5.0'}
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(url, headers = user_agent, timeout = 10)

soup = BeautifulSoup(response.text, 'html.parser')

In [10]:
# Echo the fetched URL, then gather every <span> nested inside the
# critic-score badge <div> of the parsed page.
print(url)
spans = (
    soup
    .find('div', class_='c-siteReviewScore_background c-siteReviewScore_background-critic_medium')
    .find_all('span')
)
print(spans)

def find_between(s, first, last):
    """Return the text of ``s`` enclosed by two markers.

    The result starts right after the first occurrence of ``first`` and stops
    just before the next occurrence of ``last`` that follows it. An empty
    string is returned when either marker cannot be found.
    """
    opening = s.find(first)
    if opening == -1:
        return ""

    begin = opening + len(first)
    closing = s.find(last, begin)
    if closing == -1:
        return ""

    return s[begin:closing]

# Print only the text between ">" and "</" of the first span's markup.
print(find_between(str(spans[0]), ">", "</" ))

https://www.metacritic.com/game/galactic-bowling/
[<span data-v-4cdca868="">tbd</span>]
tbd


In [16]:
# Scrape the critic-score badge text from every generated link into df['mscores'].
#   - ''   : page could not be downloaded (row is still unfetched)
#   - 'na' : page downloaded but no score badge / span was found
#   - else : the text inside the first <span> of the badge
df['mscores'] = ''

# Loop-invariant request header, built once instead of once per row.
user_agent = {'User-agent': 'Mozilla/5.0'}

# A single Session reuses the underlying connection across the many requests.
with requests.Session() as session:
    session.headers.update(user_agent)

    for i in range(len(df['Links'])):
        try:
            # The timeout keeps one stalled request from blocking the whole run.
            response = session.get(df['Links'][i], timeout = 10)
        except requests.RequestException:
            # Network failure: leave the '' placeholder and move on, so one
            # bad request does not abort the scrape.
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        score_box = soup.find('div', class_='c-siteReviewScore_background c-siteReviewScore_background-critic_medium')
        # Explicit checks replace the former bare `except:`, which also
        # swallowed unrelated errors; an empty span list no longer raises.
        spans = score_box.find_all('span') if score_box is not None else []
        if spans:
            df.loc[i, 'mscores'] = find_between(str(spans[0]), ">", "</" )
        else:
            df.loc[i, 'mscores'] = 'na'

df['mscores']

KeyboardInterrupt: 

In [17]:
# Tally the rows whose 'mscores' entry equals the empty string.
count = int((df['mscores'] == '').sum())
print(count)

55696


In [6]:
# Collect user reviews (author name, date, rating, text) from the paginated
# Pokemon Sword user-review listing and assemble them into a DataFrame.
review_dict = {'name':[], 'date':[], 'rating':[], 'review':[]}

# Loop-invariant request header, built once instead of once per page.
user_agent = {'User-agent': 'Mozilla/5.0'}

for page in range(0,23): #Remember to update the number of pages
    url = 'https://www.metacritic.com/game/switch/pokemon-sword/user-reviews?page='+str(page)
    # The timeout keeps a stalled connection from hanging the kernel indefinitely.
    # NOTE(review): requests are sent back-to-back; consider a short pause
    # between pages (would need `import time`).
    response = requests.get(url, headers = user_agent, timeout = 10)
    soup = BeautifulSoup(response.text, 'html.parser')

    for review in soup.find_all('div', class_='review_content'):
        name_div = review.find('div', class_='name')
        if name_div is None:
            # Stops processing the rest of this page at the first block
            # without a name. NOTE(review): confirm `break` (not `continue`)
            # is the intended behaviour.
            break

        # Resolve every field before appending so the four lists cannot end
        # up with different lengths if one lookup fails.
        name = name_div.find('a').text
        date = review.find('div', class_='date').text
        rating = review.find('div', class_='review_grade').find_all('div')[0].text

        # Use the expanded blurb when the page provides one, otherwise the
        # first <span> of the review body.
        expanded = review.find('span', class_='blurb blurb_expanded')
        if expanded is not None:
            text = expanded.text
        else:
            text = review.find('div', class_='review_body').find('span').text

        review_dict['name'].append(name)
        review_dict['date'].append(date)
        review_dict['rating'].append(rating)
        review_dict['review'].append(text)

sword_reviews = pd.DataFrame(review_dict)