In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import numpy as np

### XHR request

The Steam review [page](https://steamcommunity.com/app/275850/reviews/?browsefilter=toprated&snr=1_5_100010_) is dynamic: it loads 10 more user reviews each time you scroll down.

By inspecting the __XHR (XMLHttpRequest)__ requests, I can identify how the request parameters change from one page to the next, then use BeautifulSoup to scrape the needed information.

In [None]:
APP_ID = 275850
REVIEWS_PER_PAGE = 10      # the page loads 10 reviews per XHR request
LAST_PAGE = 7952           # pages are 1-based; equivalent to range(1, 7953)
REQUEST_TIMEOUT = 30       # seconds; avoids hanging forever on a stalled connection
HOMECONTENT_URL = f'https://steamcommunity.com/app/{APP_ID}/homecontent'
REVIEW_COLUMNS = ['user', 'playtime', 'user_link', 'post_date',
                  'helpfulness', 'review', 'recommend']


def _parse_review_cards(html):
    """Extract one row per review card from a `homecontent` HTML fragment.

    Returns a DataFrame with the columns in REVIEW_COLUMNS. A page without
    any `apphub_Card` element yields an empty frame with the same columns.
    """
    soup = BeautifulSoup(html, 'html.parser')
    records = []
    for card in soup.find_all('div', {'class': 'apphub_Card'}):
        author = card.find('div', {'class': 'apphub_CardContentAuthorName'})
        hours = card.find('div', {'class': 'hours'})  # missing on some cards
        helpful = (card.find('div', {'class': 'found_helpful'})
                   .get_text(strip=True).split(' ')[0])

        # The first nested <div> of the text content holds the posting date.
        # Read it, then remove it so it does not leak into the review text.
        content = card.find('div', {'class': 'apphub_CardTextContent'})
        date_div = content.find('div')
        post_date = date_div.get_text(strip=True)
        date_div.decompose()

        records.append({
            'user': author.text,
            'playtime': np.nan if hours is None else hours.text.split(' ')[0],
            'user_link': author.find('a').attrs['href'],
            'post_date': post_date,
            # The helpfulness text starts with 'No' when nobody voted yet.
            'helpfulness': 0 if helpful == 'No' else helpful,
            'review': content.get_text('\n', strip=True),
            'recommend': card.find('div', {'class': 'title'}).text,
        })
    return pd.DataFrame(records, columns=REVIEW_COLUMNS)


page_frames = []
try:
    for i in range(1, LAST_PAGE + 1):
        # Every "*page" parameter advances together with the review offset,
        # mirroring what the browser sends while scrolling.
        response = requests.get(
            HOMECONTENT_URL,
            params={
                'userreviewsoffset': (i - 1) * REVIEWS_PER_PAGE,
                'p': i,
                'workshopitemspage': i,
                'readytouseitemspage': i,
                'mtxitemspage': i,
                'itemspage': i,
                'screenshotspage': i,
                'videospage': i,
                'artpage': i,
                'allguidepage': i,
                'webguidepage': i,
                'integeratedguidepage': i,
                'discussionspage': i,
                'numperpage': REVIEWS_PER_PAGE,
                'browsefilter': 'toprated',
                'appid': APP_ID,
                'appHubSubSection': 10,
                'l': 'english',
                'filterLanguage': 'default',
                'searchText': '',
                'forceanon': 1,
            },
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()

        # Parse the body we already have instead of re-requesting response.url.
        page_df = _parse_review_cards(response.text)
        if not page_df.empty:
            page_frames.append(page_df)
finally:
    # Build the result even if the scrape was interrupted, so the pages
    # fetched so far are not lost. One concat replaces the per-iteration
    # DataFrame.append (quadratic, and removed in pandas 2.0).
    if page_frames:
        review_df = pd.concat(page_frames, ignore_index=True)
    else:
        review_df = pd.DataFrame(columns=REVIEW_COLUMNS)

In [None]:
# Persist the scraped reviews to disk; the row index is not written out.
output_csv = 'No Mans Sky.csv'
review_df.to_csv(output_csv, index=False)

### Another try: Steam API

#### This does not work properly. It works only for the first hundred reviews and returns empty content for the rest.

In [None]:
def reviews_search(gameid, start_offset, filt='all', language='en', review_type='all',
                   purchase_type='all', num_per_page=20, cursor=None, timeout=30):
    """Fetch one page of reviews from the Steam store `appreviews` endpoint.

    Parameters
    ----------
    gameid : int or str
        Steam application id; interpolated into the request URL.
    start_offset : int
        Sent as the `start_offset` query parameter.
    filt, language, review_type, purchase_type : str
        Sent as the `filter`, `language`, `review_type` and `purchase_type`
        query parameters respectively.
    num_per_page : int
        Sent as the `num_per_page` query parameter.
    cursor : str, optional
        When given, sent as the `cursor` query parameter. Steam's public
        documentation describes `cursor` as the pagination token ("*" for the
        first batch, then the `cursor` value from the previous response).
        Omitted from the request when None, so existing callers are unaffected.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    url = 'https://store.steampowered.com/appreviews/' + str(gameid) + '?json=1'

    params = {
        'filter': filt,
        'start_offset': start_offset,
        'language': language,
        'review_type': review_type,
        'purchase_type': purchase_type,
        'num_per_page': num_per_page,
    }
    if cursor is not None:
        params['cursor'] = cursor

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()  # surface HTTP errors instead of parsing an error page

    return response.json()

In [None]:
num = 0
max_num = 80000
page_size = 20  # matches the default num_per_page of reviews_search

api_columns = ['review', 'date', 'helpfulness', 'playtime', 'recommend']
records = []
while num < max_num:
    result = reviews_search(275850, num)['reviews']
    if not result:
        # The API returned an empty page; requesting further offsets would
        # only issue thousands of pointless calls.
        break
    records.extend(
        {
            'review': item['review'],
            'date': item['timestamp_created'],
            'helpfulness': item['votes_up'],
            'playtime': item['author']['playtime_forever'],
            'recommend': item['voted_up'],
        }
        for item in result
    )
    num += page_size

# Build the frame once: DataFrame.append was removed in pandas 2.0 and growing
# a frame inside the loop is quadratic.
review_df = pd.DataFrame(records, columns=api_columns)