# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import re


In [4]:
# read data
# Load the negative Steam reviews CSV into a DataFrame.
# NOTE(review): the export cell near the end of this notebook writes to this
# exact file name, so after one full run this cell reads the notebook's own
# output instead of the original input — confirm that this is intended.
negative_reviews = pd.read_csv('steam_negative_reviews-ENGLISH-preprocessed.csv')

In [6]:
# function developed by Rafał Wójcik from https://towardsdatascience.com/unsupervised-sentiment-analysis-a38bf1906483
# with some minor changes

def clean_text(text):
    '''Lower-case a review and split it into a list of words.

    Method inspired by a method from eliorc's github repo:
    https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb

    Parameters
    ----------
    text : object
        The raw review. Non-string values are converted with ``str()``.
        ``None`` and float ``NaN`` (what pandas stores for a missing cell)
        are treated as an empty review.

    Returns
    -------
    list of str
        The lower-cased words of the review, in their original order. Every
        character other than an ASCII letter, an apostrophe or a caret acts
        as a word separator, so digits and other punctuation never appear in
        the result. An empty list is returned when nothing is left.
    '''
    # A missing review must not be stringified: str(float("nan")) is "nan"
    # and str(None) is "None", which would both survive as real words.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower()

    # Single pass: anything that is not a letter, "'" or "^" becomes a space.
    # This is equivalent to the former chain of substitutions (keep a wider
    # set, then blank out + , . ! ? : / and digits one by one), because
    # split() below ignores runs of whitespace anyway.
    # NOTE(review): the second "^" inside the class is a literal caret, so
    # "^" characters are kept in the words, exactly as before — confirm
    # whether that is intended.
    text = re.sub(r"[^A-Za-z^']", " ", text)

    # split() with no argument drops leading/trailing/repeated whitespace and
    # returns [] for an all-whitespace string.
    return text.split()


In [7]:
# Tokenise every review: clean_text turns each raw review into a list of words.
negative_reviews['review'] = negative_reviews['review'].apply(clean_text)

In [8]:
# Inspect the column: every review is now a list of word tokens.
negative_reviews['review']

0          [they, certainly, dumbed, down, the, series, s...
1          [terribly, bugs, keeps, kicking, me, out, to, ...
2          [while, there, is, a, lot, of, content, the, g...
3          [badbadbadbadbadbadbadbadbadbadbadbadbadbadbad...
4          [after, serious, consideration, i, think, this...
                                 ...                        
1050988    [as, of, day, of, release, system, ryzen, sli,...
1050989           [game, crashes, to, desktop, cannot, play]
1050990    [im, gonna, be, clear, cant, say, that, its, a...
1050991    [as, a, half, life, game, i, dunno, it, just, ...
1050992    [no, proper, smooth, locomotion, its, a, game,...
Name: review, Length: 1050993, dtype: object

> We noticed that some reviews contain meaningless words, because some reviewers express their opinion about the game by repeating a set of letters throughout the review...
for instance: badbadbadbadbadbadbadbadbadbadbadbadbadbadbad... 
so we decided to remove those patterns to avoid any sort of confusion.

In [9]:
# dealing with words with too many characters
# Longest token that is kept; anything longer is treated as a repeated-letter
# pattern rather than a real word (same cut-off as the former `len(i) < 27`).
MAX_WORD_LENGTH = 26

# Remove the over-long tokens from each list instead of swapping them for "".
# The "" placeholders turned into stray spaces when the tokens were joined, so
# a review made of several such tokens ended up as a whitespace-only string
# that a plain length check does not recognise as empty.
negative_reviews['review'] = negative_reviews['review'].apply(
    lambda words: [word for word in words if len(word) <= MAX_WORD_LENGTH]
)

> Since each review is now stored as a list of words, we need to convert it back so that every review is a single string.

In [11]:
# Glue each token list back together into one space-separated string.
negative_reviews['review'] = negative_reviews['review'].apply(' '.join)

In [12]:
# Inspect the column again: every review is a plain string once more.
negative_reviews['review']

0          they certainly dumbed down the series since th...
1          terribly bugs keeps kicking me out to home scr...
2          while there is a lot of content the game loves...
3                                                           
4          after serious consideration i think this game ...
                                 ...                        
1050988    as of day of release system ryzen sli gtx rift...
1050989                  game crashes to desktop cannot play
1050990    im gonna be clear cant say that its a bad game...
1050991    as a half life game i dunno it just doesn't fe...
1050992    no proper smooth locomotion its a game based a...
Name: review, Length: 1050993, dtype: object

In [13]:
# Show the whole frame; a review that lost all of its words during cleaning
# now has an empty 'review' value.
negative_reviews

Unnamed: 0,app_id,app_name,review_id,language,review,timestamp_created,timestamp_updated,recommended,votes_helpful,votes_funny,...,steam_purchase,received_for_free,written_during_early_access,author.steamid,author.num_games_owned,author.num_reviews,author.playtime_forever,author.playtime_last_two_weeks,author.playtime_at_review,author.last_played
0,292030,The Witcher 3: Wild Hunt,85155206,english,they certainly dumbed down the series since th...,1611337043,1611337043,False,0,0,...,True,False,False,76561198010955177,379,73,8966.0,0.0,8966.0,1.514417e+09
1,292030,The Witcher 3: Wild Hunt,85128129,english,terribly bugs keeps kicking me out to home scr...,1611299056,1611299056,False,0,0,...,True,False,False,76561198332696736,23,1,8565.0,4973.0,8442.0,1.611364e+09
2,292030,The Witcher 3: Wild Hunt,85109753,english,while there is a lot of content the game loves...,1611265004,1611265004,False,0,0,...,True,False,False,76561198098346837,507,47,2081.0,0.0,2081.0,1.606027e+09
3,292030,The Witcher 3: Wild Hunt,85088505,english,,1611235321,1611235321,False,1,0,...,True,False,False,76561199013170832,40,15,1079.0,0.0,1079.0,1.597714e+09
4,292030,The Witcher 3: Wild Hunt,85064415,english,after serious consideration i think this game ...,1611195190,1611195190,False,1,1,...,True,False,False,76561198085431842,206,65,2031.0,0.0,2031.0,1.594446e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050988,546560,Half-Life: Alyx,65647678,english,as of day of release system ryzen sli gtx rift...,1584989199,1584989199,False,8,6,...,True,False,False,76561198099785821,252,16,174.0,0.0,29.0,1.586467e+09
1050989,546560,Half-Life: Alyx,65647057,english,game crashes to desktop cannot play,1584988581,1584988581,False,7,4,...,True,False,False,76561198895222898,36,3,47.0,0.0,34.0,1.585013e+09
1050990,546560,Half-Life: Alyx,65647040,english,im gonna be clear cant say that its a bad game...,1584988565,1584988565,False,45,14,...,True,False,False,76561197984147634,327,2,39.0,0.0,39.0,1.584987e+09
1050991,546560,Half-Life: Alyx,65646598,english,as a half life game i dunno it just doesn't fe...,1584988151,1585135046,False,12,1,...,False,False,False,76561197960598489,490,86,1089.0,0.0,22.0,1.596740e+09


> We decided to replace the empty reviews (which resulted from the cleaning process) with None values so that we can easily drop them later on.

In [14]:
# Mark reviews without any content as missing (None) so they can be dropped.
# strip() makes whitespace-only reviews count as empty too; a bare length
# check only catches the exact empty string "".
negative_reviews["review"] = negative_reviews["review"].apply(
    lambda text: text if text.strip() else None
)

In [15]:
# Show the frame after the empty reviews were replaced with None.
negative_reviews

Unnamed: 0,app_id,app_name,review_id,language,review,timestamp_created,timestamp_updated,recommended,votes_helpful,votes_funny,...,steam_purchase,received_for_free,written_during_early_access,author.steamid,author.num_games_owned,author.num_reviews,author.playtime_forever,author.playtime_last_two_weeks,author.playtime_at_review,author.last_played
0,292030,The Witcher 3: Wild Hunt,85155206,english,they certainly dumbed down the series since th...,1611337043,1611337043,False,0,0,...,True,False,False,76561198010955177,379,73,8966.0,0.0,8966.0,1.514417e+09
1,292030,The Witcher 3: Wild Hunt,85128129,english,terribly bugs keeps kicking me out to home scr...,1611299056,1611299056,False,0,0,...,True,False,False,76561198332696736,23,1,8565.0,4973.0,8442.0,1.611364e+09
2,292030,The Witcher 3: Wild Hunt,85109753,english,while there is a lot of content the game loves...,1611265004,1611265004,False,0,0,...,True,False,False,76561198098346837,507,47,2081.0,0.0,2081.0,1.606027e+09
3,292030,The Witcher 3: Wild Hunt,85088505,english,,1611235321,1611235321,False,1,0,...,True,False,False,76561199013170832,40,15,1079.0,0.0,1079.0,1.597714e+09
4,292030,The Witcher 3: Wild Hunt,85064415,english,after serious consideration i think this game ...,1611195190,1611195190,False,1,1,...,True,False,False,76561198085431842,206,65,2031.0,0.0,2031.0,1.594446e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050988,546560,Half-Life: Alyx,65647678,english,as of day of release system ryzen sli gtx rift...,1584989199,1584989199,False,8,6,...,True,False,False,76561198099785821,252,16,174.0,0.0,29.0,1.586467e+09
1050989,546560,Half-Life: Alyx,65647057,english,game crashes to desktop cannot play,1584988581,1584988581,False,7,4,...,True,False,False,76561198895222898,36,3,47.0,0.0,34.0,1.585013e+09
1050990,546560,Half-Life: Alyx,65647040,english,im gonna be clear cant say that its a bad game...,1584988565,1584988565,False,45,14,...,True,False,False,76561197984147634,327,2,39.0,0.0,39.0,1.584987e+09
1050991,546560,Half-Life: Alyx,65646598,english,as a half life game i dunno it just doesn't fe...,1584988151,1585135046,False,12,1,...,False,False,False,76561197960598489,490,86,1089.0,0.0,22.0,1.596740e+09


In [16]:
# Drop only the rows whose review is missing. A bare dropna() would also
# discard rows that have a usable review but a NaN in any other column.
negative_reviews = negative_reviews.dropna(subset=["review"])

In [17]:
# Show the frame after dropping; the original index labels are kept, so
# removed rows appear as gaps in the index.
negative_reviews

Unnamed: 0,app_id,app_name,review_id,language,review,timestamp_created,timestamp_updated,recommended,votes_helpful,votes_funny,...,steam_purchase,received_for_free,written_during_early_access,author.steamid,author.num_games_owned,author.num_reviews,author.playtime_forever,author.playtime_last_two_weeks,author.playtime_at_review,author.last_played
0,292030,The Witcher 3: Wild Hunt,85155206,english,they certainly dumbed down the series since th...,1611337043,1611337043,False,0,0,...,True,False,False,76561198010955177,379,73,8966.0,0.0,8966.0,1.514417e+09
1,292030,The Witcher 3: Wild Hunt,85128129,english,terribly bugs keeps kicking me out to home scr...,1611299056,1611299056,False,0,0,...,True,False,False,76561198332696736,23,1,8565.0,4973.0,8442.0,1.611364e+09
2,292030,The Witcher 3: Wild Hunt,85109753,english,while there is a lot of content the game loves...,1611265004,1611265004,False,0,0,...,True,False,False,76561198098346837,507,47,2081.0,0.0,2081.0,1.606027e+09
4,292030,The Witcher 3: Wild Hunt,85064415,english,after serious consideration i think this game ...,1611195190,1611195190,False,1,1,...,True,False,False,76561198085431842,206,65,2031.0,0.0,2031.0,1.594446e+09
5,292030,The Witcher 3: Wild Hunt,85044205,english,terrible map design and function meh combat an...,1611164410,1611164410,False,0,0,...,True,False,False,76561198341528559,48,1,861.0,861.0,861.0,1.611164e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050988,546560,Half-Life: Alyx,65647678,english,as of day of release system ryzen sli gtx rift...,1584989199,1584989199,False,8,6,...,True,False,False,76561198099785821,252,16,174.0,0.0,29.0,1.586467e+09
1050989,546560,Half-Life: Alyx,65647057,english,game crashes to desktop cannot play,1584988581,1584988581,False,7,4,...,True,False,False,76561198895222898,36,3,47.0,0.0,34.0,1.585013e+09
1050990,546560,Half-Life: Alyx,65647040,english,im gonna be clear cant say that its a bad game...,1584988565,1584988565,False,45,14,...,True,False,False,76561197984147634,327,2,39.0,0.0,39.0,1.584987e+09
1050991,546560,Half-Life: Alyx,65646598,english,as a half life game i dunno it just doesn't fe...,1584988151,1585135046,False,12,1,...,False,False,False,76561197960598489,490,86,1089.0,0.0,22.0,1.596740e+09


In [18]:
# Export to csv
# index=False leaves the DataFrame index out of the written file.
# NOTE(review): this is the same file name that the notebook reads in its
# first data cell, so this call overwrites the notebook's own input —
# confirm that this is intended.
negative_reviews.to_csv("steam_negative_reviews-ENGLISH-preprocessed.csv", index=False)

In [None]:
# pickle df
import pickle

# The mode "wb" belongs to open(), not to pickle.dump(): previously the file
# was opened in read mode and "wb" was passed as the pickle protocol, so the
# cell raised instead of writing. The with-block also closes the handle.
with open("negative_reviews.pickle", "wb") as pickle_file:
    pickle.dump(negative_reviews, pickle_file)