# Metacritic Comments and Scores
This dataset is a web scrape of Metacritic user comments and reviews, published on kaggle.com by the user Dahlia. Metacritic is a review aggregator for movies, TV shows, music albums, and video games.

Source: https://www.kaggle.com/dahlia25/metacritic-video-game-comments, accessed on July 26th, 2020.

The web scraper can be found at https://github.com/dahlia25/game_recommender, and uses Selenium and BeautifulSoup.

The data span the years 1998 through 2018.

## The Dataset
Number of Rows | Number of Columns | One Row Represents
------------- | ------------- | -------------
283,983 | 5 | One user review

In [None]:
#Import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

try:
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    print("Module 'wordcloud' is installed")
except ModuleNotFoundError:
    %pip install wordcloud
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

try:
    import nltk
    print("Module 'nltk' is installed")
except ModuleNotFoundError:
    %pip install nltk
    import nltk
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet
from nltk.sentiment import SentimentIntensityAnalyzer

import re, string

import random

In [None]:
def remove_noise(tweet_tokens, stop_words = ()):
    """Clean, lemmatize and lowercase one tokenized text.

    Each token is POS-tagged, stripped of http(s) hyperlinks and Twitter
    ``@`` mentions, and lemmatized with WordNet using a part of speech
    derived from its tag (``NN*`` -> noun, ``VB*`` -> verb, ``JJ*`` ->
    adjective, anything else -> adverb).

    Args:
        tweet_tokens: Sequence of string tokens for a single text.
        stop_words: Iterable of lowercase words to drop. Defaults to none.

    Returns:
        A list of lowercased, lemmatized tokens, excluding empty tokens,
        punctuation and stop words.
    """
    # Raw strings keep the regex escapes (\/, \., \+) intact without relying
    # on Python passing unknown string escapes through, which now emits a
    # SyntaxWarning.
    url_pattern = re.compile(
        r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
    )
    mention_pattern = re.compile(r"(@[A-Za-z0-9_]+)")

    # Built once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    stop_word_set = set(stop_words)  # O(1) membership instead of a list scan

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = url_pattern.sub("", token)      # Remove hyperlinks of http[s] variety
        token = mention_pattern.sub("", token)  # Remove Twitter @'s

        if tag.startswith("NN"):
            pos = wordnet.NOUN
        elif tag.startswith("VB"):
            pos = wordnet.VERB
        elif tag.startswith("JJ"):
            pos = wordnet.ADJ
        else:
            pos = wordnet.ADV

        token = lemmatizer.lemmatize(token, pos)

        # `in string.punctuation` is a substring test, so it also rejects
        # short runs of punctuation that appear contiguously in that string.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_word_set:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

def get_all_words(cleaned_tokens_list):
    """Yield every token from every token list, in order.

    Args:
        cleaned_tokens_list: Iterable of token iterables (one per text).

    Yields:
        Each token of each inner iterable, flattened into a single stream.
    """
    for comment_tokens in cleaned_tokens_list:
        yield from comment_tokens

def get_tweets_for_model(cleaned_tokens_list):
    """Yield one ``{token: True}`` feature dict per token list.

    Args:
        cleaned_tokens_list: Iterable of token iterables (one per text).

    Yields:
        A dict mapping each distinct token of a text to ``True``.
    """
    for comment_tokens in cleaned_tokens_list:
        yield dict.fromkeys(comment_tokens, True)

def is_positive(token):
    """Return True when the VADER compound score of ``token`` is above zero.

    Relies on the module-level ``sia`` analyzer, which must have been
    created before this function is called.
    """
    scores = sia.polarity_scores(token)
    return scores["compound"] > 0


In [None]:
# Fetch the NLTK resources this notebook relies on.
nltk.download("twitter_samples")             # sample tweets corpus (nltk.corpus.twitter_samples)
nltk.download("punkt")                       # tokenizer models -- NOTE(review): not obviously needed by the code in this notebook; confirm
nltk.download('wordnet')                     # WordNet data used by WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # model behind pos_tag
nltk.download('stopwords')                   # word lists for stopwords.words(...)
nltk.download('vader_lexicon')               # lexicon for SentimentIntensityAnalyzer

In [None]:
# Load the N64 reviews, using the first CSV column as the DataFrame index.
# Presumably this is the Nintendo64 subset written by the commented-out export cell further down.
dfN64 = pd.read_csv("Datasets/n64.csv", index_col = 0)

In [None]:
# English stop-word list, later passed to remove_noise.
stop_words = stopwords.words('english')

tokenizer = TweetTokenizer()

#n64Text = dfN64["Comment"].str.cat(sep = " ")

# One token list per review comment, in DataFrame order.
n64Comments = [tokenizer.tokenize(comment) for comment in dfN64["Comment"]]

# Twitter sample corpus loading, currently disabled:
#positive_tweets = twitter_samples.strings("positive_tweets.json")
#negative_tweets = twitter_samples.strings("negative_tweets.json")
#text = twitter_samples.strings("tweets.20150430-223406.json")

#positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
#negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

# Preview the tokens of the first comment.
print(n64Comments[0])

In [None]:
# Show the (token, part-of-speech tag) pairs for the first comment.
print(pos_tag(n64Comments[0]))

In [None]:
# Clean every tokenized comment: strip links/mentions, lemmatize,
# lowercase, and drop punctuation and stop words.
cleanedN64Comments = [
    remove_noise(comment_tokens, stop_words) for comment_tokens in n64Comments
]

# Preview the cleaned tokens of the first comment.
print(cleanedN64Comments[0])

In [None]:
# Flatten the per-comment token lists into one list of all words.
allN64WordsGen = get_all_words(cleanedN64Comments)
allN64Words = list(allN64WordsGen)
#print(allN64Words)

In [None]:
def get_tweets_for_model(cleaned_tokens_list):
    """Yield one ``{token: True}`` feature dict per token list.

    Args:
        cleaned_tokens_list: Iterable of token iterables (one per text).

    Yields:
        A dict mapping each distinct token of a text to ``True``.
    """
    for comment_tokens in cleaned_tokens_list:
        yield dict.fromkeys(comment_tokens, True)

# Lazy generator of {token: True} feature dicts, one per cleaned comment.
n64ForModel = get_tweets_for_model(cleanedN64Comments)

sia = SentimentIntensityAnalyzer()

# Label every word occurrence by the sign of its VADER compound score.
# Each token is scored once and placed in exactly one bucket; the negative
# bucket uses a strict `< 0` so zero-score tokens are only ever "Neutral"
# instead of being labelled both "Negative" and "Neutral".
positiveN64 = []
negativeN64 = []
neutralN64 = []
for token in get_all_words(cleanedN64Comments):
    compound = sia.polarity_scores(token)["compound"]
    if compound > 0:
        positiveN64.append((token, "Positive"))
    elif compound < 0:
        negativeN64.append((token, "Negative"))
    else:
        neutralN64.append((token, "Neutral"))



#print(neutralN64[0])

In [None]:
# Combine the three labelled word lists into one dataset and shuffle it
# in place so the labels are interleaved.
dataset = positiveN64 + negativeN64 + neutralN64

random.shuffle(dataset)

# `dataset` is a plain list, which has no `.shape`; use len() for its size.
len(dataset)

#train_data = dataset[:7000]
#test_data = dataset[7000:]

In [None]:
#positiveFreqDist = FreqDist(allN64Words)
#print(positiveFreqDist.most_common(10))

In [None]:
# Score the whole cleaned corpus as a single space-joined string.
sia = SentimentIntensityAnalyzer()
allN64WordsString = " ".join(allN64Words)

# Last expression of the cell: the polarity score dict is shown as its output.
sia.polarity_scores(allN64WordsString)

In [None]:
# Build the cleaned Twitter corpora used to train the classifier. These
# lists were referenced here but never defined anywhere in the notebook,
# so the cell raised NameError.
cleanedPositiveTweetsList = [
    remove_noise(tokens, stop_words)
    for tokens in twitter_samples.tokenized("positive_tweets.json")
]
cleanedNegativeTweetsList = [
    remove_noise(tokens, stop_words)
    for tokens in twitter_samples.tokenized("negative_tweets.json")
]

# One {token: True} feature dict per tweet.
positive_tokens_for_model = get_tweets_for_model(cleanedPositiveTweetsList)
negative_tokens_for_model = get_tweets_for_model(cleanedNegativeTweetsList)

# Pair each feature dict with its label, as NaiveBayesClassifier.train expects.
positive_dataset = [(token, "Positive")
                     for token in positive_tokens_for_model]
print(positive_dataset[0])


negative_dataset = [(token, "Negative")
                     for token in negative_tokens_for_model]
print(negative_dataset[0])

dataset = positive_dataset + negative_dataset

# Shuffle so both labels appear on each side of the split.
random.shuffle(dataset)

# Fixed split: first 7000 samples for training, the rest for testing.
# NOTE(review): presumably sized for the 10,000-tweet sample corpus -- confirm.
train_data = dataset[:7000]
test_data = dataset[7000:]


In [None]:
# Train a Naive Bayes classifier on the labelled feature dicts and report
# its accuracy on the held-out test split.
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features prints its table itself and returns None,
# so wrapping it in print() only added a stray "None" line.
classifier.show_most_informative_features(10)

In [None]:
#df = pd.read_csv("D:/Git/large datasets/metacritic_game_user_comments.csv", index_col = 0)
#dfN64 = df[df["Platform"] == "Nintendo64"]
#dfN64.to_csv("D:/Git/jupyter-books/Datasets/n64.csv")

In [None]:
# Load the N64 reviews, using the first CSV column as the DataFrame index.
# Same load as the earlier cell; presumably repeated so the word-cloud section can be run on its own.
dfN64 = pd.read_csv("Datasets/n64.csv", index_col = 0)

In [None]:
# Preview the first rows of the N64 reviews.
dfN64.head()

In [None]:
# Join every comment into one space-separated string for the word cloud.
n64Text = dfN64["Comment"].str.cat(sep = " ")

# wordcloud's built-in stop words plus generic gaming words to exclude.
# Deliberately NOT named `stopwords`: that name is the nltk.corpus.stopwords
# import, and rebinding it to a set breaks `stopwords.words('english')` when
# the tokenisation cell is re-run.
cloud_stopwords = set(STOPWORDS)
cloud_stopwords.update(["game", "play", "played", "to", "feel", "even", "want", "make", "made", "much", "still", "playing", "one", "games"])

cloud = WordCloud(width = 1200, height = 800, stopwords = cloud_stopwords, background_color = "white").generate(n64Text)

In [None]:
# Render the word cloud image on a 19.2 x 10.8 inch figure.
plt.figure(figsize = (19.2, 10.8))
# Bilinear interpolation smooths the image when it is scaled to the figure.
plt.imshow(cloud, interpolation = "bilinear")
# Hide the axes: ticks and frame are meaningless for a word cloud.
plt.axis("off")
plt.show()