In [1]:
%matplotlib inline
import pandas as pd

In [2]:
from IPython.core.display import HTML
css = open('table.css').read() + open('notebook.css').read()
HTML('<style>{}</style>'.format(css))

In [3]:
reviews = pd.read_csv("reviews.csv", encoding="utf-8")
reviews.head(5)

Unnamed: 0,userID,productID,userName,reviewText,summary,unixReviewTime,reviewTime
0,A2HD75EMZR8QLN,700099867,123,Installing the game was a struggle (because of...,Pay to unlock content? I don't think so.,1341792000,"07 9, 2012"
1,A3UR8NLLY1ZHCX,700099867,"Alejandro Henao ""Electronic Junky""",If you like rally cars get this game you will ...,Good rally game,1372550400,"06 30, 2013"
2,A1INA0F5CWW3J4,700099867,"Amazon Shopper ""Mr.Repsol""",1st shipment received a book instead of the ga...,Wrong key,1403913600,"06 28, 2014"
3,A1DLMTOTHQ4AST,700099867,ampgreen,"I got this version instead of the PS3 version,...","awesome game, if it did not crash frequently !!",1315958400,"09 14, 2011"
4,A361M14PU2GUEG,700099867,"Angry Ryan ""Ryan A. Forrest""",I had Dirt 2 on Xbox 360 and it was an okay ga...,DIRT 3,1308009600,"06 14, 2011"


#### How many reviews are there?

In [4]:
len(reviews)

231780

#### Who has the most review data?

In [5]:
reviews[reviews["userID"] == reviews["userID"].value_counts().idxmax()].userName.head(1)

1400    Lisa Shea "be the change you wish to see in t...
Name: userName, dtype: object

#### Convert unixReviewTime field to date field and add it dataframe with column name date

In [6]:
reviews["date"] = pd.to_datetime(reviews["unixReviewTime"], unit='s')

#### What is the very first reviewText?

In [7]:
reviews.sort_values(by="unixReviewTime", ascending=True).head(1)[["reviewText","date"]]

Unnamed: 0,reviewText,date
4410,I'm having the most fun I've ever had on PlayS...,1999-10-14


#### What is the name of person that made first review?

In [8]:
reviews.sort_values(by="unixReviewTime", ascending=True).head(1)[["userName","date"]]

Unnamed: 0,userName,date
4410,"""kwobooks""",1999-10-14


#### What is the id of video games that is the last reviewed?

In [9]:
reviews.sort_values(by="unixReviewTime", ascending=False).head(1)[["productID","date"]]

Unnamed: 0,productID,date
218837,B00BMFIXT2,2014-07-22


#### Make summaries lowercase and remove punctuations

In [10]:
import string
reviews["summary"] = reviews["summary"].str.lower()
filter_punctuations = lambda summary : "".join(list(filter(lambda x:x not in string.punctuation, str(summary))))
reviews["summary"] = reviews["summary"].apply(filter_punctuations)

#### What is the most occurrent summary in all data?

In [11]:
reviews["summary"].value_counts().nlargest(10)

great game      3749
awesome         1402
great           1302
good game       1259
love it          930
awesome game     796
fun game         766
fun              760
amazing          608
good             604
Name: summary, dtype: int64

#### What is the most occurrent summary in 2011?

In [12]:
reviews[reviews.date.dt.year == 2011]["summary"].value_counts().nlargest(10)

great game      316
awesome         102
great            83
good game        82
amazing          67
fun game         67
awesome game     67
love it          63
fun              63
wow              36
Name: summary, dtype: int64

#### What is the most occurent word in summaries in 2000?

In [13]:
from collections import Counter
all_counter = Counter(reviews[reviews.date.dt.year == 2000]["summary"].str.split(" ").sum())
all_counter.most_common(10)

[('the', 413),
 ('game', 403),
 ('a', 356),
 ('great', 201),
 ('of', 173),
 ('best', 151),
 ('for', 140),
 ('good', 140),
 ('but', 137),
 ('to', 129)]

#### What is the most occurent word in reviewTexts in 2000?

In [14]:
from collections import Counter
counter = Counter(reviews[reviews.date.dt.year == 2000]["reviewText"].str.split(" ").sum())
counter.most_common(10)

[('', 18114),
 ('the', 17314),
 ('and', 9467),
 ('a', 9045),
 ('to', 8865),
 ('of', 7549),
 ('is', 6833),
 ('you', 6007),
 ('game', 4659),
 ('I', 4492)]

#### What is the most occurent word in reviewTexts before 2000?

In [15]:
from collections import Counter
counter = Counter(reviews[reviews.date.dt.year < 2000]["reviewText"].str.split(" ").sum())
counter.most_common(10)

[('', 561),
 ('the', 380),
 ('and', 220),
 ('a', 211),
 ('to', 193),
 ('is', 156),
 ('of', 149),
 ('you', 133),
 ('I', 104),
 ('this', 97)]