In [1]:
# Dependencies and setup
from pathlib import Path

import pandas as pd

In [2]:
# Location of the JSON file holding the IMDB reviews
json_path = "Resources/IMDB_reviews.json"

# lines=True makes pandas parse each line of the file as a separate JSON object
imdb_reviews_df = pd.read_json(json_path, lines=True)

# Preview the first rows of the loaded DataFrame
imdb_reviews_df.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


In [3]:
# Cleanse DataFrame: drop rows in which every value is null, drop fully duplicated
# rows (keeping the first occurrence), then rebuild a contiguous 0..n-1 index.
# dropna() returns a new DataFrame rather than modifying the original, so its result
# must be assigned back; chaining the three steps keeps that assignment explicit.
imdb_reviews_df = (
    imdb_reviews_df
    .dropna(how="all")
    .drop_duplicates(keep="first")
    .reset_index(drop=True)
)
imdb_reviews_df.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


In [4]:
# Keep only the columns needed for the aggregation, in the desired order.
# Selecting the wanted columns discards every other column in one step, so no
# separate drop() is needed, and the cell can be re-run without raising a KeyError
# for columns that were already removed.
imdb_reviews_df = imdb_reviews_df[["movie_id", "rating", "is_spoiler"]]
imdb_reviews_df.head()

Unnamed: 0,movie_id,rating,is_spoiler
0,tt0111161,10,True
1,tt0111161,10,True
2,tt0111161,8,True
3,tt0111161,10,True
4,tt0111161,8,True


In [5]:
# Group the reviews by movie and compute the per-movie statistics
imdb_spoilers = imdb_reviews_df.groupby(["movie_id"])
average_rating = imdb_spoilers["rating"].mean()
no_of_reviews = imdb_spoilers["is_spoiler"].count()

# Spoiler counts come from the spoiler-flagged rows only, so a movie without any
# such row is absent from this Series and shows up as NaN in the summary table
no_of_spoilers = (
    imdb_reviews_df
    .loc[imdb_reviews_df["is_spoiler"] == True]
    .groupby("movie_id")["is_spoiler"]
    .count()
)

# Build the summary table, order it by spoiler count (highest first), then turn
# the missing spoiler counts into 0 and store the column as integers
imdb_spoilers_df = (
    pd.DataFrame(
        {
            "Average_Rating": average_rating,
            "Number_of_Reviews": no_of_reviews,
            "Number_of_Spoilers": no_of_spoilers,
        }
    )
    .sort_values("Number_of_Spoilers", ascending=False)
    .assign(
        Number_of_Spoilers=lambda summary: pd.to_numeric(
            summary["Number_of_Spoilers"], errors="coerce"
        )
        .fillna(0)
        .astype(int)
    )
)
imdb_spoilers_df.head()

Unnamed: 0_level_0,Average_Rating,Number_of_Reviews,Number_of_Spoilers
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0468569,8.455108,4845,1482
tt0111161,9.297867,4361,956
tt0167260,8.823745,2729,490
tt2488496,2.592693,739,482
tt0137523,8.645565,2480,459


In [6]:
# Save the aggregated DataFrame to a CSV file, keeping the movie_id index and the header row.
# The output folder is created first because to_csv does not create missing
# directories; this lets the notebook run on a fresh checkout without an "Output/" folder.
output_path = Path("Output/IMDB_spoilers_clean.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
imdb_spoilers_df.to_csv(output_path, index=True, header=True)