# Exploratory Data Analysis

In [None]:
import pandas as pd


## Rotten Tomatoes Movies

In [None]:
# Load the de-duplicated Rotten Tomatoes movie table; low_memory=False makes
# pandas read the file in one pass instead of in chunks.
df = pd.read_csv(
    "deduped_data/rotten_tomatoes_movies.csv",
    low_memory=False,
)

# Preview the first rows to get a feel for the columns.
display(df.head())

## Columns

### id
UUID, we don't need it

### title
Not a true UUID, but it is our join key to the other datasets, so it must exist (non-null)

### audienceScore, tomatoMeter
Post-release metrics; we can't use them directly as features, but we can calculate averages per genre, director, lead actor, etc.

### rating
G, PG, PG-13, R, etc. Needs to be one-hot encoded

### ratingContents
The reasons a given rating was assigned; we could encode features from this

### releaseDateTheater
We can't use this directly, but we should use it to adjust for inflation / movie market growth

### releaseDateStreaming
drop

### genre
Contains multiple genres; need to split and one-hot encode

### originalLanguage
encode

### director
Probably won't use directly but we can compute metrics based on director like average box office, budget, rating, reviews etc

### writer
Same as director, probably less important

### boxOffice
Our target, this value needs to exist and we need to adjust for inflation

### distributor
Paramount, 20th century, etc. We can encode and maybe compute avg values

### soundMix
Surround, Dolby, etc. Encode or drop

In [None]:
# Remove the columns we decided not to use.
df = df.drop(columns=["id", "releaseDateStreaming"])
# Drop rows without a title. dropna() returns a new DataFrame rather than
# modifying df, so the result has to be assigned back; otherwise the rows
# with missing titles would still be written out below.
df = df.dropna(subset=["title"])

df.to_csv("merged_data/rotten_tomatoes_movies.csv", index=False)

## Movies Metadata

In [None]:
# Load the de-duplicated movies metadata table; low_memory=False makes pandas
# read the file in one pass instead of in chunks.
df = pd.read_csv(
    "deduped_data/movies_metadata.csv",
    low_memory=False,
)

# Preview the first rows to get a feel for the columns.
display(df.head())

## Columns

### adult
Is it an adult film? Probably drop trues

### belongs_to_collection
Is it a sequel or part of a series? Encode and/or compute averages

### budget
Film budget

### genre
genres as list of json objects (dicts)

### homepage
website, drop

### id
Unique ID for the movie, used to join with the keywords and links datasets

### imdb_id
UUID, drop unless we use it to call imdb api for more data

### original_language
language abbreviation

### original_title
drop

### overview
drop

### popularity
not clear what it means, drop

### poster_path
drop

### production_companies
all companies as list of json objects

### production_countries
drop

### release_date
needed to inflation-adjust revenue (box office)

### revenue
box office revenue

### runtime
keep, maybe drop non feature films or encode

### status
drop unreleased films

### tagline
drop

### title
Not a true UUID, but it is the key we use to join with the other datasets

### video
drop

### vote_average
drop

### vote_count
drop

In [None]:
# Keep only non-adult films that have actually been released.
# The comparison is done on the string form so the filter behaves the same
# whether pandas parsed "adult" as text ("False") or as booleans (False).
df = df[df["adult"].astype(str) == "False"]
df = df[df["status"] == "Released"]

# Remove the columns we decided not to use. Reassigning (instead of
# inplace=True) avoids mutating a frame that was derived by boolean filtering.
df = df.drop(
    columns=[
        "adult",
        "status",
        "homepage",
        "imdb_id",
        "original_title",
        "overview",
        "popularity",
        "poster_path",
        "production_countries",
        "tagline",
        "video",
        "vote_average",
        "vote_count",
    ]
)
# Drop rows without a title. dropna() returns a new DataFrame rather than
# modifying df, so the result has to be assigned back; otherwise the rows
# with missing titles would still be written out below.
df = df.dropna(subset=["title"])

df.to_csv("merged_data/movies_metadata.csv", index=False)

## Wiki Plots

In [None]:
# Load the de-duplicated Wikipedia movie plots table; low_memory=False makes
# pandas read the file in one pass instead of in chunks.
df = pd.read_csv(
    "deduped_data/wiki_movie_plots.csv",
    low_memory=False,
)

# Preview the first rows to get a feel for the columns.
display(df.head())

## Columns

### Release Year
Keep for inflation and joining

### Title
Keep for joining

### Origin/Ethnicity
Drop

### Director
keep for avg metrics

### Cast
keep and compute metrics

### Genre
keep and compute metrics

### Wiki Page
Keep and use for more data mining

### Plot
Keep for text mining

In [None]:
# Remove the column we decided not to use.
df = df.drop(columns=["Origin/Ethnicity"])
# Drop rows without a title. dropna() returns a new DataFrame rather than
# modifying df, so the result has to be assigned back; otherwise the rows
# with missing titles would still be written out below.
df = df.dropna(subset=["Title"])

df.to_csv("merged_data/wiki_movie_plots.csv", index=False)