# De-Duplicate Data

In [None]:
import pandas as pd
import os
from IPython.core.display import Markdown

from utilites import column_stats

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

In [None]:
class DataSource:
    """A CSV file loaded into a DataFrame, plus the keys used to de-duplicate it.

    ``keys`` is always stored as a list of keys, where each key is itself a
    list of column names (one name for a simple key, several for a composite
    key).
    """

    def __init__(self, path):
        """Read the CSV at ``path``; the source starts with no keys declared."""
        self._path = path
        # low_memory=False: parse the file in one pass instead of in chunks,
        # which avoids mixed-dtype columns from per-chunk type inference.
        self._df = pd.read_csv(path, low_memory=False)
        self._keys = []

    def head(self):
        """Return the first rows of the stored DataFrame (``DataFrame.head()``)."""
        return self._df.head()

    @property
    def df(self):
        """Return a copy of the stored DataFrame.

        A new copy is made on every access, so in-place edits by the caller
        do not touch the stored frame until it is assigned back.
        """
        return self._df.copy()

    @df.setter
    def df(self, df):
        """Replace the stored DataFrame with ``df`` (stored as-is, not copied)."""
        self._df = df

    @property
    def keys(self):
        """Return the declared keys as a list of column-name lists."""
        return self._keys

    @keys.setter
    def keys(self, *primary_keys):
        """Declare the keys of this source.

        Accepted forms (all normalised to a list of lists):

        * ``"id"``                      -> ``[["id"]]``
        * ``["id"]``                    -> ``[["id"]]``
        * ``["userId", "movieId"]``     -> ``[["userId", "movieId"]]`` (one composite key)
        * ``["id"], ["title", "year"]`` -> ``[["id"], ["title", "year"]]``
        """
        # Property assignment always passes exactly one value; the varargs
        # form is kept for code that calls the setter function directly with
        # one argument per key.
        spec = primary_keys[0] if len(primary_keys) == 1 else primary_keys
        self._keys = self._normalize_keys(spec)

    @staticmethod
    def _normalize_keys(spec):
        """Convert any accepted key specification into a list of column lists."""
        # A bare column label (e.g. "id") is a single one-column key; it must
        # not be iterated, or a string would be split into characters.
        if not isinstance(spec, (list, tuple)):
            return [[spec]]

        items = list(spec)
        if not items:
            return []

        # A flat sequence of labels is ONE (possibly composite) key.
        if not any(isinstance(item, (list, tuple)) for item in items):
            return [items]

        # Otherwise every element is its own key; wrap stray bare labels.
        return [
            list(item) if isinstance(item, (list, tuple)) else [item]
            for item in items
        ]

# Registry of every loaded DataSource, keyed by a human-readable name.
data_sources = {}

## Movies Metadata

In [None]:
# Load the movies metadata CSV into the registry and preview its first rows.
data_sources["Movies Metadata"] = DataSource("../data/movies_metadata.csv")
display(data_sources["Movies Metadata"].head())

In [None]:
# Three keys: `id`, `imdb_id`, and the composite (`title`, `release_date`).
data_sources["Movies Metadata"].keys = ["id"], ["imdb_id"], ["title", "release_date"]

## Rotten Tomatoes Movie Reviews

In [None]:
# Load the Rotten Tomatoes reviews CSV into the registry and preview its first rows.
data_sources["Rotten Tomatoes Movie Reviews"] = DataSource("../data/rotten_tomatoes_movie_reviews.csv")
display(data_sources["Rotten Tomatoes Movie Reviews"].head())

In [None]:
# One single-column key: `reviewId`.
data_sources["Rotten Tomatoes Movie Reviews"].keys = ["reviewId"]

## Rotten Tomatoes Movies

In [None]:
# Load the Rotten Tomatoes movies CSV into the registry and preview its first rows.
data_sources["Rotten Tomatoes Movies"] = DataSource("../data/rotten_tomatoes_movies.csv")
display(data_sources["Rotten Tomatoes Movies"].head())

In [None]:
# Two keys: `id`, and the composite (`title`, `releaseDateTheaters`).
data_sources["Rotten Tomatoes Movies"].keys = ["id"], ["title", "releaseDateTheaters"]

## Keywords

In [None]:
# Load the keywords CSV into the registry and preview its first rows.
data_sources["Keywords"] = DataSource("../data/keywords.csv")
display(data_sources["Keywords"].head())

In [None]:
# One single-column key: `id`.
data_sources["Keywords"].keys = ["id"]

## Links

In [None]:
# Load the links CSV into the registry and preview its first rows.
data_sources["Links"] = DataSource("../data/links.csv")
display(data_sources["Links"].head())

In [None]:
# Three single-column keys: `movieId`, `imdbId`, and `tmdbId`.
data_sources["Links"].keys = ["movieId"], ["imdbId"], ["tmdbId"]

## Ratings

In [None]:
# Load the ratings CSV into the registry and preview its first rows.
data_sources["Ratings"] = DataSource("../data/ratings.csv")
display(data_sources["Ratings"].head())

In [None]:
# One composite key: (`userId`, `movieId`) — a flat list is stored as a single key.
data_sources["Ratings"].keys = ["userId", "movieId"]

## Wikipedia Movie Plots

In [None]:
# Load the Wikipedia movie plots CSV into the registry and preview its first rows.
data_sources["Wiki Movie Plots"] = DataSource("../data/wiki_movie_plots.csv")
display(data_sources["Wiki Movie Plots"].head())

In [None]:
# Two keys: `Wiki Page`, and the composite (`Title`, `Director`).
data_sources["Wiki Movie Plots"].keys = ["Wiki Page"], ["Title", "Director"]

## Clean and Remove Duplicates in the Keys

In [None]:
# Metrics reported for the key columns, before and after de-duplication.
key_stats_metrics = ["Data Type", "Count", "Unique Values", "Unique Rate"]

# For every source: show key stats, drop rows that duplicate any declared key,
# write the cleaned frame back, show the stats again, then list what was found.
for name, data_source in data_sources.items():
    # `DataSource.df` hands out a fresh copy on every access, so fetch it once
    # and reuse it for the "before" stats and the in-place clean-up.
    df = data_source.df

    # All columns that take part in any key, in declaration order.
    key_columns = [column for key in data_source.keys for column in key]

    display(Markdown(f"### {name} Key(s) Stats"))
    display(column_stats(df[key_columns], metrics=list(key_stats_metrics)))

    # Maps str(key) -> the rows that collided on that key, sorted by the key.
    duplicates = {}
    for key in data_source.keys:
        # keep=False flags every member of a duplicate group (not just the
        # repeats) so the full group can be inspected afterwards.
        duplicated_rows = df[df.duplicated(subset=key, keep=False)]
        duplicates[str(key)] = duplicated_rows.sort_values(by=key)

        # Keys are applied one after another, so each later key is checked
        # against the frame already cleaned by the earlier ones.
        # NOTE(review): pandas appears to treat missing values as equal when
        # detecting duplicates, so rows whose key is NaN may be collapsed into
        # one here — confirm that this is intended.
        df.drop_duplicates(subset=key, inplace=True)

    # Write-back cleaned df
    data_source.df = df

    display(Markdown(f"### {name} Key(s) Stats (Cleaned)"))
    display(column_stats(df[key_columns], metrics=list(key_stats_metrics)))

    for key, duplicate in duplicates.items():
        display(Markdown(f"#### Duplicates of Key: {key}"))
        display(duplicate)

In [None]:
# Write every cleaned source to ./deduped_data under its original file name.
output_dir = "./deduped_data"

# `to_csv` does not create missing directories, so make sure the target
# exists; exist_ok=True makes re-running the cell harmless.
os.makedirs(output_dir, exist_ok=True)

for data_source in data_sources.values():
    file_name = os.path.basename(data_source._path)

    # index=False: the row index is not written as a column.
    data_source.df.to_csv(f"{output_dir}/{file_name}", index=False)