# De-Duplicate Data

In [100]:
import pandas as pd
import os
from IPython.core.display import Markdown

from utilites import column_stats

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

In [101]:
class DataSource:
    def __init__(self, path):
        self._path = path
        self._df = pd.read_csv(path, low_memory=False)
        self._keys = []

    def head(self):
        return self._df.head()

    @property
    def df(self):
        return self._df.copy()

    @df.setter
    def df(self, df):
        self._df = df

    @property
    def keys(self):
        return self._keys

    @keys.setter
    def keys(self, *primary_keys):
        if isinstance(primary_keys[0], list):
            self._keys = primary_keys
        else:
            self._keys = [item for sublist in primary_keys for item in sublist]

data_sources = {}

### Movies Metadata

In [102]:
data_sources["Movies Metadata"] = DataSource("../data/movies_metadata.csv")
display(data_sources["Movies Metadata"].head())

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [103]:
data_sources["Movies Metadata"].keys = ["id"], ["imdb_id"], ["title", "release_date"]

### Rotten Tomatoes Movie Reviews

In [104]:
data_sources["Rotten Tomatoes Movie Reviews"] = DataSource("../data/rotten_tomatoes_movie_reviews.csv")
display(data_sources["Rotten Tomatoes Movie Reviews"].head())

Unnamed: 0,id,reviewId,creationDate,criticName,isTopCritic,originalScore,reviewState,publicatioName,reviewText,scoreSentiment,reviewUrl
0,beavers,1145982,2003-05-23,Ivan M. Lincoln,False,3.5/4,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,http://www.deseretnews.com/article/700003233/B...
1,blood_mask,1636744,2007-06-02,The Foywonder,False,1/5,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,http://www.dreadcentral.com/index.php?name=Rev...
2,city_hunter_shinjuku_private_eyes,2590987,2019-05-28,Reuben Baron,False,,fresh,CBR,The choreography is so precise and lifelike at...,POSITIVE,https://www.cbr.com/city-hunter-shinjuku-priva...
3,city_hunter_shinjuku_private_eyes,2558908,2019-02-14,Matt Schley,False,2.5/5,rotten,Japan Times,The film's out-of-touch attempts at humor may ...,NEGATIVE,https://www.japantimes.co.jp/culture/2019/02/0...
4,dangerous_men_2015,2504681,2018-08-29,Pat Padua,False,,fresh,DCist,Its clumsy determination is endearing and some...,POSITIVE,http://dcist.com/2015/11/out_of_frame_dangerou...


In [105]:
data_sources["Rotten Tomatoes Movie Reviews"].keys = ["reviewId"]

### Rotten Tomatoes Movies

In [106]:
data_sources["Rotten Tomatoes Movies"] = DataSource("../data/rotten_tomatoes_movies.csv")
display(data_sources["Rotten Tomatoes Movies"].head())

Unnamed: 0,id,title,audienceScore,tomatoMeter,rating,ratingContents,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,originalLanguage,director,writer,boxOffice,distributor,soundMix
0,space-zombie-bingo,Space Zombie Bingo!,50.0,,,,,2018-08-25,75.0,"Comedy, Horror, Sci-fi",English,George Ormrod,"George Ormrod,John Sabotta",,,
1,the_green_grass,The Green Grass,,,,,,2020-02-11,114.0,Drama,English,Tiffany Edwards,Tiffany Edwards,,,
2,love_lies,"Love, Lies",43.0,,,,,,120.0,Drama,Korean,"Park Heung-Sik,Heung-Sik Park","Ha Young-Joon,Jeon Yun-su,Song Hye-jin",,,
3,the_sore_losers_1997,Sore Losers,60.0,,,,,2020-10-23,90.0,"Action, Mystery & thriller",English,John Michael McCarthy,John Michael McCarthy,,,
4,dinosaur_island_2002,Dinosaur Island,70.0,,,,,2017-03-27,80.0,"Fantasy, Adventure, Animation",English,Will Meugniot,John Loy,,,


In [107]:
data_sources["Rotten Tomatoes Movies"].keys = ["id"], ["title"]

## Keywords

In [108]:
data_sources["Keywords"] = DataSource("../data/keywords.csv")
display(data_sources["Keywords"].head())

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [109]:
data_sources["Keywords"].keys = ["id"]

## Links

In [110]:
data_sources["Links"] = DataSource("../data/links.csv")
display(data_sources["Links"].head())

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [111]:
data_sources["Links"].keys = ["movieId"], ["imdbId"], ["tmdbId"]

## Ratings

In [112]:
data_sources["Ratings"] = DataSource("../data/ratings.csv")
display(data_sources["Ratings"].head())

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [113]:
data_sources["Ratings"].keys = ["userId", "movieId"]

In [114]:
data_sources["Wiki Movie Plots"] = DataSource("../data/wiki_movie_plots.csv")
display(data_sources["Wiki Movie Plots"].head())

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [115]:
data_sources["Wiki Movie Plots"].keys = ["Wiki Page"]

## Clean and Remove Duplicates in the Keys

In [116]:
for name, data_source in data_sources.items():
    display(Markdown(f"### {name} Key(s) Stats" ))
    display(column_stats(
        data_source.df[sum(data_source.keys, [])],
        metrics=["Data Type", "Count", "Unique Values", "Unique Rate"]))

    df = data_source.df
    duplicates = {}
    for key in data_source.keys:
        duplicates[str(key)] = df[df.duplicated(subset=key, keep=False)]
        duplicates[str(key)] = duplicates[str(key)].sort_values(by=key)
        df.drop_duplicates(subset=key, inplace=True)

    # Write-back cleaned df
    data_source.df = df

    display(Markdown(f"### {name} Key(s) Stats (Cleaned)"))
    display(column_stats(
        df[sum(data_source.keys, [])],
        metrics=["Data Type", "Count", "Unique Values", "Unique Rate"]))

    for key, duplicate in duplicates.items():
        display(Markdown(f"#### Duplicates of Key: {str(key)}"))
        display(duplicate)

### Movies Metadata Key(s) Stats

Unnamed: 0,Data Type,Count,Unique Values,Unique Rate
id,object,45466,45436,99.934017
imdb_id,object,45449,45417,99.892227
title,object,45460,42277,92.985968
release_date,object,45379,17336,38.129591


### Movies Metadata Key(s) Stats (Cleaned)

Unnamed: 0,Data Type,Count,Unique Values,Unique Rate
id,object,45416,45416,100.0
imdb_id,object,45415,45415,99.997798
title,object,45414,42264,93.059715
release_date,object,45334,17334,38.167166


#### Duplicates of Key: ['id']

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
676,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,105045,tt0111613,de,Das Versprechen,"East-Berlin, 1961, shortly after the erection ...",0.122178,/5WFIrBhOOgc0jGmoLxMZwWqCctO.jpg,"[{'name': 'Studio Babelsberg', 'id': 264}, {'n...","[{'iso_3166_1': 'DE', 'name': 'Germany'}]",1995-02-16,0.0,115.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Released,"A love, a hope, a wall.",The Promise,False,5.0,1.0
1465,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,105045,tt0111613,de,Das Versprechen,"East-Berlin, 1961, shortly after the erection ...",0.122178,/5WFIrBhOOgc0jGmoLxMZwWqCctO.jpg,"[{'name': 'Studio Babelsberg', 'id': 264}, {'n...","[{'iso_3166_1': 'DE', 'name': 'Germany'}]",1995-02-16,0.0,115.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Released,"A love, a hope, a wall.",The Promise,False,5.0,1.0
44821,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",16000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://movies.warnerbros.com/pk3/,10991,tt0235679,ja,Pokémon 3: The Movie,When Molly Hale's sadness of her father's disa...,6.480376,/5ILjS6XB5deiHop8SXPsYxXWVPE.jpg,"[{'name': 'TV Tokyo', 'id': 3034}, {'name': '4...","[{'iso_3166_1': 'JP', 'name': 'Japan'}]",2000-07-08,68411275.0,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Pokémon: Spell of the Unknown,Pokémon: Spell of the Unknown,False,6.0,144.0
4114,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",16000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://movies.warnerbros.com/pk3/,10991,tt0235679,ja,Pokémon 3: The Movie,When Molly Hale's sadness of her father's disa...,10.264597,/5ILjS6XB5deiHop8SXPsYxXWVPE.jpg,"[{'name': 'TV Tokyo', 'id': 3034}, {'name': '4...","[{'iso_3166_1': 'JP', 'name': 'Japan'}]",2000-07-08,68411275.0,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Pokémon: Spell of the Unknown,Pokémon: Spell of the Unknown,False,6.0,143.0
5710,False,,0,"[{'id': 18, 'name': 'Drama'}]",,109962,tt0082992,en,Rich and Famous,Two literary women compete for 20 years: one w...,12.180836,/tOflyY8eUFWubLKJH7fKg4KwpCl.jpg,"[{'name': 'Metro-Goldwyn-Mayer (MGM)', 'id': 8...","[{'iso_3166_1': 'US', 'name': 'United States o...",1981-09-23,0.0,115.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"From the very beginning, they knew they'd be f...",Rich and Famous,False,4.9,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21116,False,,0,"[{'id': 99, 'name': 'Documentary'}]",,84198,tt1736049,en,A Place at the Table,"Using personal stories, this powerful document...",1.673307,/jn8L1QdWWX5c0NUOLjzaSXtZrbt.jpg,[],"[{'iso_3166_1': 'US', 'name': 'United States o...",2012-03-22,0.0,84.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,One Nation. Underfed.,A Place at the Table,False,6.9,7.0
13946,False,,0,"[{'id': 35, 'name': 'Comedy'}]",,97995,tt0127834,en,Seven Years Bad Luck,"After breaking a mirror in his home, superstit...",0.141558,/4J6Ai4C5YRgfRUTlirrJ7QsmJKU.jpg,"[{'name': 'Max Linder Productions', 'id': 38162}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1921-02-06,0.0,62.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Seven Years Bad Luck,False,5.6,4.0
40276,False,,0,"[{'id': 35, 'name': 'Comedy'}]",,97995,tt0127834,en,Seven Years Bad Luck,"After breaking a mirror in his home, superstit...",0.141558,/4J6Ai4C5YRgfRUTlirrJ7QsmJKU.jpg,"[{'name': 'Max Linder Productions', 'id': 38162}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1921-02-06,0.0,62.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Seven Years Bad Luck,False,5.6,4.0
38871,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,99080,tt0022537,en,The Viking,"Originally called White Thunder, American prod...",0.002362,/qenjwRvW9itR5pVp4CBkYfhVAOp.jpg,[],[],1931-06-21,0.0,70.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Actually produced during the Great Newfoundlan...,The Viking,False,0.0,0.0


#### Duplicates of Key: ['imdb_id']

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[{'name': 'Carousel Productions', 'id': 11176}...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1997-08-20,0.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Midnight Man,False,6.0,1,,,,,,,,,
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-09-29,0.0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Mardock Scramble: The Third Exhaust,False,7.0,12,,,,,,,,,
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2014-01-01,0.0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Beware Of Frost Bites,Avalanche Sharks,False,4.3,22,,,,,,,,,
8966,False,,1000000,"[{'id': 80, 'name': 'Crime'}]",,36337,,en,Delusion,"In this fast-paced, noirish road movie, a comp...",0.156722,/g2o1J0ulttuwovqLc0ho910MR95.jpg,"[{'name': 'Cineville', 'id': 2832}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1991-06-07,0.0,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,She only wanted love. But money's better than ...,Delusion,False,4.8,3.0
13757,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 53, 'nam...",,200796,,pl,Show,,0.149818,/jJqBwcV1rG3IfbxPA7dk8BgtZR3.jpg,[],[],2003-03-19,0.0,,[],Released,,Show,False,6.3,2.0
13821,False,"{'id': 75014, 'name': 'How I Unleashed World W...",0,"[{'id': 10769, 'name': 'Foreign'}, {'id': 28, ...",,75015,,pl,Jak rozpętałem drugą wojnę światową: Cz.3 - Wś...,How I Unleashed World War II tells the story o...,0.202468,/1nkuFJmr6FZ510hTurudbmKjLQO.jpg,[],"[{'iso_3166_1': 'PL', 'name': 'Poland'}]",1970-04-06,0.0,73.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}, {'iso...",Released,,How I Unleashed World War II Part III: Among F...,False,7.0,3.0
17382,False,,2500000,"[{'id': 9648, 'name': 'Mystery'}, {'id': 53, '...",http://www.delawarepictures.net/dreamkiller/,36663,,en,Dreamkiller,"A team of doctors experiment with a new, highl...",0.035294,,[],[],,0.0,110.0,[],Released,Fear is the greatest killer.,Dreamkiller,False,5.0,1.0
17510,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,47116,,en,Víťaz,Tou former boxers meet in the ring again after...,0.0,/yMCadbpPKxmYdkLdVpvsH0tPIwq.jpg,[],[],1979-03-09,0.0,78.0,[],Rumored,,The Winner,False,0.0,0.0
18959,False,,0,"[{'id': 99, 'name': 'Documentary'}, {'id': 16,...",,28500,,en,Before The Dinosaurs - Walking With Monsters,Many people think of the dinosaurs as the firs...,1.556352,/wCbL4IKeKmuMQq1RFjXtp1H4QeD.jpg,"[{'name': 'BBC', 'id': 5996}]","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]",2005-11-05,0.0,87.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Before The Dinosaurs - Walking With Monsters,False,7.3,12.0
19322,False,,0,[],,118013,,en,Endeavour,Shaun Evans (The Take) steps into John Thaw's ...,1.233673,/aGwgeCPl9QIlcUhPHzCjm6dhprr.jpg,[],[],,0.0,98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Rookie detective Endeavor Morse faces his firs...,Endeavour,False,6.6,19.0


#### Duplicates of Key: ['title', 'release_date']

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
19729,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,82663,tt0113002,en,Midnight Man,British soldiers force a recently captured IRA...,,,,,,,,,,,,,,
29502,False,"{'id': 122661, 'name': 'Mardock Scramble Colle...",0,"[{'id': 16, 'name': 'Animation'}, {'id': 878, ...",http://m-scramble.jp/exhaust/,122662,tt2423504,ja,マルドゥック・スクランブル 排気,Third film of the Mardock Scramble series.,,,,,,,,,,,,,,
35586,False,,0,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 28,...",,249260,tt2622826,en,Avalanche Sharks,A group of skiers are terrorized during spring...,,,,,,,,,,,,,,


### Rotten Tomatoes Movie Reviews Key(s) Stats

Unnamed: 0,Data Type,Count,Unique Values,Unique Rate
reviewId,int64,1444963,1432569,99.142262


### Rotten Tomatoes Movie Reviews Key(s) Stats (Cleaned)

Unnamed: 0,Data Type,Count,Unique Values,Unique Rate
reviewId,int64,1432569,1432569,100.0


#### Duplicates of Key: ['reviewId']

Unnamed: 0,id,reviewId,creationDate,criticName,isTopCritic,originalScore,reviewState,publicatioName,reviewText,scoreSentiment,reviewUrl
1304101,full_moon_in_paris,7726,2000-01-01,Dave Kehr,True,,fresh,Chicago Reader,"Full Moon in Paris, the fourth in the series, ...",POSITIVE,http://onfilm.chicagoreader.com/movies/capsule...
1298252,full_moon_in_paris,7726,2000-01-01,Dave Kehr,True,,fresh,Chicago Reader,"Full Moon in Paris, the fourth in the series, ...",POSITIVE,http://onfilm.chicagoreader.com/movies/capsule...
94433,gigi,7977,2000-01-01,MaryAnn Johanson,False,9/10,fresh,Flick Filosopher,A charming delight.,POSITIVE,http://www.flickfilosopher.com/oscars/bestpix/...
94761,gigi,7977,2000-01-01,MaryAnn Johanson,False,9/10,fresh,Flick Filosopher,A charming delight.,POSITIVE,http://www.flickfilosopher.com/oscars/bestpix/...
1304838,heartburn,9051,2000-01-01,Roger Ebert,True,2/4,rotten,Chicago Sun-Times,"Here and there, we see glimmers of the greatne...",NEGATIVE,https://www.rogerebert.com/reviews/heartburn-1986
...,...,...,...,...,...,...,...,...,...,...,...
498398,catch_me_if_you_can,102796080,2023-04-08,Sean Axmaker,False,,fresh,Stream on Demand,Spielberg beautifully creates the culture of t...,POSITIVE,https://streamondemandathome.com/steven-spielb...
1295783,cherry_2022,102796110,2023-04-08,Dennis Schwartz,False,B+,fresh,Dennis Schwartz Movie Reviews,Pro-abortion rights film&#46;,POSITIVE,https://dennisschwartzreviews.com/cherry-2/
1307445,cherry_2022,102796110,2023-04-08,Dennis Schwartz,False,B+,fresh,Dennis Schwartz Movie Reviews,Pro-abortion rights film&#46;,POSITIVE,https://dennisschwartzreviews.com/cherry-2/
1304106,astrakan,102796136,2023-04-08,Michael Sicinski,False,,fresh,In Review Online,Withholding key information and avoiding expos...,POSITIVE,https://inreviewonline.com/astrakan


### Rotten Tomatoes Movies Key(s) Stats

Unnamed: 0,Data Type,Count,Unique Values,Unique Rate
id,object,143258,142052,99.158162
title,object,142891,126403,88.234514


### Rotten Tomatoes Movies Key(s) Stats (Cleaned)

Unnamed: 0,Data Type,Count,Unique Values,Unique Rate
id,object,126404,126404,100.0
title,object,126403,126403,99.999209


#### Duplicates of Key: ['id']

Unnamed: 0,id,title,audienceScore,tomatoMeter,rating,ratingContents,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,originalLanguage,director,writer,boxOffice,distributor,soundMix
134663,$5_a_day,Five Dollars a Day,49.0,,PG-13,"['Sexual Content', 'Language', 'Brief Nudity']",,2010-08-24,98.0,Comedy,English,Nigel Cole,"Neal H. Dobrofsky,Tippi Dobrofsky",,,
127459,$5_a_day,Five Dollars a Day,49.0,,PG-13,"['Sexual Content', 'Language', 'Brief Nudity']",,2010-08-24,98.0,Comedy,English,Nigel Cole,"Neal H. Dobrofsky,Tippi Dobrofsky",,,
129415,0s_and_1s,0s & 1s,,,,,,2017-03-11,83.0,Comedy,English,Eugene Kotlyarenko,"Eugene Kotlyarenko,Morgan Krantz,Andrew Schwar...",,,
130221,0s_and_1s,0s & 1s,,,,,,2017-03-11,83.0,Comedy,English,Eugene Kotlyarenko,"Eugene Kotlyarenko,Morgan Krantz,Andrew Schwar...",,,
129350,10004207-grand_slam,Grand Slam,29.0,,,,,,67.0,Comedy,English,William Dieterle,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129438,zardoz,Zardoz,53.0,47.0,,,,2002-12-17,104.0,Sci-fi,English (United Kingdom),John Boorman,John Boorman,,,
129235,zavallilar,The Poor,,,,,,,72.0,Drama,Turkish,"Yilmaz Guney,Atif Yilmaz","Yilmaz Guney,Atif Yilmaz",,,
129745,zavallilar,The Poor,,,,,,,72.0,Drama,Turkish,"Yilmaz Guney,Atif Yilmaz","Yilmaz Guney,Atif Yilmaz",,,
9863,zombie_massacre_army_of_the_dead,Zombie Massacre: Army of the Dead,20.0,,,,,2016-10-24,83.0,Horror,English,Gary Ugarek,,,,


#### Duplicates of Key: ['title']

Unnamed: 0,id,title,audienceScore,tomatoMeter,rating,ratingContents,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,originalLanguage,director,writer,boxOffice,distributor,soundMix
83053,til_death_do_us_part_2003,'Til Death Do Us Part,,,,,,,100.0,"Drama, Comedy",French (Canada),Robert Guédiguian,"Jean-Louis Milesi,Robert Guédiguian",,,
73023,til-death-do-us-part2008,'Til Death Do Us Part,60.0,,,,,,90.0,Documentary,English,Vita Lusty,,,,
80745,straight_on_till_morning,'Til Death Do Us Part,43.0,,,,,,83.0,Horror,English,Vicente Aranda,,,,
118377,til_death_do_us_part,'Til Death Do Us Part,57.0,,PG-13,"['Thematic Elements', 'Domestic Abuse', 'Some ...",2017-09-29,2018-08-01,100.0,Mystery & thriller,English,Chris Stokes,"Chris Stokes,Marques Houston",$3.5M,Novus Content,
136892,twas_the_night_before_christmas_1974,'Twas the Night Before Christmas,65.0,71.0,,,,,23.0,"Animation, Fantasy, Holiday, Kids & family",English,"Jules Bass,Arthur Rankin Jr.",Jerome Coopersmith,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136493,nordstrand,,,,,,,,89.0,Drama,German,Florian Eichinger,Florian Eichinger,,,
137235,quarry,,,,,,,,,,,Greg Chwerchak,,,,
137815,freda,,,,,,,,,,,,,,,
140080,taxi_ballad,,,,,,,,,,,,,,,


### Keywords Key(s) Stats

Unnamed: 0,Data Type,Count,Unique Values,Unique Rate
id,int64,46419,45432,97.873716


### Keywords Key(s) Stats (Cleaned)

Unnamed: 0,Data Type,Count,Unique Values,Unique Rate
id,int64,45432,45432,100.0


#### Duplicates of Key: ['id']

Unnamed: 0,id,keywords
37095,1998,"[{'id': 417, 'name': 'corruption'}, {'id': 612..."
36138,1998,"[{'id': 417, 'name': 'corruption'}, {'id': 612..."
36822,3025,"[{'id': 212, 'name': 'london england'}, {'id':..."
35865,3025,"[{'id': 212, 'name': 'london england'}, {'id':..."
35999,3692,"[{'id': 470, 'name': 'spy'}, {'id': 591, 'name..."
...,...,...
36190,380841,[]
36193,380864,[]
37150,380864,[]
37280,381353,[]


### Links Key(s) Stats

Unnamed: 0,Data Type,Count,Unique Values,Unique Rate
movieId,int64,45843,45843,100.0
imdbId,int64,45843,45843,100.0
tmdbId,float64,45624,45594,99.456842


### Links Key(s) Stats (Cleaned)

Unnamed: 0,Data Type,Count,Unique Values,Unique Rate
movieId,int64,45595,45595,100.0
imdbId,int64,45595,45595,100.0
tmdbId,float64,45594,45594,99.997807


#### Duplicates of Key: ['movieId']

Unnamed: 0,movieId,imdbId,tmdbId


#### Duplicates of Key: ['imdbId']

Unnamed: 0,movieId,imdbId,tmdbId


#### Duplicates of Key: ['tmdbId']

Unnamed: 0,movieId,imdbId,tmdbId
5905,6003,290538,4912.0
34144,144606,270288,4912.0
9215,27136,165303,5511.0
7388,7587,62229,5511.0
45197,174533,235679,10991.0
...,...,...,...
24673,115254,21733,
24775,115715,3670792,
24802,115821,3900116,
28570,128734,4438688,


### Ratings Key(s) Stats

Unnamed: 0,Data Type,Count,Unique Values,Unique Rate
userId,int64,26024289,270896,1.040935
movieId,int64,26024289,45115,0.173357


### Ratings Key(s) Stats (Cleaned)

Unnamed: 0,Data Type,Count,Unique Values,Unique Rate
userId,int64,26024289,270896,1.040935
movieId,int64,26024289,45115,0.173357


#### Duplicates of Key: ['userId', 'movieId']

Unnamed: 0,userId,movieId,rating,timestamp


### Wiki Movie Plots Key(s) Stats

Unnamed: 0,Data Type,Count,Unique Values,Unique Rate
Wiki Page,object,34886,34070,97.660953


### Wiki Movie Plots Key(s) Stats (Cleaned)

Unnamed: 0,Data Type,Count,Unique Values,Unique Rate
Wiki Page,object,34070,34070,100.0


#### Duplicates of Key: ['Wiki Page']

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
17813,2009,$9.99,Australian,Tatia Rosenthal,Geoffrey Rush\r\nJoel Edgerton\r\nAnthony LaPa...,animation drama,https://en.wikipedia.org/wiki/$9.99,The film mainly focuses on 28-year-old Dave Pe...
17796,2008,$9.99,Australian,Tatia Rosenthal,"Geoffrey Rush, Joel Edgerton, Samuel Johnson",animation,https://en.wikipedia.org/wiki/$9.99,The film mainly focuses on 28-year-old Dave Pe...
21611,2016,100 Streets,British,Director: Jim O'Hanlon,"Director: Jim O'Hanlon\r\nCast: Idris Elba, Ge...",unknown,https://en.wikipedia.org/wiki/100_Streets,The film centers on three characters who have ...
17168,2017,100 Streets,American,Jim O'Hanlon,Jim O'Hanlon (director); Leon F. Butler (scree...,drama,https://en.wikipedia.org/wiki/100_Streets,The film centers on three characters who have ...
32126,2002,123,Telugu,K. Subhaash,"Prabhu Deva, Jyothika, Raju Sundaram, Nagendra...",comedy,https://en.wikipedia.org/wiki/123_(film),"Tirupathi (Prabhu Deva), Pazhani (Raju Sundara..."
...,...,...,...,...,...,...,...,...
32592,2013,Toofan,Telugu,Apoorva Lakhia,"Ram Charan Tej, Priyanka Chopra, Prakash Raj, ...",action,https://en.wikipedia.org/wiki/Zanjeer_(2013_film),ACP Vijay Khanna (Ram Charan) is a brutally ho...
28680,2010,Zenda,Marathi,Avdhoot Gupte,"Sachit Patil, Santosh Juvekar, Pushkar Shotri,...",politics,https://en.wikipedia.org/wiki/Zenda_(film),The movie is based on actual life of party wor...
28673,2009,Zenda,Marathi,Avadhoot Gupte,"Siddharth Chandekar,Chinmay Mandlekar, Santosh...",political drama,https://en.wikipedia.org/wiki/Zenda_(film),The movie is based on actual life of party wor...
29177,1958,Zimbo Dubbed from Hindi,Tamil,Homi Wadia,"Azad, Krishna Kumari, Chitra, Achla Sachdev, S...",unknown,https://en.wikipedia.org/wiki/Zimbo_(film),Professor Chakravarty and his wife Uma (Achala...


In [117]:
for data_source in data_sources.values():
    file_name = os.path.basename(data_source._path)

    data_source.df.to_csv(f"./deduped_data/{file_name}", index=False)