# IMDB Movie Dataset Cleaning

### Import Pandas Library

In [50]:
import pandas as pd

### Read the data file

In [51]:
# Load the raw IMDb movie export into `df` and preview two random rows.
# NOTE(review): the 'unicode_escape' codec also interprets backslash escape
# sequences found in the data — confirm this is intended rather than a plain
# single-byte encoding such as 'latin-1'.
csv_path = r"datasource/imdb_movies.csv"
df = pd.read_csv(
    csv_path,
    encoding='unicode_escape',
    low_memory=False,
)
df.sample(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
64804,tt2131532,Hidden: Senza via di scampo,Hidden,2015,2015-09-15,"Horror, Sci-Fi, Thriller",84,USA,English,"Matt Duffer, Ross Duffer","Matt Duffer, Ross Duffer",Primal Pictures (II),"Alexander Skarsgård, Andrea Riseborough, Emily...",A family takes refuge in a bomb shelter to avo...,6.4,19176,,,,,111.0,29.0
80615,tt6466058,Ask for Jane,Ask for Jane,2018,2019-05-17,"Drama, History",108,USA,,Rachel Carey,"Rachel Carey, Cait Cortelyou",Carolines Entertainment,"Cody Horn, Cait Cortelyou, Sarah Steele, Sarah...",A group of determined Midwestern women begin p...,5.4,165,,,,,11.0,2.0


### Set the display options to show all rows and all columns

In [52]:
# Lift pandas' display limits so no row or column is elided in rendered output.
# With the row limit removed, displaying a bare frame renders every row, so
# keep previews bounded with head()/sample().
pd.options.display.max_rows = None
pd.options.display.max_columns = None
df.head(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0


### Analyze the dataset
- Check the data types
- Check the number of records and number of columns

In [53]:
# Read the frame's dimensions (row count first, column count second) and report them.
rows = df.shape[0]
columns = df.shape[1]
print(f"Total Rows present are : {rows}, and total Columns are : {columns}")

Total Rows present are : 85855, and total Columns are : 22


In [54]:
# Summarize every column's dtype and non-null count to spot missing values
# and columns stored with an unexpected type.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          85855 non-null  object 
 1   title                  85855 non-null  object 
 2   original_title         85855 non-null  object 
 3   year                   85855 non-null  object 
 4   date_published         85855 non-null  object 
 5   genre                  85855 non-null  object 
 6   duration               85855 non-null  int64  
 7   country                85791 non-null  object 
 8   language               84954 non-null  object 
 9   director               85768 non-null  object 
 10  writer                 84283 non-null  object 
 11  production_company     81400 non-null  object 
 12  actors                 85786 non-null  object 
 13  description            83740 non-null  object 
 14  avg_vote               85855 non-null  float64
 15  vo

# Data Cleaning Process

### Creating copy of the data

In [55]:
# Clean a copy so the raw frame `df` stays available, unmodified, for reference.
mv = df.copy()
# Preview two random rows of the working copy.
mv.sample(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
75947,tt4849176,Sneekweek,Sneekweek,2016,2016-01-21,"Comedy, Thriller",108,Netherlands,Dutch,Martijn Heijne,Alex van Galen,Farmhouse Film & TV,"Carolien Spoor, Jelle de Jong, Jord Knotter, H...","Six friends party for a week during the ""Sneek...",4.7,1084,,,$ 597149,,4.0,19.0
82955,tt7635036,Love Immortal,Love Immortal,2019,2019-07-19,Horror,104,USA,,Robert Joseph Butler,"Robert Joseph Butler, Alexis Irvine",Nu Wave Films,"Aphrodite Nikolovski, Richard Tyson, Jordan Tr...",Follows three generations of vampire women tha...,7.4,111,,,,,5.0,


### Column: [imdb_title_id]

In [56]:
# List the distinct identifier values to see their format before cleaning.
mv['imdb_title_id'].unique()

array(['tt0000009', 'tt0000574', 'tt0001892', ..., 'tt9911774',
       'tt9914286', 'tt9914942'], dtype=object)

- We can see that all the values are prefixed with 'tt' which can be removed 
- Also the column name can be renamed to 'id'

In [57]:
# Remove the literal 'tt' prefix from every IMDb title id.
# `str.removeprefix` drops exactly one leading 'tt'. `str.strip('tt')` treats
# its argument as a *set of characters* and trims any run of 't' from both
# ends of the string, which only happens to work when ids end in a digit.
mv['imdb_title_id'] = mv['imdb_title_id'].str.removeprefix('tt')
# Spot-check ten random ids after the prefix removal.
mv['imdb_title_id'].sample(10)

40104    0283383
73694    4087850
82354    7335186
61585    1748122
26139    0102565
62438    1833676
67637    2518294
26410    0103763
65419    2210569
14654    0066245
Name: imdb_title_id, dtype: object

#### Rename the column 'imdb_title_id' to 'id'

In [58]:
# Give the identifier column the shorter name 'id' (was 'imdb_title_id'),
# then preview two random rows.
mv = mv.rename(columns={'imdb_title_id': 'id'})
mv.sample(2)

Unnamed: 0,id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
67807,2557478,Pacific Rim: La rivolta,Pacific Rim: Uprising,2018,2018-03-22,"Action, Adventure, Sci-Fi",111,"UK, China, Japan, USA","English, Mandarin, Russian",Steven S. DeKnight,"Steven S. DeKnight, Emily Carmichael",Legendary Entertainment,"John Boyega, Scott Eastwood, Cailee Spaeny, Bu...","Jake Pentecost, son of Stacker Pentecost, reun...",5.6,102915,$ 150000000,$ 59874525,$ 290930148,44.0,741.0,304.0
33680,160644,Passion of Mind,Passion of Mind,2000,2000-01-07,"Drama, Mystery, Romance",105,USA,"English, French",Alain Berliner,"Ronald Bass, David Field",Lakeshore Entertainment,"Eloise Eonnet, Hadrian Dagannaud-Brouard, Chay...",A psychological romantic thriller where fantas...,5.5,3127,,$ 769272,$ 769272,28.0,42.0,50.0


#### Convert the column 'id' to integer type

In [59]:
# Convert the column 'id' (digit strings at this point) to integer type.
# Leading zeros are not preserved by the cast (e.g. '0160644' becomes 160644).
mv['id'] = mv['id'].astype(int)
# Confirm the resulting dtype and non-null count.
mv['id'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 85855 entries, 0 to 85854
Series name: id
Non-Null Count  Dtype
--------------  -----
85855 non-null  int64
dtypes: int64(1)
memory usage: 670.9 KB


### Column: [title] and [original_title]

In [60]:
# Check the unique values of 'title' on the working copy `mv` — the frame that
# is actually being cleaned. At this point the column is still identical to
# the one in the raw `df`, so the output is unchanged.
mv['title'].unique()

array(['Miss Jerry', 'The Story of the Kelly Gang', 'Den sorte drøm', ...,
       'Padmavyuhathile Abhimanyu', 'Sokagin Çocuklari',
       'La vida sense la Sara Amat'], dtype=object)

- The 'title' column does not have any data anomalies and does not require cleaning

In [61]:
# Check the unique values of 'original_title' on the working copy `mv` — the
# frame that is actually being cleaned. At this point the column is still
# identical to the one in the raw `df`, so the output is unchanged.
mv['original_title'].unique()

array(['Miss Jerry', 'The Story of the Kelly Gang', 'Den sorte drøm', ...,
       'Padmavyuhathile Abhimanyu', 'Sokagin Çocuklari',
       'La vida sense la Sara Amat'], dtype=object)

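- The two columns look the same in this preview, but they are not identical: 'title' can hold a localized title (e.g. 'Hidden: Senza via di scampo') where 'original_title' holds the original one ('Hidden'). We keep 'original_title', so the column 'title' can be dropped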

#### Drop the column 'title'

In [62]:
# Remove the 'title' column from the working copy.
mv = mv.drop(columns='title')

#### Rename the column 'original_title' to 'title'

In [63]:
# Let 'original_title' take over the name 'title', then preview two random rows.
mv = mv.rename(columns={'original_title': 'title'})
mv.sample(2)

Unnamed: 0,id,title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
82187,7262206,Karuppan,2017,2017-09-29,"Action, Thriller",144,India,Tamil,"R. Panneerselvam, Sami Rajalingam",R. Panneerselvam,Shri Sai Raam Creations,"Kishore Kumar G., Kaveri, Sharath Lohitashwa, ...",Kathir (Bobby Simha) has an interest for Anbu ...,5.6,533,,,$ 66101,,6.0,5.0
79949,6206564,Trapped,2016,2017-03-17,"Drama, Thriller",105,India,Hindi,Vikramaditya Motwane,"Amit Joshi, Hardik Mehta",Phantom Films,"Rajkummar Rao, Geetanjali Thapa, Shiladitya Se...",A man struggles to survive after he unintentio...,7.5,9889,,,$ 38349,,73.0,24.0


### Column: [year] and [date_published]
- We can drop the 'year' column
- from the 'date_published' column we will extract the year and further rename the column to 'year'

In [64]:
# Remove the existing 'year' column from the working copy.
mv = mv.drop(columns='year')

#### Extract year from the column 'date_published'

In [65]:
# Keep only the first four characters of 'date_published' (the year part of
# 'YYYY-MM-DD' strings). This is a plain string slice, not a datetime
# conversion: the values remain strings (object dtype).
mv['date_published'] = mv['date_published'].str[:4]

In [66]:
# Spot-check five random values after the slice.
mv['date_published'].sample(5)

49884    2006
19722    1980
28656    1996
21448    1984
49923    2007
Name: date_published, dtype: object

#### Rename the column 'date_published' to 'year'

In [67]:
# 'date_published' now holds only the four-character year, so name it 'year'
# and preview two random rows.
mv = mv.rename(columns={'date_published': 'year'})
mv.sample(2)

Unnamed: 0,id,title,year,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
55697,1228999,Prathigna,1982,"Comedy, Family",136,India,Telugu,Boyani Subbarao,,Sree Lakshmi Prasanna Pictures,"Mohan Babu, Kavitha",,1.1,142,,,,,,
75205,4619908,Last Ones Out,2015,"Drama, Horror, Thriller",76,South Africa,English,Howard Fyvie,Howard Fyvie,,"Greg Kriek, Christia Visser, Tshamano Sebe, Vu...",A sceptical American man is stranded in the mi...,4.4,199,,,,,7.0,3.0


### Column: [genre]

In [68]:
# Look at ten random 'genre' values to see how multiple genres are encoded.
mv['genre'].sample(10)

7229     Action, Adventure, Romance
4076                Comedy, Mystery
69422                        Comedy
9454                          Drama
72340                        Action
1727               Adventure, Drama
9795                   Drama, Crime
8237        Crime, Drama, Film-Noir
53893                         Drama
43726                Horror, Sci-Fi
Name: genre, dtype: object

#### The 'genre' column contains multiple genres for a single movie, but note that:
- each movie has a maximum of 3 genres
- we can split these genres into three columns: 'genre1', 'genre2', and 'genre3'
- the split values are assigned to the columns 'genre1', 'genre2', and 'genre3' respectively

In [69]:
# Create a temporary frame `genre_split` with one column per genre.
# The raw values separate genres with ', ' (comma followed by a space), so the
# separator pattern absorbs the whitespace around each comma. Splitting on a
# bare ',' would leave a leading space on every genre after the first
# (' Drama' instead of 'Drama'), which breaks later grouping and comparisons.
genre_split = mv['genre'].str.split(r'\s*,\s*', regex=True, expand=True)

In [70]:
# Name the split columns 'genre1', 'genre2', 'genre3' (one per genre slot).
# The assignment raises if `genre_split` does not have exactly three columns.
genre_cols = ['genre1', 'genre2', 'genre3']
genre_split.columns = genre_cols

In [71]:
# Place the split genre columns immediately to the right of 'genre',
# keeping them in genre1 -> genre3 order.
first_slot = mv.columns.get_loc('genre') + 1
for offset, col_name in enumerate(genre_cols):
    mv.insert(loc=first_slot + offset, column=col_name, value=genre_split.iloc[:, offset])
# Preview ten random rows of the newly inserted columns.
mv[genre_cols].sample(10)

Unnamed: 0,genre1,genre2,genre3
62767,Comedy,,
48197,Comedy,,
67983,Comedy,Drama,Romance
54151,History,,
49288,Horror,Thriller,
4732,Mystery,Romance,
57376,Adventure,Drama,
55995,Action,Drama,Thriller
15503,Horror,Mystery,
45651,Action,Crime,Drama


In [72]:
# Remove the combined 'genre' column from the working copy, then preview two random rows.
mv = mv.drop(columns='genre')
mv.sample(2)

Unnamed: 0,id,title,year,genre1,genre2,genre3,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
66585,2357208,Run Baby Run,2012,Action,Comedy,Thriller,180,India,"Malayalam, Telugu","Joshiy, Joshiy",Sachy,Galaxy Films,"Mohanlal, Amala Paul, Biju Menon, Saikumar, Si...",A reporter and a cameraman become try to uncov...,6.6,1501,,,,,6.0,1.0
73397,4004084,Antisocial 2,2018,Horror,,,90,Canada,English,Cody Calahan,"Chad Archibald, Cody Calahan",Black Fawn Films,"Michelle Mylett, Stephen Bogaert, Josette Halp...",Years after having her newborn child stolen fr...,4.5,364,,,$ 11924,,5.0,8.0
