# IMDB Movie Dataset Cleaning

### Import Pandas Library

In [140]:
import pandas as pd

### Read the data file

In [141]:
# Load the raw IMDB movie export. low_memory=False asks pandas to parse the
# file in one pass rather than in chunks.
df = pd.read_csv(
    r"datasource/imdb_movies.csv",
    encoding='unicode_escape',
    low_memory=False,
)
# Preview two randomly selected rows.
df.sample(n=2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
80885,tt6596796,Mai mee Samui samrab ter,Mai mee Samui samrab ter,2017,2018-02-01,"Drama, Thriller",108,"Thailand, Germany, Norway","Thai, English",Pen-Ek Ratanaruang,Pen-Ek Ratanaruang,Augenschein Filmproduktion,"David Asavanond, Laila Boonyasak, Karen Gemma ...",A soap opera actress finds herself increasingl...,6.1,406,,,,,1.0,9.0
41813,tt0317171,Sud sanaeha,Sud sanaeha,2002,2002-10-09,"Drama, Romance",125,"Thailand, France",Thai,Apichatpong Weerasethakul,Apichatpong Weerasethakul,Anna Sanders Films,"Kanokporn Tongaram, Min Oo, Jenjira Pongpas, S...",The story of a love affair that begins during ...,7.0,2207,,,$ 18720,77.0,14.0,38.0


### Configure pandas to display all rows and columns

In [142]:
# Lift pandas' display limits so that neither rows nor columns are truncated
# when a frame is rendered.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)
df.head(n=2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0


### Analyze the dataset
- Check the data types
- Check the number of records and number of columns

In [143]:
# Count the records (rows) and fields (columns) of the raw frame and report
# both in a single line.
rows = len(df.index)
columns = len(df.columns)
print(f"Total Rows present are : {rows}, and total Columns are : {columns}")

Total Rows present are : 85855, and total Columns are : 22


In [144]:
# Print a per-column summary: column name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          85855 non-null  object 
 1   title                  85855 non-null  object 
 2   original_title         85855 non-null  object 
 3   year                   85855 non-null  object 
 4   date_published         85855 non-null  object 
 5   genre                  85855 non-null  object 
 6   duration               85855 non-null  int64  
 7   country                85791 non-null  object 
 8   language               84954 non-null  object 
 9   director               85768 non-null  object 
 10  writer                 84283 non-null  object 
 11  production_company     81400 non-null  object 
 12  actors                 85786 non-null  object 
 13  description            83740 non-null  object 
 14  avg_vote               85855 non-null  float64
 15  vo

# Data Cleaning Process

### Creating copy of the data

In [145]:
# Clean a separate copy ('mv') so the raw frame 'df' stays untouched and can
# still be inspected later.
mv = df.copy(deep=True)
mv.sample(n=2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
14384,tt0065517,O Cangaceiro,O Cangaceiro,1969,1970-06-23,Western,104,"Italy, Spain",Italian,Giovanni Fago,"Bernardino Zapponi, José Luis Jerez Aloza",Medusa Distribuzione,"Tomas Milian, Ugo Pagliai, Eduardo Fajardo, Ho...","Espidito, survives the massacre of his entire ...",6.3,172,,,,,2.0,5.0
66225,tt2316204,Alien: Covenant,Alien: Covenant,2017,2017-05-11,"Horror, Sci-Fi, Thriller",122,"USA, UK",English,Ridley Scott,"Dan O'Bannon, Ronald Shusett",Twentieth Century Fox,"Michael Fassbender, Katherine Waterston, Billy...","The crew of a colony ship, bound for a remote ...",6.4,248096,$ 97000000,$ 74262031,$ 240891763,65.0,2069.0,586.0


### Column: [imdb_title_id]

In [146]:
# List the distinct identifier values to see how they are formatted.
mv.loc[:, 'imdb_title_id'].unique()

array(['tt0000009', 'tt0000574', 'tt0001892', ..., 'tt9911774',
       'tt9914286', 'tt9914942'], dtype=object)

- We can see that all the values are prefixed with 'tt' which can be removed 
- Also the column name can be renamed to 'id'

In [147]:
# Remove the leading 'tt' prefix from imdb_title_id.
# str.strip('tt') treats its argument as a *set of characters* and trims them
# from both ends of the string; removeprefix drops only the literal leading
# 'tt', which is what is intended here. It is also safe to re-run: a value
# without the prefix is left unchanged.
mv['imdb_title_id'] = mv['imdb_title_id'].str.removeprefix('tt')
mv[['imdb_title_id']].sample(10)

Unnamed: 0,imdb_title_id
5802,40281
10716,54934
26042,102266
52200,997084
15614,68904
25433,100142
73059,3894340
75045,4557014
32662,139745
77661,5439812


#### Rename the column 'imdb_title_id' to 'id'

In [148]:
# Give the identifier column the shorter name 'id'.
mv = mv.rename(columns={'imdb_title_id': 'id'})
mv.sample(n=2)

Unnamed: 0,id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
77524,5345806,"El que busca, encuentra","El que busca, encuentra",2017,2017-02-24,"Comedy, Romance",93,Mexico,Spanish,Pitipol Ybarra,Miguel Burra,All About Media,"Ana Brenda Contreras, Claudio Lafarga, Esmeral...",Esperanza and Marcos are two persons who belie...,5.2,116,MXN 25000000,,$ 1149667,,,
81266,6819126,Paradise,Paradise,2016,2019-01-23,"Comedy, Drama",80,"Iran, Germany",Persian,Ali Atshani,"Mahdi Alimirzaee, Ali Atshani",American Brightlight Film Productions,"Javad Ezati, Mehran Rajabi, Mohammad Ali Najaf...",Two young theologues are going to participate ...,4.0,339,$ 400000,,,,1.0,4.0


#### Convert the column 'id' (formerly 'imdb_title_id') to integer type

In [149]:
# Cast the 'id' column (the former 'imdb_title_id') to an integer dtype.
# Note that any leading zeros in the identifier are not preserved once the
# value is an integer.
id_as_int = mv['id'].astype(int)
mv['id'] = id_as_int
# Confirm the resulting dtype and non-null count.
mv['id'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 85855 entries, 0 to 85854
Series name: id
Non-Null Count  Dtype
--------------  -----
85855 non-null  int64
dtypes: int64(1)
memory usage: 670.9 KB


### Column: [title] and [original_title]

In [150]:
# Inspect the distinct values of 'title' (read from the raw frame 'df').
df.loc[:, 'title'].unique()

array(['Miss Jerry', 'The Story of the Kelly Gang', 'Den sorte drøm', ...,
       'Padmavyuhathile Abhimanyu', 'Sokagin Çocuklari',
       'La vida sense la Sara Amat'], dtype=object)

- The 'title' column does not show any data anomalies and does not require cleaning

In [151]:
# Inspect the distinct values of 'original_title' (read from the raw frame
# 'df') for comparison with 'title'.
df.loc[:, 'original_title'].unique()

array(['Miss Jerry', 'The Story of the Kelly Gang', 'Den sorte drøm', ...,
       'Padmavyuhathile Abhimanyu', 'Sokagin Çocuklari',
       'La vida sense la Sara Amat'], dtype=object)

- Both columns appear to be the same (based on the truncated previews above), so the column 'title' can be dropped

#### Drop the column 'title'

In [152]:
# Remove the 'title' column; 'original_title' is kept in its place.
# This cell is meant to run exactly once: after 'original_title' has been
# renamed to 'title', running it again would drop that renamed column.
mv = mv.drop(columns='title')

#### Rename the column 'original_title' to 'title'

In [153]:
# 'original_title' takes over the name 'title'.
title_mapping = {'original_title': 'title'}
mv = mv.rename(columns=title_mapping)
mv.loc[:, ['title']].sample(n=10)

Unnamed: 0,title
46575,Bell Witch Haunting
76548,Dismissed
14340,The AristoCats
27720,Tikhie stranitsy
67821,Bordering on Bad Behavior
52123,A Love Story
32308,Duets
78635,Velainu Vandhutta Vellaikaaran
57226,Tung moon
24744,Malarek


### Column: [year] and [date_published]
- We can drop the 'year' column
- from the 'date_published' column we will extract the year and further rename the column to 'year'

In [154]:
# Discard the original 'year' column; the year is re-derived from
# 'date_published' in the following cells.
mv = mv.drop(columns='year')

#### Extract year from the column 'date_published'

In [155]:
# Reduce 'date_published' to its four-digit year, kept as a string (no
# datetime conversion happens here).
# A regex is used instead of a fixed [:4] slice so that a value which does not
# begin with the year still yields its year, and a value containing no
# four-digit number becomes NaN rather than four arbitrary characters.
# For values shaped like 'YYYY-MM-DD' the result is identical to slicing.
mv['date_published'] = mv['date_published'].str.extract(r'(\d{4})', expand=False)

In [156]:
# Spot-check five random rows of the reduced 'date_published' column.
mv.loc[:, ['date_published']].sample(n=5)

Unnamed: 0,date_published
59107,2009
381,1924
40106,2001
28543,1995
41027,2001


#### Rename the column 'date_published' to 'year'

In [157]:
# The column now holds only the year, so name it accordingly.
mv = mv.rename(columns={'date_published': 'year'})
mv.sample(n=2)

Unnamed: 0,id,title,year,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
30492,118887,Cop Land,1997,"Crime, Drama, Thriller",105,USA,English,James Mangold,James Mangold,Miramax,"Sylvester Stallone, Harvey Keitel, Ray Liotta,...",The Sheriff of a suburban New Jersey community...,6.9,84547,$ 15000000,$ 44862187,$ 44862187,64.0,242.0,145.0
21735,88307,Tutti dentro,1984,"Comedy, Drama",108,Italy,Italian,Alberto Sordi,"Augusto Caminito, Rodolfo Sonego",Scena Film,"Alberto Sordi, Joe Pesci, Dalila Di Lazzaro, G...","The incorruptible judge Annibale Salvemini, st...",5.8,229,,,,,2.0,2.0


### Column: [genre]

In [158]:
# Look at ten random 'genre' values to see how they are formatted.
mv.loc[:, ['genre']].sample(n=10)

Unnamed: 0,genre
16924,"Comedy, Crime, Drama"
20998,"Comedy, Drama"
50765,Comedy
30451,"Adventure, Drama, Fantasy"
2051,"Comedy, Musical, Romance"
72594,"Drama, Mystery"
77725,"Action, Drama, Thriller"
66827,"Drama, Romance"
71390,Horror
73438,Action


#### Genre column contains multiple genres for a single movie, but it must be noted that:
- for each movie we have a maximum of 3 genres
- we can split these multiple genres into three columns 'genre1', 'genre2', and 'genre3'
- The split values can be assigned to the three columns 'genre1', 'genre2', and 'genre3' respectively

In [159]:
# Create a temp df 'genre_split' holding one genre per column.
# The raw values are separated by a comma followed by a space, e.g.
# "Comedy, Crime, Drama". Splitting on ',' alone would leave a leading space
# on every genre after the first (' Crime', ' Drama'), so the separator
# pattern also consumes any whitespace that follows the comma.
genre_split = mv['genre'].str.split(pat=r',\s*', regex=True, expand=True)

In [160]:
# Label the split columns genre1, genre2, genre3.
genre_cols = [f'genre{position}' for position in (1, 2, 3)]
genre_split.columns = genre_cols

In [161]:
# Insert the split genre columns into 'mv' immediately to the right of
# 'genre', keeping their order (genre1, genre2, genre3).
for offset, col_name in enumerate(genre_cols, start=1):
    insert_at = mv.columns.get_loc('genre') + offset
    mv.insert(loc=insert_at, column=col_name, value=genre_split.iloc[:, offset - 1])
mv[['genre1', 'genre2', 'genre3']].sample(n=10)

Unnamed: 0,genre1,genre2,genre3
60600,Drama,,
67674,Comedy,Drama,Family
36864,Drama,,
26894,Crime,Drama,Mystery
33310,Thriller,,
35845,Comedy,,
13633,Comedy,,
9349,Drama,,
54544,Comedy,Crime,Drama
18821,Crime,Drama,


In [162]:
# The combined 'genre' column is no longer needed once genre1..genre3 exist.
mv = mv.drop(columns='genre')
mv.sample(n=2)

Unnamed: 0,id,title,year,genre1,genre2,genre3,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
81137,6749072,El peso de la ley,2017,Crime,Drama,Mystery,100,Argentina,Spanish,Fernán Mirás,"Roberto Gispert, Fernán Mirás",Aleph Media,"Paola Barrientos, María Onetto, Daniel Lambert...",Gloria is a lawyer who throughout her career h...,6.9,132,,,$ 114080,,1.0,
82736,7526338,Damat Takimi,2017,Comedy,Romance,,109,Turkey,Turkish,Doga Can Anafarta,Doga Can Anafarta,Asteros Film,"Furkan Andic, Özgürcan Cevik, Pelin Akil, Açel...","Yigit, Omer, Serdar, Onat and Can are five fri...",5.5,526,,,$ 595949,,1.0,1.0


### Column: [duration]
- the duration column has the run time for the movie in minutes which we will convert to hours

In [163]:
# Look up the dtype of the 'duration' column.
mv.dtypes['duration']

dtype('int64')

In [164]:
# List the distinct run times (in minutes) present in the data.
mv.loc[:, 'duration'].unique()

array([ 45,  70,  53, 100,  68,  60,  85, 120,  55, 121,  54,  96,  61,
        90,  50,  88,  72,  78, 148,  52, 124,  59,  63,  84,  65,  81,
       199,  74,  80,  82,  67,  56, 195,  77,  71,  46, 421, 105,  57,
        58,  73,  64,  62, 163, 300, 116,  69, 125,  97, 138, 112,  91,
        87,  48,  83, 136,  75,  94, 117,  93,  76, 418,  86, 223,  99,
       122, 207, 166,  95,  92,  66, 106, 145, 167,  79, 107, 109, 104,
        89, 102, 150, 131,  47, 119,  98, 110, 143, 137, 128, 101, 183,
       133, 135, 111, 141, 115, 153, 129, 123, 108, 155, 151, 170, 140,
       113, 114, 250, 132, 118, 160, 240, 142, 103, 144,  49, 127, 156,
       130, 165, 147, 152, 226, 227, 231, 218, 200, 210, 225, 243,  51,
       281, 208, 306, 257, 245, 255, 328, 168, 269, 303, 290, 212, 224,
       126, 260, 299, 264, 149, 211, 302, 238, 258, 263, 215, 265, 205,
       237, 220, 285, 293, 267, 216, 241,  43, 134, 184, 261, 197, 244,
       146, 177, 196, 154, 189, 219, 201, 180, 159, 192, 139, 17

In [165]:
# Express the run time in hours (minutes / 60), rounded to two decimals.
# Run this cell only once: it overwrites 'duration', so a second run would
# divide the already-converted values by 60 again.
duration_hours = mv['duration'] / 60
mv['duration'] = duration_hours.round(decimals=2)
mv.loc[:, ['duration']].sample(n=5)

Unnamed: 0,duration
35707,1.38
66834,1.75
25444,2.15
60320,1.67
43838,1.62


#### Rename the column to duration(hours)

In [166]:
# Make the unit explicit in the column name.
mv = mv.rename(columns={'duration': 'duration(hours)'})
mv.loc[:, ['duration(hours)']].sample(n=10)

Unnamed: 0,duration(hours)
72266,1.52
3060,1.5
6753,1.25
81951,1.25
23178,1.38
78774,2.13
59935,2.1
52077,1.45
42919,2.43
52357,1.27


### Column: [Country]

In [167]:
# Sample ten 'country' values to see how they are formatted
# (the Series repr also shows the column's dtype).
mv.loc[:, 'country'].sample(n=10)

19981                            Czechoslovakia
72567                                       USA
72224                               Mexico, USA
73472                                        UK
29896                   Czech Republic, Germany
62082                                       USA
67757    UK, New Zealand, USA, India, Australia
29607                                Spain, USA
58782                                    Russia
31872                    France, Belgium, Spain
Name: country, dtype: object

In [173]:
# Keep only the first listed country for each movie: split the
# comma-separated value and take the leading element.
country_parts = mv['country'].str.split(',')
mv['country'] = country_parts.str[0]
mv.loc[:, ['country']].sample(n=10)

Unnamed: 0,country
63295,USA
11164,USA
70747,Bulgaria
22266,USA
10711,France
52952,USA
4054,USA
69081,UK
4485,UK
67175,Indonesia


### Column: [Language]

In [176]:
# List the distinct values of the 'language' column (missing values appear
# as nan).
mv.loc[:, 'language'].unique()

array([nan, 'English', 'Italian', ..., 'Persian, Urdu',
       'English, Swiss German, German',
       'English, Polish, Russian, German'], dtype=object)

In [181]:
# Replace missing languages with the placeholder 'Not Available'.
language_filled = mv['language'].fillna(value='Not Available')
mv['language'] = language_filled
mv.loc[:, ['language']].sample(n=10)

Unnamed: 0,language
49998,English
77217,French
1142,English
25033,English
12081,German
64085,Serbian
37803,Hindi
81041,Arabic
43217,"Norwegian, Swedish"
24268,"Irish, English"
