# IMDB Movie Dataset Cleaning

### Import Pandas Library

In [245]:
import pandas as pd

### Read the data file

In [246]:
df = pd.read_csv(r"datasource/imdb_movies.csv", encoding='unicode_escape', low_memory=False)
df.sample(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
34343,tt0170544,SF: Episode One,SF: Episode One,1998,1998-08-01,"Action, Adventure, Comedy",111,Japan,Japanese,Hiroyuki Nakano,"Hiroyuki Nakano, Hiroshi Saitô",,"Morio Kazama, Mitsuru Fukikoshi, Tomoyasu Hote...",A noble young samurai searches for a thief who...,7.2,2537,,,,,21.0,27.0
40301,tt0286516,Respiro,Respiro,2002,2002-05-22,Drama,95,"Italy, France","Italian, Sicilian",Emanuele Crialese,Emanuele Crialese,Fandango,"Valeria Golino, Vincenzo Amato, Francesco Casi...","On an impoverished Italian island, a free-spir...",7.0,4465,,$ 1072834,$ 7309845,65.0,47.0,69.0


### Set the column width to maximum to display all the columns

In [247]:
# Display all the columns and rows
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns',None)
df.head(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0


### Analyze the dataset
- Check the data types
- Check the number of records and number of columns

In [248]:
rows, columns = df.shape
print(f"Total Rows present are : {rows}, and total Columns are : {columns}")

Total Rows present are : 85855, and total Columns are : 22


In [249]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          85855 non-null  object 
 1   title                  85855 non-null  object 
 2   original_title         85855 non-null  object 
 3   year                   85855 non-null  object 
 4   date_published         85855 non-null  object 
 5   genre                  85855 non-null  object 
 6   duration               85855 non-null  int64  
 7   country                85791 non-null  object 
 8   language               84954 non-null  object 
 9   director               85768 non-null  object 
 10  writer                 84283 non-null  object 
 11  production_company     81400 non-null  object 
 12  actors                 85786 non-null  object 
 13  description            83740 non-null  object 
 14  avg_vote               85855 non-null  float64
 15  vo

# Data Cleaning Process

### Creating copy of the data

In [250]:
mv = df.copy()
mv.sample(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
17841,tt0075704,Audrey Rose,Audrey Rose,1977,1977-09-30,"Drama, Fantasy, Horror",113,USA,English,Robert Wise,"Frank De Felitta, Frank De Felitta",Sterobcar Productions,"Marsha Mason, Anthony Hopkins, John Beck, Susa...",A stranger attempts to convince a happily marr...,5.8,5330,,,,,85.0,56.0
81764,tt7060344,Ratsasan,Ratsasan,2018,2018-10-05,"Action, Crime, Thriller",170,India,Tamil,Ram Kumar,Ram Kumar,Axess Film Factory,"Vishnu Vishal, Radha Ravi, Sangili Murugan, Ni...",A Sub-Inspector sets out in pursuit of a myste...,8.7,19367,,,,,426.0,9.0


### Column: [imdb_title_id]

In [251]:
mv['imdb_title_id'].unique()

array(['tt0000009', 'tt0000574', 'tt0001892', ..., 'tt9911774',
       'tt9914286', 'tt9914942'], dtype=object)

- We can see that all the values are prefixed with 'tt' which can be removed 
- Also the column name can be renamed to 'id'

In [252]:
# Strip the 'tt' from the imdb_title_id
mv['imdb_title_id'] = mv['imdb_title_id'].str.strip('tt')
mv[['imdb_title_id']].sample(10)

Unnamed: 0,imdb_title_id
7143,44462
6101,41199
42372,330994
71781,3567368
45094,398808
70420,3230300
63456,1959409
72517,3750872
52247,10006270
21491,87571


#### Rename the column to 'imdb_title_id' to 'id'

In [253]:
# Rename column 'imdb_title_id' to 'id'
mv.rename(columns={'imdb_title_id':'id'}, inplace=True)
mv.sample(2)

Unnamed: 0,id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
12854,61069,Tatsu no ko Tarô,Tatsu no ko Tarô,1979,1979-03-17,"Adventure, Family, Fantasy",75,Japan,Japanese,"Kirio Urayama, Peter Fernandez","Miyoko Matsutani, Takashi Mitsui",Toei Animation,"Kazuo Kitamura, Sayuri Yoshinaga, Jun'ya Katô,...",A young boy has to make a voyage to a distant ...,7.5,744,,,,,9.0,7.0
65306,2195804,Lombard,Lombard,2013,2013-09-19,"Comedy, Crime",88,Ukraine,Russian,Lyubomir Levitskiy,Lyubomir Levitskiy,Kinofabrika Production,"Borys Abramov, Andrey Burym, Deni Dadaev, Vale...",Mark and Yasha (played by Denis Nekyforov and ...,6.7,136,$ 3000000,,,,,


#### Convert the column 'imdb_title_id' to integer type

In [254]:
# Convert the column 'imdb_title_id' to integer type
mv['id'] = mv['id'].astype(int)
mv['id'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 85855 entries, 0 to 85854
Series name: id
Non-Null Count  Dtype
--------------  -----
85855 non-null  int64
dtypes: int64(1)
memory usage: 670.9 KB


### Column: [title] and [original_title]

In [255]:
# Check the unique values
df['title'].unique()

array(['Miss Jerry', 'The Story of the Kelly Gang', 'Den sorte drøm', ...,
       'Padmavyuhathile Abhimanyu', 'Sokagin Çocuklari',
       'La vida sense la Sara Amat'], dtype=object)

- The 'title' column does not has any data anomaly and does not require cleaning

In [256]:
# Check the unique values in 'original_title'
df['original_title'].unique()

array(['Miss Jerry', 'The Story of the Kelly Gang', 'Den sorte drøm', ...,
       'Padmavyuhathile Abhimanyu', 'Sokagin Çocuklari',
       'La vida sense la Sara Amat'], dtype=object)

- Both the columns are same and the column 'title' can be dropped

#### Drop the column 'title'

In [257]:
# Drop the column title
mv.drop(columns=['title'],inplace=True)

#### Rename the column 'original_title' to 'title'

In [258]:
# Rename the column 'original_title' to 'title' 
mv.rename(columns={'original_title':'title'}, inplace=True)
mv[['title']].sample(10)

Unnamed: 0,title
31548,Assault of the Party Nerds 2: The Heavy Pettin...
3757,You Can't Fool Your Wife
46579,"Good Night, and Good Luck."
56471,I taket lyser stjärnorna
50691,Die Wilden Hühner und die Liebe
13984,"Goodbye, Columbus"
5661,Roses Are Red
45990,Sommersturm
32670,The Boys
51434,Los abrazos rotos


### Column: [year] and [date_published]
- We can drop the 'year' column
- from the 'date_published' column we will extract the year and further rename the column to 'year'

In [259]:
# Drop the column 'year'
mv.drop(columns=['year'],inplace=True)

#### Extract year from the column 'date_published'

In [260]:
# Convert the column 'published_year' to date_time
mv['date_published'] = mv['date_published'].str[:4]

In [261]:
mv[['date_published']].sample(5)

Unnamed: 0,date_published
58372,2009
8894,1956
38760,1967
22689,1987
9984,1959


#### Rename the column 'date_published' to 'year'

In [262]:
mv.rename(columns={'date_published': 'year'}, inplace=True)
mv.sample(2)

Unnamed: 0,id,title,year,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
33700,161010,The Trench,1999,"Drama, History, War",98,"France, UK",English,William Boyd,William Boyd,Arts Council of England,"Paul Nicholls, Daniel Craig, Julian Rhind-Tutt...",A story about a group of soldiers' last days b...,6.0,2496,,,,58.0,35.0,14.0
18418,77470,Dracula's Dog,1977,Horror,87,"USA, Italy",English,Albert Band,Frank Ray Perilli,EMI Television,"Michael Pataki, Jan Shutan, Libby Chase, John ...",Russian soldiers accidentally unleash the serv...,4.4,1136,,,,,40.0,38.0


### Column: [genre]

In [263]:
mv[['genre']].sample(10)

Unnamed: 0,genre
5622,"Adventure, Family, Mystery"
79127,Drama
45419,"Action, Drama, Sci-Fi"
8333,"Crime, Drama, Film-Noir"
40361,Drama
26707,"Drama, Thriller"
11152,"Adventure, Drama"
32574,Drama
73018,"Action, Drama, Thriller"
61068,"Comedy, Drama"


#### Genre column contains multiple genres for a single movie, but it must be noted that:
- for each movie we have a maximum of 3 genres
- we can split these multiple genres into three columns 'Genre 1' , 'Genre 2', and 'Genre 3'
- The splitted values can be assigned to the three columns 'Genre 1' , 'Genre 2', and 'Genre 3' respectively

In [264]:
# create a temp df 'genre_split' to store the splitted genres
genre_split  = mv['genre'].str.split(',', expand = True)

In [265]:
# Rename the columns to Genre1, Genre2, Genre3
genre_cols = ['genre1', 'genre2', 'genre3']
genre_split.columns = genre_cols

In [266]:
# Concatenate the split genre df to original df
for i in range(len(genre_cols)):
    mv.insert(loc=mv.columns.get_loc('genre') + 1 + i, column=genre_cols[i], value=genre_split.iloc[:, i])
mv[['genre1','genre2','genre3']].sample(10)

Unnamed: 0,genre1,genre2,genre3
81413,Drama,Family,
6142,Western,,
84913,Drama,History,
85270,Horror,,
20334,Comedy,Drama,
40173,Drama,Romance,
47019,Action,Drama,
83938,Comedy,,
53227,Action,Drama,Thriller
82304,Action,Adventure,Drama


In [267]:
# Drop the column genre 
mv.drop(columns=['genre'],inplace=True)
mv.sample(2)

Unnamed: 0,id,title,year,genre1,genre2,genre3,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
61421,1732571,Between Notes,2010,Comedy,Drama,Music,78,USA,English,Christopher Grissom,Matthew Austin Brown,BFM Creative,"Summer Ames, Shelby Cook, Brittany Joyner, Bra...",Between Notes is a modern day musical about a ...,4.7,107,,,,,5.0,
76749,5097398,Les 3 p'tits cochons 2,2016,Comedy,Romance,,102,Canada,French,Jean-François Pouliot,"Claude Lalonde, Pierre Lamothe",Christal Films,"Paul Doucet, Guillaume Lemay-Thivierge, Patric...",Rémy (Paul Doucet) is always between two plane...,6.4,304,,,$ 11414,,,2.0


### Column: [duration]
- the duration column has the run time for the movie in minutes which we will convert to hours

In [268]:
# Check the data type
mv['duration'].dtype

dtype('int64')

In [269]:
mv['duration'].unique()

array([ 45,  70,  53, 100,  68,  60,  85, 120,  55, 121,  54,  96,  61,
        90,  50,  88,  72,  78, 148,  52, 124,  59,  63,  84,  65,  81,
       199,  74,  80,  82,  67,  56, 195,  77,  71,  46, 421, 105,  57,
        58,  73,  64,  62, 163, 300, 116,  69, 125,  97, 138, 112,  91,
        87,  48,  83, 136,  75,  94, 117,  93,  76, 418,  86, 223,  99,
       122, 207, 166,  95,  92,  66, 106, 145, 167,  79, 107, 109, 104,
        89, 102, 150, 131,  47, 119,  98, 110, 143, 137, 128, 101, 183,
       133, 135, 111, 141, 115, 153, 129, 123, 108, 155, 151, 170, 140,
       113, 114, 250, 132, 118, 160, 240, 142, 103, 144,  49, 127, 156,
       130, 165, 147, 152, 226, 227, 231, 218, 200, 210, 225, 243,  51,
       281, 208, 306, 257, 245, 255, 328, 168, 269, 303, 290, 212, 224,
       126, 260, 299, 264, 149, 211, 302, 238, 258, 263, 215, 265, 205,
       237, 220, 285, 293, 267, 216, 241,  43, 134, 184, 261, 197, 244,
       146, 177, 196, 154, 189, 219, 201, 180, 159, 192, 139, 17

In [270]:
# Convert the minutes in the duration column to hours
mv['duration'] = (mv['duration']/60).round(2)
mv['duration'].unique()

array([ 0.75,  1.17,  0.88,  1.67,  1.13,  1.  ,  1.42,  2.  ,  0.92,
        2.02,  0.9 ,  1.6 ,  1.02,  1.5 ,  0.83,  1.47,  1.2 ,  1.3 ,
        2.47,  0.87,  2.07,  0.98,  1.05,  1.4 ,  1.08,  1.35,  3.32,
        1.23,  1.33,  1.37,  1.12,  0.93,  3.25,  1.28,  1.18,  0.77,
        7.02,  1.75,  0.95,  0.97,  1.22,  1.07,  1.03,  2.72,  5.  ,
        1.93,  1.15,  2.08,  1.62,  2.3 ,  1.87,  1.52,  1.45,  0.8 ,
        1.38,  2.27,  1.25,  1.57,  1.95,  1.55,  1.27,  6.97,  1.43,
        3.72,  1.65,  2.03,  3.45,  2.77,  1.58,  1.53,  1.1 ,  1.77,
        2.42,  2.78,  1.32,  1.78,  1.82,  1.73,  1.48,  1.7 ,  2.5 ,
        2.18,  0.78,  1.98,  1.63,  1.83,  2.38,  2.28,  2.13,  1.68,
        3.05,  2.22,  2.25,  1.85,  2.35,  1.92,  2.55,  2.15,  2.05,
        1.8 ,  2.58,  2.52,  2.83,  2.33,  1.88,  1.9 ,  4.17,  2.2 ,
        1.97,  2.67,  4.  ,  2.37,  1.72,  2.4 ,  0.82,  2.12,  2.6 ,
        2.17,  2.75,  2.45,  2.53,  3.77,  3.78,  3.85,  3.63,  3.33,
        3.5 ,  3.75,

#### Rename the column to duration(hours)

In [271]:
# Rename the column to duration(hours)
mv.rename(columns={'duration': 'duration(hours)'},inplace=True)
mv[['duration(hours)']].sample(10)

Unnamed: 0,duration(hours)
9380,1.35
56682,1.65
31941,1.5
18749,1.58
72608,1.65
27071,2.0
18852,1.77
19868,1.45
57683,1.0
8345,1.6
