# IMDB Movie Dataset Cleaning

### Import Pandas Library

In [155]:
import pandas as pd

### Read the data file

In [156]:
# Load the raw IMDB movies CSV into `df`.
# low_memory=False turns off chunked parsing, which can otherwise yield
# mixed-type columns.
df = pd.read_csv(
    r"datasource/imdb_movies.csv",
    encoding='unicode_escape',
    low_memory=False,
)
# Preview two randomly chosen rows
df.sample(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
31500,tt0122571,De komst van Joachim Stiller,De komst van Joachim Stiller,1976,1976-11-29,Mystery,120,"Netherlands, Belgium","Dutch, German, English",Harry Kümel,"Jean Ferry, Harry Kümel",Algemene Vereniging Radio Omroep (AVRO),"Hugo Metsers, Cox Habbema, Willeke van Ammelro...",Freek and Simone's lives are taken over by the...,7.6,306,,,,,5.0,4.0
20107,tt0082958,Caccia implacabile,The Pursuit of D.B. Cooper,1981,1981-11-13,"Adventure, Crime, Thriller",100,USA,English,"Roger Spottiswoode, Buzz Kulik","J.D. Reed, Jeffrey Alan Fiskin",PolyGram Pictures,"Robert Duvall, Treat Williams, Kathryn Harrold...",A speculation on the fate of the famous hijack...,5.6,1027,$ 12000000,$ 3702028,$ 3702028,,13.0,5.0


### Configure pandas to display all columns and rows

In [157]:
# Lift pandas' display limits so that neither rows nor columns are truncated
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)
# First two rows, now shown with every column
df.head(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0


### Analyze the dataset
- Check the data types
- Check the number of records and number of columns

In [158]:
# Row and column counts of the raw frame
rows, columns = len(df.index), len(df.columns)
print(f"Total Rows present are : {rows}, and total Columns are : {columns}")

Total Rows present are : 85855, and total Columns are : 22


In [159]:
# Summarise the raw frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          85855 non-null  object 
 1   title                  85855 non-null  object 
 2   original_title         85855 non-null  object 
 3   year                   85855 non-null  object 
 4   date_published         85855 non-null  object 
 5   genre                  85855 non-null  object 
 6   duration               85855 non-null  int64  
 7   country                85791 non-null  object 
 8   language               84954 non-null  object 
 9   director               85768 non-null  object 
 10  writer                 84283 non-null  object 
 11  production_company     81400 non-null  object 
 12  actors                 85786 non-null  object 
 13  description            83740 non-null  object 
 14  avg_vote               85855 non-null  float64
 15  vo

# Data Cleaning Process

### Creating copy of the data

In [160]:
# Clean an independent deep copy so the raw frame `df` stays untouched
mv = df.copy(deep=True)
# Preview two randomly chosen rows of the working copy
mv.sample(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
47069,tt0446309,Death by Engagement,Death by Engagement,2005,2007-10-02,"Horror, Comedy",99,USA,English,Philip Creager,Philip Creager,Rounding 3rd Productions,"Sascha Knopf, Aaron McPherson, P.J. Soles, Iya...",A cursed ring bounces through an ensemble of d...,4.0,188,$ 800000,,,,6.0,9.0
54430,tt1134664,Tradire è un'arte - Boogie Woogie,Boogie Woogie,2009,2017-09-29,"Comedy, Drama",94,UK,English,Duncan Ward,"Danny Moynihan, Danny Moynihan",The Works International,"Gillian Anderson, Alan Cumming, Heather Graham...",A comedy of manners set against the backdrop o...,5.2,3530,$ 6000000,$ 2618,$ 48411,,28.0,30.0


### Column: [imdb_title_id]

In [161]:
# Inspect the distinct id values to spot formatting patterns
mv['imdb_title_id'].unique()

array(['tt0000009', 'tt0000574', 'tt0001892', ..., 'tt9911774',
       'tt9914286', 'tt9914942'], dtype=object)

- We can see that all the values are prefixed with 'tt' which can be removed 
- Also the column name can be renamed to 'id'

In [162]:
# Remove the leading 'tt' from every imdb_title_id.
# str.strip('tt') treats its argument as a set of characters and trims them
# from BOTH ends of the string; removeprefix drops only the literal prefix,
# which is what is intended here.
mv['imdb_title_id'] = mv['imdb_title_id'].str.removeprefix('tt')
# Spot-check ten random ids
mv['imdb_title_id'].sample(10)

33321    0154898
37147    0224156
13713    0063568
38202    0248926
40230    0285300
68080    2620490
48296    0473001
83255    7806430
80082    6237314
54814    1160539
Name: imdb_title_id, dtype: object

#### Rename the column 'imdb_title_id' to 'id'

In [163]:
# Give the identifier column the shorter name 'id'
mv = mv.rename(columns={'imdb_title_id':'id'})
# Preview two random rows to confirm the new column name
mv.sample(2)

Unnamed: 0,id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
72069,3651326,All'ombra delle donne,L'ombre des femmes,2015,2015-05-27,"Drama, Romance",73,"France, Switzerland",French,Philippe Garrel,"Jean-Claude Carrière, Caroline Deruas-Garrel",SBS Productions,"Clotilde Courau, Stanislas Merhar, Lena Paugam...",Pierre and Manon are a pair of poor documentar...,6.5,1304,,$ 50291,$ 152802,74.0,6.0,61.0
17016,73029,Galileo,Galileo,1975,1976-05-27,"Biography, Drama",145,"UK, USA",English,Joseph Losey,"Barbara Bray, Bertolt Brecht",Cinévision Ltée,"Topol, Edward Fox, Colin Blakely, Georgia Brow...","This biopic is about Galileo Galilei, the seve...",6.6,481,,,,,8.0,9.0


#### Convert the column 'id' (formerly 'imdb_title_id') to integer type

In [164]:
# Cast the 'id' column from string to integer.
# Note: leading zeros in the id strings are not preserved by an integer dtype.
mv = mv.astype({'id': int})
# Confirm the resulting dtype and non-null count
mv['id'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 85855 entries, 0 to 85854
Series name: id
Non-Null Count  Dtype
--------------  -----
85855 non-null  int64
dtypes: int64(1)
memory usage: 670.9 KB


### Column: [title] and [original_title]

In [165]:
# Check the unique values of 'title'.
# Read from the working copy `mv` (not the raw `df`) so that every cell in the
# cleaning section inspects the frame that is actually being cleaned.
mv['title'].unique()

array(['Miss Jerry', 'The Story of the Kelly Gang', 'Den sorte drøm', ...,
       'Padmavyuhathile Abhimanyu', 'Sokagin Çocuklari',
       'La vida sense la Sara Amat'], dtype=object)

- The 'title' column does not have any data anomalies and does not require cleaning

In [166]:
# Check the unique values in 'original_title'.
# Read from the working copy `mv` (not the raw `df`) so that every cell in the
# cleaning section inspects the frame that is actually being cleaned.
mv['original_title'].unique()

array(['Miss Jerry', 'The Story of the Kelly Gang', 'Den sorte drøm', ...,
       'Padmavyuhathile Abhimanyu', 'Sokagin Çocuklari',
       'La vida sense la Sara Amat'], dtype=object)

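- The first and last unique values of both columns look the same, but the columns are not identical in every row (e.g. row 20107 in the sample above has title 'Caccia implacabile' and original_title 'The Pursuit of D.B. Cooper'). We keep 'original_title', so the column 'title' can be dropped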

#### Drop the column 'title'

In [167]:
# Remove 'title'; 'original_title' is the column that is kept
mv = mv.drop(columns=['title'])

#### Rename the column 'original_title' to 'title'

In [168]:
# 'original_title' takes over the name 'title'
mv = mv.rename(columns={'original_title':'title'})
# Preview two random rows to confirm the new layout
mv.sample(2)

Unnamed: 0,id,title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
17501,74633,Hollywood Boulevard,1976,1976-04-25,"Comedy, Thriller",83,USA,English,"Allan Arkush, Joe Dante",Danny Opatoshu,New World Pictures,"Mary Woronov, Paul Bartel, George Wagner, Jona...","In this satire on 70s B-movie industry, a youn...",6.0,1016,$ 60000,,,,27.0,31.0
84244,8426594,The Wind,2018,2019-04-05,"Horror, Western",86,USA,English,Emma Tammi,Teresa Sutherland,Soapbox Films,"Caitlin Gerard, Julia Goldani Telles, Ashley Z...",A plains-woman faces the harshness and isolati...,5.5,5543,,$ 28252,$ 130974,66.0,95.0,87.0


### Column: [year] and [date_published]
- We can drop the 'year' column
- from the 'date_published' column we will extract the year and further rename the column to 'year'

In [169]:
# Remove the original 'year' column
mv = mv.drop(columns=['year'])

#### Extract year from the column 'date_published'

In [170]:
# Keep only the four-digit year from 'date_published' (values stay strings).
# A regex is used instead of slicing the first four characters so that an
# entry which does not begin with the year is not turned into garbage text.
# NOTE(review): the raw 'year' column loads as object dtype, which suggests
# some non-numeric date entries exist - confirm against the data.
# Entries that contain no four-digit run become NaN.
mv['date_published'] = mv['date_published'].str.extract(r'(\d{4})', expand=False)

In [171]:
# Spot-check five random values after the year extraction
mv['date_published'].sample(5)

46455    2004
13632    1968
73089    2016
8784     1956
8877     1956
Name: date_published, dtype: object

#### Rename the column 'date_published' to 'year'

In [172]:
# 'date_published' now holds only the year, so name it accordingly
mv = mv.rename(columns={'date_published': 'year'})
# Preview two random rows to confirm the new column name
mv.sample(2)

Unnamed: 0,id,title,year,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
26966,105626,Toutes peines confondues,1992,"Crime, Drama, Thriller",107,France,French,Michel Deville,"Andrew Coburn, Rosalinde Deville",Eléfilm,"Patrick Bruel, Jacques Dutronc, Mathilda May, ...",A young police inspector is sent to Zurich to ...,6.3,281,,,,,2.0,
37353,230597,Oru Vadakkan Veeragatha,1989,"Drama, History",168,India,Malayalam,T. Hariharan,M.T. Vasudevan Nair,Grahalakshmi Productions,"Mammootty, Madhavi, Devan, Suresh Gopi, Balan ...",The film offers an alternative version of the ...,8.7,4112,INR 9800000,,,,13.0,2.0


### Column: [genre]

In [173]:
# Look at ten random 'genre' values to see how multiple genres are stored
mv['genre'].sample(10)

42109                        Drama
36330                        Drama
31854                       Comedy
49590     Adventure, Comedy, Drama
57541       Action, Comedy, Sci-Fi
12277               Drama, Romance
49120                Comedy, Drama
55971     Crime, Mystery, Thriller
60542                Action, Drama
34629    Adventure, Comedy, Family
Name: genre, dtype: object

#### Genre column contains multiple genres for a single movie, but it must be noted that:
- for each movie we have a maximum of 3 genres
- we can split these multiple genres into three columns 'Genre 1' , 'Genre 2', and 'Genre 3'
- The split values can be assigned to the three columns 'Genre 1', 'Genre 2', and 'Genre 3' respectively

In [174]:
# Create a temporary frame 'genre_split' holding one genre per column.
# The raw values are separated by ', ' (comma + space), so splitting on ','
# alone leaves a leading space on the 2nd and 3rd genres (' Crime').
# Each resulting column is therefore stripped of surrounding whitespace.
genre_split = (
    mv['genre']
    .str.split(',', expand=True)
    .apply(lambda part: part.str.strip())
)

In [175]:
# Label the split columns 'genre1', 'genre2', 'genre3'
genre_cols = ['genre1', 'genre2', 'genre3']
genre_split = genre_split.set_axis(genre_cols, axis=1)

In [176]:
# Insert the split genre columns immediately to the right of 'genre'.
# The position of 'genre' does not move while columns are inserted after it,
# so it is looked up once.
first_slot = mv.columns.get_loc('genre') + 1
for offset, col_name in enumerate(genre_cols):
    mv.insert(loc=first_slot + offset, column=col_name, value=genre_split.iloc[:, offset])
# Spot-check ten random rows of the new columns
mv[['genre1','genre2','genre3']].sample(10)

Unnamed: 0,genre1,genre2,genre3
56942,Comedy,Romance,
55860,Action,Horror,Thriller
55736,Romance,,
78568,Comedy,Romance,
35523,Action,Adventure,Drama
11650,Comedy,Music,
72721,Drama,,
23742,Adventure,Comedy,Romance
23317,Drama,Romance,
44662,Drama,,


In [177]:
# The combined 'genre' column is replaced by genre1..genre3, so remove it
mv = mv.drop(columns=['genre'])
# Preview two random rows to confirm the new layout
mv.sample(2)

Unnamed: 0,id,title,year,genre1,genre2,genre3,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
24220,96071,The Serpent and the Rainbow,1988,Fantasy,Horror,,98,USA,"English, French, Spanish",Wes Craven,"Wade Davis, Richard Maxwell",Universal Pictures,"Bill Pullman, Cathy Tyson, Zakes Mokae, Paul W...",An anthropologist goes to Haiti after hearing ...,6.5,21560,$ 7000000,$ 19595031,$ 19595031,64.0,99.0,121.0
45724,415489,American Crude,2008,Comedy,Drama,,96,USA,English,Craig Sheffer,"Jeff Winiski, Mike Diiorio",Kramer Management,"Raymond J. Barry, Aaron Brumfield, Gino Cabana...",The paths of several eccentric and troubled st...,3.7,1608,$ 10000000,,,,17.0,8.0


### Column: [duration]
- the duration column has the run time for the movie in minutes which we will convert to hours

In [178]:
# Check the data type of 'duration' before doing arithmetic on it
mv['duration'].dtype

dtype('int64')

In [179]:
# List the distinct run times (in minutes) to look for odd values
mv['duration'].unique()

array([ 45,  70,  53, 100,  68,  60,  85, 120,  55, 121,  54,  96,  61,
        90,  50,  88,  72,  78, 148,  52, 124,  59,  63,  84,  65,  81,
       199,  74,  80,  82,  67,  56, 195,  77,  71,  46, 421, 105,  57,
        58,  73,  64,  62, 163, 300, 116,  69, 125,  97, 138, 112,  91,
        87,  48,  83, 136,  75,  94, 117,  93,  76, 418,  86, 223,  99,
       122, 207, 166,  95,  92,  66, 106, 145, 167,  79, 107, 109, 104,
        89, 102, 150, 131,  47, 119,  98, 110, 143, 137, 128, 101, 183,
       133, 135, 111, 141, 115, 153, 129, 123, 108, 155, 151, 170, 140,
       113, 114, 250, 132, 118, 160, 240, 142, 103, 144,  49, 127, 156,
       130, 165, 147, 152, 226, 227, 231, 218, 200, 210, 225, 243,  51,
       281, 208, 306, 257, 245, 255, 328, 168, 269, 303, 290, 212, 224,
       126, 260, 299, 264, 149, 211, 302, 238, 258, 263, 215, 265, 205,
       237, 220, 285, 293, 267, 216, 241,  43, 134, 184, 261, 197, 244,
       146, 177, 196, 154, 189, 219, 201, 180, 159, 192, 139, 17

In [181]:
# Express run time in hours (minutes / 60), rounded to two decimals.
# Caution: this overwrites 'duration' in place, so running the cell a second
# time would divide by 60 again.
duration_hours = mv['duration'].div(60)
mv['duration'] = duration_hours.round(2)
# Distinct run times, now in hours
mv['duration'].unique()

array([ 0.75,  1.17,  0.88,  1.67,  1.13,  1.  ,  1.42,  2.  ,  0.92,
        2.02,  0.9 ,  1.6 ,  1.02,  1.5 ,  0.83,  1.47,  1.2 ,  1.3 ,
        2.47,  0.87,  2.07,  0.98,  1.05,  1.4 ,  1.08,  1.35,  3.32,
        1.23,  1.33,  1.37,  1.12,  0.93,  3.25,  1.28,  1.18,  0.77,
        7.02,  1.75,  0.95,  0.97,  1.22,  1.07,  1.03,  2.72,  5.  ,
        1.93,  1.15,  2.08,  1.62,  2.3 ,  1.87,  1.52,  1.45,  0.8 ,
        1.38,  2.27,  1.25,  1.57,  1.95,  1.55,  1.27,  6.97,  1.43,
        3.72,  1.65,  2.03,  3.45,  2.77,  1.58,  1.53,  1.1 ,  1.77,
        2.42,  2.78,  1.32,  1.78,  1.82,  1.73,  1.48,  1.7 ,  2.5 ,
        2.18,  0.78,  1.98,  1.63,  1.83,  2.38,  2.28,  2.13,  1.68,
        3.05,  2.22,  2.25,  1.85,  2.35,  1.92,  2.55,  2.15,  2.05,
        1.8 ,  2.58,  2.52,  2.83,  2.33,  1.88,  1.9 ,  4.17,  2.2 ,
        1.97,  2.67,  4.  ,  2.37,  1.72,  2.4 ,  0.82,  2.12,  2.6 ,
        2.17,  2.75,  2.45,  2.53,  3.77,  3.78,  3.85,  3.63,  3.33,
        3.5 ,  3.75,

#### Rename the column to duration(hours)

In [185]:
# Put the unit into the column name now that the values are in hours
mv = mv.rename(columns={'duration': 'duration(hours)'})
# Spot-check ten random values of the renamed column
mv[['duration(hours)']].sample(10)

Unnamed: 0,duration(hours)
81085,2.37
64744,1.67
67134,1.5
63350,1.4
32027,1.8
7746,1.45
82400,1.82
57538,1.65
53356,1.48
60713,1.43
