# IMDB Movie Dataset Cleaning

### Import Pandas Library

In [55]:
import pandas as pd

### Read the data file

In [56]:
# Load the raw IMDB movies CSV into a DataFrame.
# low_memory=False turns off chunked parsing, so each column's dtype is inferred from the whole file.
df = pd.read_csv(
    r"datasource/imdb_movies.csv",
    encoding='unicode_escape',
    low_memory=False,
)
# Peek at two random rows
df.sample(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
75666,tt4764370,"Guillaume, la jeunesse du conquérant","Guillaume, la jeunesse du conquérant",2015,2017-05-31,"Drama, History",90,France,French,Fabien Drugeon,Fabien Drugeon,Les Films Du Cartel,"Tiésay Deshayes, Jean-Damien Détouillon, Dan B...","In the year of 1066AC, William the Conqueror i...",4.2,104,,,,,3.0,1.0
32376,tt0135492,I kafetzou,I kafetzou,1956,1956-10-29,Comedy,89,Greece,Greek,Alekos Sakellarios,"Giorgos Asimakopoulos, Alekos Sakellarios",Finos Film,"Georgia Vasileiadou, Mimis Fotopoulos, Vasilis...",A former laundry old lady cooperates with a co...,7.2,391,,,,,,


### Set the display options to show all the columns and rows

In [57]:
# Lift the display limits so every row and every column is rendered
pd.options.display.max_rows = None
pd.options.display.max_columns = None
# Show the first two records with all columns visible
df.head(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0


### Analyze the dataset
- Check the data types
- Check the number of records and number of columns

In [58]:
# Number of records and number of columns in the raw frame
rows = df.shape[0]
columns = df.shape[1]
print(f"Total Rows present are : {rows}, and total Columns are : {columns}")

Total Rows present are : 85855, and total Columns are : 22


In [59]:
# Summarise every column: non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          85855 non-null  object 
 1   title                  85855 non-null  object 
 2   original_title         85855 non-null  object 
 3   year                   85855 non-null  object 
 4   date_published         85855 non-null  object 
 5   genre                  85855 non-null  object 
 6   duration               85855 non-null  int64  
 7   country                85791 non-null  object 
 8   language               84954 non-null  object 
 9   director               85768 non-null  object 
 10  writer                 84283 non-null  object 
 11  production_company     81400 non-null  object 
 12  actors                 85786 non-null  object 
 13  description            83740 non-null  object 
 14  avg_vote               85855 non-null  float64
 15  vo

# Data Cleaning Process

### Creating copy of the data

In [60]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched
mv = df.copy(deep=True)
mv.sample(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
11136,tt0056135,Karami-ai,Karami-ai,1962,1962-02-17,Drama,107,Japan,Japanese,Masaki Kobayashi,"Kôichi Inagaki, Norio Nanjo",Shochiku,"Keiko Kishi, Tatsuya Nakadai, Sô Yamamura, Sei...",A dying businessman intends to will two hundre...,7.3,639,,,,,6.0,14.0
10078,tt0053182,Porgy and Bess,Porgy and Bess,1959,1959-12-17,"Drama, Musical, Romance",138,USA,English,"Otto Preminger, Rouben Mamoulian","Dorothy Heyward, DuBose Heyward",The Samuel Goldwyn Company,"Sidney Poitier, Dorothy Dandridge, Sammy Davis...",A woman whose past is scorned by nearly everyo...,7.1,1485,$ 7000000,,,,64.0,10.0


### Column: [imdb_title_id]

In [61]:
# List the distinct title ids to see their format
mv['imdb_title_id'].unique()

array(['tt0000009', 'tt0000574', 'tt0001892', ..., 'tt9911774',
       'tt9914286', 'tt9914942'], dtype=object)

- We can see that all the values are prefixed with 'tt' which can be removed 
- Also the column name can be renamed to 'id'

In [62]:
# Remove the leading 'tt' from imdb_title_id.
# str.removeprefix drops exactly the literal prefix 'tt'. str.strip('tt') would
# instead treat the argument as a set of characters and trim any 't' from BOTH
# ends of the value, which is not the intent here.
mv['imdb_title_id'] = mv['imdb_title_id'].str.removeprefix('tt')
mv[['imdb_title_id']].sample(10)

Unnamed: 0,imdb_title_id
26741,104816
35199,184894
1643,24429
5459,39204
31932,127919
40590,291272
17042,73107
8652,49084
29090,113711
73286,3966404


#### Rename the column 'imdb_title_id' to 'id'

In [63]:
# Give the identifier column the shorter name 'id'
mv = mv.rename(columns={'imdb_title_id': 'id'})
mv.sample(2)

Unnamed: 0,id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
44253,378906,Maghi e viaggiatori,Travellers & Magicians,2003,2004-04-23,"Adventure, Drama",108,"Australia, Bhutan, UK",Dzongkha,Khyentse Norbu,Khyentse Norbu,Mission Film,"Tshewang Dendup, Gup Kado Duba, Dasho Kado, So...","The two men embark on parallel, if separate, j...",7.4,2078,$ 1800000,$ 506793,$ 696253,71.0,31.0,45.0
46044,421229,Mrs. Palfrey at the Claremont,Mrs. Palfrey at the Claremont,2005,2008-05-15,"Comedy, Drama",108,"UK, USA",English,Dan Ireland,"Martin Donovan, Dan Ireland",Cineville,"Joan Plowright, Rupert Friend, Zoë Tapper, Rob...",All but abandoned by her family in a London re...,7.6,3192,$ 750000,$ 1720953,$ 4009677,67.0,53.0,28.0


#### Convert the column 'id' (formerly 'imdb_title_id') to integer type

In [64]:
# Cast the 'id' column (formerly 'imdb_title_id') from string to integer
mv = mv.astype({'id': int})
# Confirm the resulting dtype and non-null count
mv['id'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 85855 entries, 0 to 85854
Series name: id
Non-Null Count  Dtype
--------------  -----
85855 non-null  int64
dtypes: int64(1)
memory usage: 670.9 KB


### Column: [title] and [original_title]

In [65]:
# Check the unique values in 'title'
# (read from the raw frame `df`; the working copy `mv` has not modified this column at this point)
df['title'].unique()

array(['Miss Jerry', 'The Story of the Kelly Gang', 'Den sorte drøm', ...,
       'Padmavyuhathile Abhimanyu', 'Sokagin Çocuklari',
       'La vida sense la Sara Amat'], dtype=object)

- The 'title' column does not have any data anomalies and does not require cleaning

In [66]:
# Check the unique values in 'original_title'
# (read from the raw frame `df`; the working copy `mv` has not modified this column at this point)
df['original_title'].unique()

array(['Miss Jerry', 'The Story of the Kelly Gang', 'Den sorte drøm', ...,
       'Padmavyuhathile Abhimanyu', 'Sokagin Çocuklari',
       'La vida sense la Sara Amat'], dtype=object)

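- The two columns are the same for most movies, although they can differ (e.g. 'Maghi e viaggiatori' vs. 'Travellers & Magicians'); we keep 'original_title' and drop the column 'title'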

#### Drop the column 'title'

In [67]:
# Remove 'title'; 'original_title' is kept and renamed in the next step
mv = mv.drop(columns=['title'])

#### Rename the column 'original_title' to 'title'

In [68]:
# 'original_title' takes over the name 'title'
mv = mv.rename(columns={'original_title': 'title'})
mv[['title']].sample(10)

Unnamed: 0,title
40715,The Humanist
23404,Nejistá sezóna
58428,Hottarake no shima - Haruka to maho no kagami
78488,The Son of Bigfoot
62786,Çalgi Çengi
47292,Wonder Woman
19430,The Ghost Dance
21705,Streets of Fire
72309,Tordenskjold & Kold
84881,Pearl


### Column: [year] and [date_published]
- We can drop the 'year' column
- from the 'date_published' column we will extract the year and further rename the column to 'year'

In [69]:
# Remove the raw 'year' column; the year is re-derived from 'date_published' below
mv = mv.drop(columns=['year'])

#### Extract year from the column 'date_published'

In [70]:
# Extract the 4-digit year from 'date_published' (the result stays a string).
# A regex is used instead of slicing the first four characters, so a value that
# does not start with the year still resolves to it, and a value with no
# 4-digit run becomes NaN rather than an arbitrary 4-character fragment.
# NOTE(review): the raw 'year' column loads as object dtype, which suggests some
# non-numeric entries in the date fields - confirm against the source CSV.
mv['date_published'] = mv['date_published'].str.extract(r'(\d{4})', expand=False)

In [71]:
# Spot-check five random values after the year extraction
mv[['date_published']].sample(5)

Unnamed: 0,date_published
45129,2003
22691,1986
7429,1952
71728,2014
36713,2000


#### Rename the column 'date_published' to 'year'

In [72]:
# The column now holds only the year, so name it accordingly
mv = mv.rename(columns={'date_published': 'year'})
mv.sample(2)

Unnamed: 0,id,title,year,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
9492,51554,Dracula,1958,"Drama, Horror",82,UK,English,Terence Fisher,"Jimmy Sangster, Bram Stoker",Hammer Films,"Peter Cushing, Christopher Lee, Michael Gough,...",Jonathan Harker begets the ire of Count Dracul...,7.3,20275,GBP 81000,,,,219.0,115.0
41327,305961,Perníková vez,2002,Crime,101,Czech Republic,Czech,Milan Steindler,"Martin Nemec, Martin Nemec",,"Jan Dolanský, Radek Kuchar, Michal Docolomansk...",Girlfriend of young Jakub is found dead and wi...,6.0,279,,,$ 249325,,3.0,


### Column: [genre]

In [73]:
# Look at ten random genre values to see how multiple genres are stored
mv[['genre']].sample(10)

Unnamed: 0,genre
51707,Drama
46742,"Comedy, Drama, Romance"
54006,Comedy
45533,"Drama, Horror"
27599,Horror
56876,Comedy
24973,"Crime, Mystery, Thriller"
16651,"Crime, Thriller"
32819,"Animation, Action, Drama"
24857,"Comedy, Drama"


#### Genre column contains multiple genres for a single movie, but it must be noted that:
- for each movie we have a maximum of 3 genres
- we can split these multiple genres into three columns 'Genre 1' , 'Genre 2', and 'Genre 3'
- The split values can be assigned to the three columns 'Genre 1', 'Genre 2', and 'Genre 3' respectively

In [74]:
# Create a temporary frame 'genre_split' with one column per genre.
# The raw values are separated by ", " (comma + space), so the separator pattern
# also consumes the surrounding whitespace; splitting on ',' alone would leave a
# leading space in every genre after the first (' Drama' instead of 'Drama').
genre_split = mv['genre'].str.strip().str.split(r'\s*,\s*', expand=True, regex=True)

In [75]:
# Label the split columns genre1, genre2, genre3
genre_cols = [f'genre{n}' for n in (1, 2, 3)]
genre_split.columns = genre_cols

In [76]:
# Place the split genre columns immediately to the right of the original 'genre' column.
# Each insert lands after 'genre', so the position of 'genre' itself never changes.
first_slot = mv.columns.get_loc('genre') + 1
for offset, col_name in enumerate(genre_cols):
    mv.insert(loc=first_slot + offset, column=col_name, value=genre_split.iloc[:, offset])
mv[genre_cols].sample(10)

Unnamed: 0,genre1,genre2,genre3
61447,Crime,Drama,
70050,Horror,,
47085,Comedy,Drama,Thriller
12444,Drama,Family,
72597,Drama,,
28156,Action,Horror,Sci-Fi
19116,Comedy,Drama,Music
29555,Comedy,Drama,
41138,Comedy,Drama,Romance
47880,Comedy,Fantasy,Music


In [77]:
# The combined 'genre' column is redundant now that genre1..genre3 exist
mv = mv.drop(columns=['genre'])
mv.sample(2)

Unnamed: 0,id,title,year,genre1,genre2,genre3,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
66520,2347569,Frances Ha,2014,Comedy,Drama,Romance,86,USA,"English, French",Noah Baumbach,"Noah Baumbach, Greta Gerwig",Pine District Pictures,"Greta Gerwig, Mickey Sumner, Michael Esper, Ad...",A New York woman (who doesn't really have an a...,7.5,71244,,$ 4069826,$ 9013161,82.0,192.0,349.0
63779,2004279,Les adoptés,2011,Drama,,,100,France,French,Mélanie Laurent,"Mélanie Laurent, Morgan Perez",Move Movie,"Marie Denarnaud, Denis Ménochet, Clémentine Cé...",Lisa and her adopted sister Marine are insepar...,6.8,1429,,,$ 1255727,,3.0,27.0


### Column: [duration]
- the duration column has the run time for the movie in minutes which we will convert to hours

In [78]:
# Check the data type of 'duration' (run time in minutes)
mv['duration'].dtype

dtype('int64')

In [79]:
# List the distinct run times to look for implausible values
mv['duration'].unique()

array([ 45,  70,  53, 100,  68,  60,  85, 120,  55, 121,  54,  96,  61,
        90,  50,  88,  72,  78, 148,  52, 124,  59,  63,  84,  65,  81,
       199,  74,  80,  82,  67,  56, 195,  77,  71,  46, 421, 105,  57,
        58,  73,  64,  62, 163, 300, 116,  69, 125,  97, 138, 112,  91,
        87,  48,  83, 136,  75,  94, 117,  93,  76, 418,  86, 223,  99,
       122, 207, 166,  95,  92,  66, 106, 145, 167,  79, 107, 109, 104,
        89, 102, 150, 131,  47, 119,  98, 110, 143, 137, 128, 101, 183,
       133, 135, 111, 141, 115, 153, 129, 123, 108, 155, 151, 170, 140,
       113, 114, 250, 132, 118, 160, 240, 142, 103, 144,  49, 127, 156,
       130, 165, 147, 152, 226, 227, 231, 218, 200, 210, 225, 243,  51,
       281, 208, 306, 257, 245, 255, 328, 168, 269, 303, 290, 212, 224,
       126, 260, 299, 264, 149, 211, 302, 238, 258, 263, 215, 265, 205,
       237, 220, 285, 293, 267, 216, 241,  43, 134, 184, 261, 197, 244,
       146, 177, 196, 154, 189, 219, 201, 180, 159, 192, 139, 17

In [80]:
# Express the run time in hours (minutes / 60), rounded to two decimals
mv['duration'] = mv['duration'].div(60).round(2)
mv[['duration']].sample(5)

Unnamed: 0,duration
21815,1.52
67375,1.98
37395,1.53
35740,1.82
30718,2.3


#### Rename the column to duration(hours)

In [81]:
# Make the unit explicit in the column name
mv = mv.rename(columns={'duration': 'duration(hours)'})
mv[['duration(hours)']].sample(10)

Unnamed: 0,duration(hours)
68447,1.75
1510,1.42
34074,1.68
79213,1.78
35854,1.4
69336,1.92
29628,1.92
65837,1.58
20293,1.9
25237,1.58


### Column: [Country]

In [82]:
# Sample ten values from the country column (the Series repr also reports its dtype)
mv['country'].sample(10)

30169                                   USA
83067                               Nigeria
18511                                France
43847                           South Korea
33257                                 Japan
50815                           Norway, USA
25458    Switzerland, Argentina, Japan, USA
26454                                    UK
30563                                   USA
29596                                France
Name: country, dtype: object

In [83]:
# Keep only the first listed country for each movie
country_parts = mv['country'].str.split(',')
mv['country'] = country_parts.str.get(0)
mv[['country']].sample(10)

Unnamed: 0,country
27236,Russia
12862,USA
28359,Hong Kong
25659,Finland
32078,Denmark
46236,Poland
61200,USA
82075,France
21731,UK
63915,USA


In [84]:
# Replace missing country names with the placeholder 'Not Available'
mv = mv.fillna({'country': 'Not Available'})
mv[['country']].sample(10)

Unnamed: 0,country
6227,USA
52374,USA
19718,USA
31690,USA
34155,Italy
81314,USA
41827,France
63235,USA
6192,USA
68837,Republic of North Macedonia


### Column: [Language]

In [85]:
# List the distinct values of the language column (the array repr also reports its dtype)
mv['language'].unique()

array([nan, 'English', 'Italian', ..., 'Persian, Urdu',
       'English, Swiss German, German',
       'English, Polish, Russian, German'], dtype=object)

In [86]:
# Missing languages get the placeholder 'Not Available'
mv['language'] = mv['language'].where(mv['language'].notna(), 'Not Available')

In [87]:
# Remove the literal 'None' entries from the comma-separated language lists.
# Working token by token (instead of blanking the substring 'None') avoids
# leaving dangling separators such as 'English, ' or ', French', and the result
# is assigned back to the column so the clean-up is actually kept.
def _drop_none_languages(value):
    """Return `value` without 'None' tokens; 'Not Available' if nothing is left.

    Non-string values (e.g. NaN) are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    tokens = [token.strip() for token in value.split(',')]
    kept = [token for token in tokens if token and token != 'None']
    # Same placeholder the notebook uses for missing languages
    return ', '.join(kept) if kept else 'Not Available'


mv['language'] = mv['language'].map(_drop_none_languages)
# Show only the first rows rather than dumping the whole column
mv['language'].head(10)

0                                            Not Available
1                                            Not Available
2                                            Not Available
3                                                  English
4                                                  Italian
5                                                  English
6                                                   German
7                                                  Italian
8                                            Not Available
9                                                  English
10                                                  Danish
11                                                  French
12                                           Not Available
13                                                  French
14                                                 Italian
15                                                  French
16                                                  Fren

In [88]:
# Check the cleaned column data on ten random rows
mv[['language']].sample(10)

Unnamed: 0,language
53591,Bengali
21040,"English, Russian, German, Spanish, Hindi"
66632,Hindi
910,"English, Italian"
22750,"English, Spanish"
15249,English
63318,English
21753,French
9557,English
44828,German


### Column: [writer], [production_company], [actors] and [director]

In [89]:
# Check the data types of the writer, production_company and actors columns
mv[['writer', 'production_company', 'actors']].dtypes

writer                object
production_company    object
actors                object
dtype: object

In [90]:
# Rows with no writer get the placeholder 'Not Available'
mv.loc[mv['writer'].isna(), 'writer'] = 'Not Available'
mv[['writer']].sample(10)

Unnamed: 0,writer
35194,"Giorgos Lazaridis, Nikos Tsiforos"
56915,Andrew Traucki
67756,Mike Sikowitz
30584,"Alain Godard, Jacques Monnet"
38964,Bülent Oran
26993,"George Putnam, John Katchmer"
2842,"Wyllis Cooper, Norman Foster"
15933,"Carlo Veo, Giuseppe Rosati"
15397,"Jaroslaw Iwaszkiewicz, Andrzej Wajda"
37396,"José Giovanni, José Giovanni"


In [91]:
# Rows with no production company get the placeholder 'Not Available'
mv = mv.assign(production_company=mv['production_company'].fillna('Not Available'))
mv[['production_company']].sample(10)

Unnamed: 0,production_company
57837,Made It Myself Pictures
46060,Telespan 2000
80057,Sanctum Film
14790,Krasne Entertainments
22992,Instituto Nacional de Cine y Artes Audiovisual...
32128,Kinostudiya imeni M. Gorkogo
44835,BV Entertainment Inc.
66575,Rakeysh Omprakash Mehra Pictures
5259,Metro-Goldwyn-Mayer (MGM)
36245,Tallinnfilm


In [92]:
# Rows with no actors listed get the placeholder 'Not Available'
mv = mv.fillna({'actors': 'Not Available'})
mv[['actors']].sample(10)

Unnamed: 0,actors
72510,"Alan Ritchson, Darin Brooks, James Cade, Rob R..."
78569,"Radhika Apte, Amrita Bagchi, Nivedita Bhattach..."
57129,"Matilda Grahn, Teodor Runsiö, Tomas Norström, ..."
67718,"Minnie Driver, Paul Adelstein, Alfred Molina, ..."
16421,"Tatsuya Nakadai, Katsuyuki Itô, Aiko Nagayama,..."
50840,"Ken Ogata, Hana Sugiura, Saki Takaoka, Shôta M..."
29188,"Lloyd Adams, Robert Wuhl, Eric Bryson, Marcy K..."
6846,"Wendell Corey, Macdonald Carey, Ellen Drew, Wa..."
22898,"Jack Lemmon, Julie Andrews, Sally Kellerman, R..."
23766,"Kevin Costner, Susan Sarandon, Tim Robbins, Tr..."


In [93]:
# Rows with no director get the placeholder 'Not Available'
mv['director'] = mv['director'].mask(mv['director'].isna(), 'Not Available')
mv[['director']].sample(10)

Unnamed: 0,director
67087,Edgar Marie
18575,John Landis
28458,Robert Radler
68319,Thea Sharrock
39189,Robert Malenfant
8067,John Paddy Carstairs
36716,Knut Andersen
30778,Alexandre Rockwell
81679,Fausto Brizzi
24175,Ami Artzi


### Column: [avg_vote] and [votes]
- These columns contain the average vote and the total number of votes received
- No anomalies found

In [94]:
# Check the data types of the avg_vote and votes columns
mv[['avg_vote', 'votes']].dtypes

avg_vote    float64
votes         int64
dtype: object

### Column: [budget]

In [95]:
# List the distinct budget values; each is a currency marker followed by an amount
mv['budget'].unique()

array([nan, '$ 2250', '$ 45000', ..., 'CAD 3850000', 'IRR 35000000000',
       'MYR 20000000'], dtype=object)

In [96]:
# Replace the '$' symbol with the currency code 'USD' so every budget carries a code.
# regex=False is passed explicitly: '$' is a regex anchor, and the default value
# of `regex` differs between pandas versions, so the literal match must not be
# left to the default.
mv['budget'] = mv['budget'].str.replace('$', 'USD', regex=False)
mv[['budget']].sample(10)

Unnamed: 0,budget
23546,USD 17000000
68029,
66797,
9193,
20034,
17236,USD 10000000
30315,
82843,HUF 317000000
264,
7467,


### Columns: [usa_gross_income], [worlwide_gross_income] and [metascore]

In [97]:
# Remove the income and metascore columns from the cleaned data set
# ('worlwide_gross_income' is spelled as it appears in the source file)
unused_columns = ['usa_gross_income', 'worlwide_gross_income', 'metascore']
mv = mv.drop(columns=unused_columns)
mv.sample(2)

Unnamed: 0,id,title,year,genre1,genre2,genre3,duration(hours),country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,reviews_from_users,reviews_from_critics
44719,387952,Le cou de la girafe,2004,Drama,,,1.4,France,"French, Italian, Spanish",Safy Nebbou,"Safy Nebbou, Danièle Thompson",Téléma,"Sandrine Bonnaire, Claude Rich, Louisa Pili, D...","Mathilde (Louisa Pili), a precocious 10-year-o...",6.5,331,,3.0,11.0
75224,4625324,Szatan kazal tanczyc,2017,Drama,,,1.62,Poland,"Polish, English",Katarzyna Roslaniec,Katarzyna Roslaniec,Manana,"Danuta Stenka, Tygo Gernandt, Marta Nieradkiew...","After the success of her best-selling novel, K...",3.7,196,,1.0,4.0


### Review the data

In [98]:
# Final check of the cleaned frame: columns, non-null counts and dtypes
mv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    85855 non-null  int64  
 1   title                 85855 non-null  object 
 2   year                  85855 non-null  object 
 3   genre1                85855 non-null  object 
 4   genre2                58780 non-null  object 
 5   genre3                31232 non-null  object 
 6   duration(hours)       85855 non-null  float64
 7   country               85855 non-null  object 
 8   language              85855 non-null  object 
 9   director              85855 non-null  object 
 10  writer                85855 non-null  object 
 11  production_company    85855 non-null  object 
 12  actors                85855 non-null  object 
 13  description           83740 non-null  object 
 14  avg_vote              85855 non-null  float64
 15  votes              

### Output the cleaned data to csv

In [99]:
# Save the cleaned data frame to a csv file (without the index column).
# The path is relative to the notebook's working directory, like the input path
# used to load the raw data, so it does not depend on the name of the
# enclosing project folder.
# NOTE(review): assumes the notebook runs from the folder that contains
# 'datasource/' - confirm before relying on the output location.
mv.to_csv('datasource/imdb_movie_cleaned_data.csv', index=False)