# IMDB Movie Dataset Cleaning

### Import Pandas Library

In [47]:
import pandas as pd

### Read the data file

In [48]:
df = pd.read_csv(r"datasource/imdb_movies.csv", encoding='unicode_escape', low_memory=False)
df.sample(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
43536,tt0364447,Jopog manura 2: Dolaon jeonseol,Jopog manura 2: Dolaon jeonseol,2003,2003-09-05,"Action, Comedy, Crime",110,South Korea,Korean,Heung-sun Jeong,"Hae-cheol Choi, Heung-sun Jeong",CJ Entertainment,"Eun-Kyung Shin, Jun Gyu Park, Se-jin Jang, Hye...",Former Scissor Gang leader Eun-jin (Shin Eun-k...,6.1,1178,,,,,6.0,9.0
18526,tt0077827,Lady Oscar,Lady Oscar,1979,1979-03-03,"Drama, History, Romance",124,"Japan, France","English, German",Jacques Demy,"Jacques Demy, Patricia Louisianna Knop",Kitty Films,"Catriona MacColl, Barry Stokes, Patrick Allen,...","The story of Lady Oscar, a female military com...",6.0,461,,,,,16.0,4.0


### Set the column width to maximum to display all the columns

In [49]:
# Display all the columns and rows
pd.set_option('display.max_columns',None)
df.head(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0


### Analyze the dataset
- Check the data types
- Check the number of records and number of columns

In [50]:
rows, columns = df.shape
print(f"Total Rows present are : {rows}, and total Columns are : {columns}")

Total Rows present are : 85855, and total Columns are : 22


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          85855 non-null  object 
 1   title                  85855 non-null  object 
 2   original_title         85855 non-null  object 
 3   year                   85855 non-null  object 
 4   date_published         85855 non-null  object 
 5   genre                  85855 non-null  object 
 6   duration               85855 non-null  int64  
 7   country                85791 non-null  object 
 8   language               84954 non-null  object 
 9   director               85768 non-null  object 
 10  writer                 84283 non-null  object 
 11  production_company     81400 non-null  object 
 12  actors                 85786 non-null  object 
 13  description            83740 non-null  object 
 14  avg_vote               85855 non-null  float64
 15  vo

# Data Cleaning Process

### Creating copy of the data

In [52]:
mv = df.copy()
mv.sample(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
14341,tt0065422,Arizona si scatenò... e li fece fuori tutti!,Arizona si scatenò... e li fece fuori tutti!,1970,1970-08-14,Western,90,"Italy, Spain",Italian,Sergio Martino,"Ernesto Gastaldi, Joaquín Luis Romero Marchent",Astro C.C.,"Anthony Steffen, Marcella Michelangeli, Aldo S...",Famed gunman Arizona Colt is sent by Moreno to...,5.7,233,,,,,5.0,8.0
7879,tt0046750,Gli ultimi crisantemi,Bangiku,1954,1954-06-15,Drama,101,Japan,Japanese,Mikio Naruse,"Fumiko Hayashi, Sumie Tanaka",Toho Company,"Haruko Sugimura, Sadako Sawamura, Chikako Hoso...",What is the life of a Geisha like once her bea...,7.4,889,,,,,10.0,16.0


### Column: [imdb_title_id]

In [53]:
mv['imdb_title_id'].unique

<bound method Series.unique of 0        tt0000009
1        tt0000574
2        tt0001892
3        tt0002101
4        tt0002130
           ...    
85850    tt9908390
85851    tt9911196
85852    tt9911774
85853    tt9914286
85854    tt9914942
Name: imdb_title_id, Length: 85855, dtype: object>

- We can see that all the values are prefixed with 'tt' which can be removed 
- Also the column name can be renamed to 'id'

In [54]:
# Strip the 'tt' from the imdb_title_id
mv['imdb_title_id'] = mv['imdb_title_id'].str.strip('tt')
mv['imdb_title_id'].sample(10)

27556    0107773
16729    0072037
17292    0073905
20838    0085394
2842     0029648
21933    0089123
16884    0072644
42308    0329601
17540    0074739
75585    4728582
Name: imdb_title_id, dtype: object

#### Rename the column to 'imdb_title_id' to 'id'

In [55]:
# Rename column 'imdb_title_id' to 'id'
mv.rename(columns={'imdb_title_id':'id'}, inplace=True)
mv.sample(2)

Unnamed: 0,id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
68303,2671106,The Hanover House,The Hanover House,2014,2014-05-09,"Horror, Thriller",73,USA,English,Corey Norman,"Corey Norman, Corey Norman",Bonfire Films,"Brian Chamberlain, Casey Turner, Anne Bobby, D...","Returning from his father's funeral, Robert Fo...",5.6,112,$ 500000,,,,3.0,11.0
20074,82869,Atmosfera zero,Outland,1981,1981-10-08,"Action, Crime, Sci-Fi",109,UK,English,Peter Hyams,Peter Hyams,The Ladd Company,"Sean Connery, Peter Boyle, Frances Sternhagen,...",A federal marshal stationed at a mining colony...,6.6,26486,$ 16000000,$ 17374595,$ 17374595,48.0,147.0,94.0


#### Convert the column 'imdb_title_id' to integer type

In [56]:
# Convert the column 'imdb_title_id' to integer type
mv['id'] = mv['id'].astype(int)
mv['id'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 85855 entries, 0 to 85854
Series name: id
Non-Null Count  Dtype
--------------  -----
85855 non-null  int64
dtypes: int64(1)
memory usage: 670.9 KB


### Column: [title] and [original_title]

In [57]:
# Check the unique values
df['title'].unique

<bound method Series.unique of 0                            Miss Jerry
1           The Story of the Kelly Gang
2                        Den sorte drøm
3                             Cleopatra
4                             L'Inferno
                      ...              
85850                           Le lion
85851    De Beentjes van Sint-Hildegard
85852         Padmavyuhathile Abhimanyu
85853                 Sokagin Çocuklari
85854        La vida sense la Sara Amat
Name: title, Length: 85855, dtype: object>

- The 'title' column does not has any data anomaly and does not require cleaning

In [58]:
# Check the unique values in 'original_title'
df['original_title'].unique

<bound method Series.unique of 0                            Miss Jerry
1           The Story of the Kelly Gang
2                        Den sorte drøm
3                             Cleopatra
4                             L'Inferno
                      ...              
85850                           Le lion
85851    De Beentjes van Sint-Hildegard
85852         Padmavyuhathile Abhimanyu
85853                 Sokagin Çocuklari
85854        La vida sense la Sara Amat
Name: original_title, Length: 85855, dtype: object>

- Both the columns are same and the column 'title' can be dropped

#### Drop the column 'title'

In [59]:
# Drop the column title
mv.drop(columns=['title'],inplace=True)

#### Rename the column 'original_title' to 'title'

In [60]:
# Rename the column 'original_title' to 'title' 
mv.rename(columns={'original_title':'title'}, inplace=True)
mv.sample(2)

Unnamed: 0,id,title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
53839,1092633,"The Goods: Live Hard, Sell Hard",2009,2009-08-14,Comedy,89,USA,English,Neal Brennan,"Andy Stock, Rick Stempson",Paramount Vantage,"Jeremy Piven, Ving Rhames, James Brolin, David...",Used-car liquidator Don Ready is hired by a fl...,5.8,20547,$ 10000000,$ 15122676,$ 15300885,39.0,78.0,81.0
32098,130377,Zatôichi senryô-kubi,1964,1964-03-14,"Action, Adventure, Drama",83,Japan,Japanese,Kazuo Ikehiro,"Shôzaburô Asai, Akikazu Ota",Daiei,"Shintarô Katsu, Shôgo Shimada, Mikiko Tsubouch...",Zatoichi is mistaken for a thief. To clear his...,7.4,1374,,,,,12.0,9.0


### Column: [year] and [date_published]
- We can drop the 'year' column
- from the 'date_published' column we will extract the year and further rename the column to 'year'

In [62]:
# Drop the column 'year'
mv.drop(columns=['year'],inplace=True)

#### Extract year from the column 'date_published'

In [66]:
# Convert the column 'published_year' to date_time
mv['date_published'] = mv['date_published'].str[:4]

In [69]:
mv['date_published'].sample(5)

4176     1942
46944    2008
75615    2005
79637    2017
6882     1951
Name: date_published, dtype: object

#### Rename the column 'date_published' to 'year'

In [73]:
mv.rename(columns={'date_published': 'year'}, inplace=True)
mv.sample(2)

Unnamed: 0,id,title,year,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
1111,22000,Indiscreet,1931,"Comedy, Drama, Musical",92,USA,English,Leo McCarey,"Buddy G. DeSylva, Lew Brown",Joseph M. Schenck Productions,"Gloria Swanson, Ben Lyon, Monroe Owsley, Barba...",A young woman jeopardizes the relationship wit...,6.2,1336,,,,,23.0,2.0
51844,964179,B-Girl,2009,"Drama, Music, Romance",88,USA,English,Emily Dell,Emily Dell,Two Camels Films,"Julie 'Jules' Urich, Missy Yager, Wesley Jonat...",A story about a female breakdancer overcoming ...,3.9,618,,,,,10.0,7.0


### Column: [genre]

In [76]:
mv['genre'].unique

<bound method Series.unique of 0                          Romance
1          Biography, Crime, Drama
2                            Drama
3                   Drama, History
4        Adventure, Drama, Fantasy
                   ...            
85850                       Comedy
85851                Comedy, Drama
85852                        Drama
85853                Drama, Family
85854                        Drama
Name: genre, Length: 85855, dtype: object>