# IMDB Movie Dataset Cleaning

### Import Pandas Library

In [171]:
import pandas as pd

### Read the data file

In [172]:
df = pd.read_csv(r"datasource/imdb_movies.csv", encoding='unicode_escape', low_memory=False)
df.sample(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
12957,tt0061433,C'era una volta,C'era una volta,1967,1967-10-19,"Comedy, Fantasy, Romance",115,"Italy, France","Italian, French",Francesco Rosi,"Tonino Guerra, Raffaele La Capria",Cinecittà,"Sophia Loren, Omar Sharif, Georges Wilson, Les...","In 17th century Italy, a Spanish prince, who's...",6.2,1158,,,,,17.0,11.0
50651,tt0841138,Osso Bucco,Osso Bucco,2008,2008-04-01,"Comedy, Romance",86,USA,English,"Fred Blurton, Gary Taylor",Gary Taylor,River West Films,"Mike Starr, Illeana Douglas, Christian Stolte,...",Trapped in his favorite restaurant during the ...,5.6,107,$ 1100000,,,,1.0,


### Set the column width to maximum to display all the columns

In [173]:
# Display all the columns and rows
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns',None)
df.head(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0


### Analyze the dataset
- Check the data types
- Check the number of records and number of columns

In [174]:
rows, columns = df.shape
print(f"Total Rows present are : {rows}, and total Columns are : {columns}")

Total Rows present are : 85855, and total Columns are : 22


In [175]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          85855 non-null  object 
 1   title                  85855 non-null  object 
 2   original_title         85855 non-null  object 
 3   year                   85855 non-null  object 
 4   date_published         85855 non-null  object 
 5   genre                  85855 non-null  object 
 6   duration               85855 non-null  int64  
 7   country                85791 non-null  object 
 8   language               84954 non-null  object 
 9   director               85768 non-null  object 
 10  writer                 84283 non-null  object 
 11  production_company     81400 non-null  object 
 12  actors                 85786 non-null  object 
 13  description            83740 non-null  object 
 14  avg_vote               85855 non-null  float64
 15  vo

# Data Cleaning Process

### Creating copy of the data

In [176]:
mv = df.copy()
mv.sample(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
19580,tt0081262,Odnazhdy dvadtsat let spustya,Odnazhdy dvadtsat let spustya,1981,1981-01-05,Comedy,77,Soviet Union,Russian,Yuri Yegorov,"Arkadiy Inin, Yuri Yegorov",Kinostudiya imeni M. Gorkogo,"Natalya Gundareva, Viktor Proskurin, Evgeniy L...",After 20 years classmates get together and dis...,7.3,377,,,,,,1.0
77269,tt5270810,Zemletryasenie,Zemletryasenie,2016,2016-09-13,Drama,101,"Armenia, Russia","Armenian, Russian, French",Sarik Andreasyan,"Grant Barsegyan, Arsen Danielyan",Mars Media Entertainment,"Konstantin Lavronenko, Mariya Mironova, Viktor...","Leninakan, Armenian SSR, USSR, 1988. After the...",6.3,1390,AMD 1460000000,,$ 3910254,,2.0,4.0


### Column: [imdb_title_id]

In [177]:
mv['imdb_title_id'].unique

<bound method Series.unique of 0         tt0000009
1         tt0000574
2         tt0001892
3         tt0002101
4         tt0002130
5         tt0002199
6         tt0002423
7         tt0002445
8         tt0002452
9         tt0002461
10        tt0002646
11        tt0002844
12        tt0003014
13        tt0003037
14        tt0003102
15        tt0003131
16        tt0003165
17        tt0003167
18        tt0003419
19        tt0003471
20        tt0003489
21        tt0003637
22        tt0003643
23        tt0003657
24        tt0003740
25        tt0003772
26        tt0003883
27        tt0003930
28        tt0003973
29        tt0004026
30        tt0004066
31        tt0004099
32        tt0004134
33        tt0004150
34        tt0004181
35        tt0004457
36        tt0004465
37        tt0004635
38        tt0004681
39        tt0004707
40        tt0004712
41        tt0004743
42        tt0004766
43        tt0004825
44        tt0004838
45        tt0004872
46        tt0004873
47        tt0004972
48       

- We can see that all the values are prefixed with 'tt' which can be removed 
- Also the column name can be renamed to 'id'

In [178]:
# Strip the 'tt' from the imdb_title_id
mv['imdb_title_id'] = mv['imdb_title_id'].str.strip('tt')
mv['imdb_title_id'].sample(10)

69940     3123140
36149     0202832
29668     0116141
17648     0075040
47600     0457391
6126      0041267
54194    11187484
18246     0076843
82451     7382828
61315     1725969
Name: imdb_title_id, dtype: object

#### Rename the column to 'imdb_title_id' to 'id'

In [179]:
# Rename column 'imdb_title_id' to 'id'
mv.rename(columns={'imdb_title_id':'id'}, inplace=True)
mv.sample(2)

Unnamed: 0,id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
1753,24894,The Black Cat,The Black Cat,1934,1934-05-07,"Adventure, Crime, Horror",65,USA,"English, Latin, Hungarian",Edgar G. Ulmer,"Edgar Allan Poe, Peter Ruric",Universal Pictures,"Boris Karloff, Bela Lugosi, David Manners, Jul...",American honeymooners in Hungary become trappe...,7.0,9155,$ 95745,,,,154.0,93.0
61672,1754629,The Final Night and Day,The Final Night and Day,2011,2011-03-26,"Action, Adventure, Comedy",77,USA,English,"Adam R. Steigert, Stephanie Wlosinski","Mark Mendola, Adam R. Steigert",DefTone Pictures Studios,"Richard Satterwhite, Daniel George, Kyle Andre...",A group of convicts have found themselves stuc...,5.4,146,$ 20000,,,,8.0,


#### Convert the column 'imdb_title_id' to integer type

In [180]:
# Convert the column 'imdb_title_id' to integer type
mv['id'] = mv['id'].astype(int)
mv['id'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 85855 entries, 0 to 85854
Series name: id
Non-Null Count  Dtype
--------------  -----
85855 non-null  int64
dtypes: int64(1)
memory usage: 670.9 KB


### Column: [title] and [original_title]

In [181]:
# Check the unique values
df['title'].unique

- The 'title' column does not has any data anomaly and does not require cleaning

In [None]:
# Check the unique values in 'original_title'
df['original_title'].unique

<bound method Series.unique of 0                                               Miss Jerry
1                              The Story of the Kelly Gang
2                                           Den sorte drøm
3                                                Cleopatra
4                                                L'Inferno
5        From the Manger to the Cross; or, Jesus of Naz...
6                                           Madame DuBarry
7                                               Quo Vadis?
8                                    Independenta Romaniei
9                                              Richard III
10                                                Atlantis
11                   Fantômas - À l'ombre de la guillotine
12                                           Ingeborg Holm
13                                    Juve contre Fantômas
14                              Ma l'amor mio non muore...
15                                  Maudite soit la guerre
16                       

- Both the columns are same and the column 'title' can be dropped

#### Drop the column 'title'

In [None]:
# Drop the column title
mv.drop(columns=['title'],inplace=True)

#### Rename the column 'original_title' to 'title'

In [None]:
# Rename the column 'original_title' to 'title' 
mv.rename(columns={'original_title':'title'}, inplace=True)
mv.sample(2)

Unnamed: 0,id,title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
72697,3804756,Mothers of the Bride,2015,2015-03-28,"Comedy, Drama",89,USA,English,Sam Irvin,Nina Weinman,Down the Aisle Movie Productions,"Gail O'Grady, Betsy Brandt, Daniela Bobadilla,...","Jenna is getting married, her mother is deligh...",4.7,253,,,,,1.0,2.0
61085,1707384,Kôkô debyû,2011,2011-04-01,"Comedy, Romance",93,Japan,Japanese,Tsutomu Hanabusa,"Kazune Kawahara, Yûichi Fukuda",Asmik Ace Entertainment,"Junpei Mizobata, Ito Ono, Masaki Suda, Rina Ai...",Haruna who devoted herself to softball decide ...,6.1,522,,,$ 3242697,,4.0,1.0


### Column: [year] and [date_published]
- We can drop the 'year' column
- from the 'date_published' column we will extract the year and further rename the column to 'year'

In [None]:
# Drop the column 'year'
mv.drop(columns=['year'],inplace=True)

#### Extract year from the column 'date_published'

In [None]:
# Convert the column 'published_year' to date_time
mv['date_published'] = mv['date_published'].str[:4]

In [None]:
mv['date_published'].sample(5)

53818    2008
32289    1964
39756    2001
53944    2009
28705    1995
Name: date_published, dtype: object

#### Rename the column 'date_published' to 'year'

In [None]:
mv.rename(columns={'date_published': 'year'}, inplace=True)
mv.sample(2)

Unnamed: 0,id,title,year,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
38498,253959,Yakisikli,1987,Comedy,86,Turkey,Turkish,Orhan Aksoy,Orhan Aksoy,,"Kemal Sunal, Aysegül Uyguner, Turgay Aksoy, Tu...","Selim has a radio and he calls himself ""Handso...",6.2,1284,,,,,,
40533,290423,Dupa-amiaza unui tortionar,2002,"Biography, Comedy, Drama",80,"Romania, France",Romanian,Lucian Pintilie,"Doina Jela, Lucian Pintilie",Filmex,"Gheorghe Dinica, Radu Beligan, Ioana Ana Macar...",A young journalist interviews a man who was a ...,7.6,698,,,$ 12646,,6.0,1.0


### Column: [genre]

In [None]:
mv['genre'].unique()

array(['Romance', 'Biography, Crime, Drama', 'Drama', ..., 'Family, War',
       'Documentary', 'Crime, War'], dtype=object)

#### Genre column contains multiple genres for a single movie, but it must be noted that:
- for each movie we have a maximum of 3 genres
- we can split these multiple genres into three columns 'Genre 1' , 'Genre 2', and 'Genre 3'
- The splitted values can be assigned to the three columns 'Genre 1' , 'Genre 2', and 'Genre 3' respectively

In [None]:
# create a temp df 'genre_split' to store the splitted genres
genre_split  = mv['genre'].str.split(',', expand = True)

In [None]:
# Rename the columns to Genre1, Genre2, Genre3
genre_cols = ['genre1', 'genre2', 'genre3']
genre_split.columns = genre_cols

In [None]:
# Concatenate the split genre df to original df
for i in range(len(genre_cols)):
    mv.insert(loc=mv.columns.get_loc('genre') + 1 + i, column=genre_cols[i], value=genre_split.iloc[:, i])
mv.sample(2)

Unnamed: 0,id,title,year,genre,Genre1,Genre2,Genre3,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
7773,46362,Star of India,1954,"Adventure, Romance",Adventure,Romance,,92,"UK, Italy",English,Arthur Lubin,"Herbert Dalmas, Denis Freeman",Titanus,"Cornel Wilde, Jean Wallace, Herbert Lom, Yvonn...",Squire Pierre St. Laurent returns from wars in...,5.6,111,,,,,4.0,1.0
14357,65462,Beneath the Planet of the Apes,1970,"Action, Adventure, Sci-Fi",Action,Adventure,Sci-Fi,95,USA,English,Ted Post,"Paul Dehn, Mort Abrahams",Twentieth Century Fox,"James Franciscus, Kim Hunter, Maurice Evans, L...",The sole survivor of an interplanetary rescue ...,6.1,41159,$ 3000000,$ 18999718,$ 18999718,46.0,182.0,73.0


In [None]:
# Drop the column genre 
mv.drop(columns=['genre'],inplace=True)
mv.sample(2)

Unnamed: 0,id,title,year,Genre1,Genre2,Genre3,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
54828,1161449,The Sanctuary,2009,Action,,,87,Thailand,"Thai, English",Thanapon Maliwan,"Anuwat Kaewsopark, Thanapon Maliwan",Film Frame Productions,"Michael B., Russell Wong, Inthira Charoenpura,...","In 1897, the ""Poisoned Knife"" clan broke into ...",4.2,220,,,$ 92904,,3.0,12.0
60254,1637681,Flowers,2010,Drama,,,110,Japan,Japanese,Norihiro Koizumi,"Shu Fujimoto, Yuiko Miura",ADK,"Yû Aoi, Kyôka Suzuki, Yûko Takeuchi, Rena Tana...",A story of six women from three different gene...,6.6,279,,,$ 4250257,,7.0,10.0
