In [329]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [330]:
# Read new_cleaned.csv into the DataFrame used by every cell below
imdb_df = pd.read_csv(filepath_or_buffer="new_cleaned.csv")

In [331]:
# Preview the first five rows
imdb_df.head(n=5)

Unnamed: 0,released_year,certificate,runtime,genre,imdb_rating,meta_score,director,star1,star2,star3,star4,no_of_votes,gross
0,1994,A,142,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,1972,A,175,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0
2,2008,UA,152,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0
3,1974,A,202,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0
4,1957,U,96,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0


In [332]:
# Categorize counts
def categorize_count(count, low_max=5, average_max=10):
    """Bucket an appearance count into 'low', 'average' or 'high'.

    Parameters
    ----------
    count : int or float
        Number of rows (movies) a name appears in.
    low_max : int, default 5
        Largest count that is still labelled 'low'.
    average_max : int, default 10
        Largest count that is still labelled 'average'.

    Returns
    -------
    str
        'low' when count <= low_max, 'average' when
        low_max < count <= average_max, otherwise 'high'.
    """
    # Only upper bounds are tested so the buckets have no gaps: values such
    # as 0 or 5.5 land in the nearest bucket instead of falling through to
    # 'high'.
    if count <= low_max:
        return 'low'
    if count <= average_max:
        return 'average'
    return 'high'

## Director categorization

In [334]:
# Count the movies per director: one row per director, most frequent first
director_counts = (
    imdb_df['director']
    .value_counts()
    .rename_axis('director')
    .reset_index(name='count')
)
director_counts

Unnamed: 0,director,count
0,Steven Spielberg,13
1,Martin Scorsese,10
2,Alfred Hitchcock,9
3,Christopher Nolan,8
4,Clint Eastwood,8
...,...,...
397,Jon Favreau,1
398,Yimou Zhang,1
399,Danis Tanovic,1
400,Shin'ichirô Watanabe,1


In [335]:
# Apply categorization to each director's own movie count.
# The source must be director_counts: star1_counts is a different table that
# is only created in the Star 1 section further down.
director_counts['director_category'] = director_counts['count'].apply(categorize_count)

# Merge the director categories back onto the movie-level DataFrame.
# Any director_category column left by an earlier run is dropped first;
# otherwise a re-run would produce director_category_x / director_category_y
# and the lookups below would raise KeyError.
imdb_df = (
    imdb_df
    .drop(columns='director_category', errors='ignore')
    .merge(director_counts[['director', 'director_category']], on='director', how='left')
)

# Create DataFrames for each category
director_group_1_5 = imdb_df[imdb_df['director_category'] == 'low']
director_group_6_10 = imdb_df[imdb_df['director_category'] == 'average']
director_group_11_plus = imdb_df[imdb_df['director_category'] == 'high']

# print the DataFrames
print("Directors with 1-5 movies:")
display(director_group_1_5.head())

print("\nDirectors with 6-10 movies:")
display(director_group_6_10.head())

print("\nDirectors with 11 or more movies:")
display(director_group_11_plus.head())

Director 1 with 1-5 movies:


Unnamed: 0,released_year,certificate,runtime,genre,imdb_rating,meta_score,director,star1,star2,star3,star4,no_of_votes,gross,director_category
0,1994,A,142,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0,low
1,1972,A,175,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0,low
3,1974,A,202,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0,low
4,1957,U,96,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0,low
5,2003,U,201,"Action, Adventure, Drama",8.9,94.0,Peter Jackson,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,1642758,377845905.0,low



Directors with 6-10 movies:


Unnamed: 0,released_year,certificate,runtime,genre,imdb_rating,meta_score,director,star1,star2,star3,star4,no_of_votes,gross,director_category
2,2008,UA,152,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0,average
6,1994,A,154,"Crime, Drama",8.9,94.0,Quentin Tarantino,John Travolta,Uma Thurman,Samuel L. Jackson,Bruce Willis,1826188,107928762.0,average
8,2010,UA,148,"Action, Adventure, Sci-Fi",8.8,74.0,Christopher Nolan,Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot Page,Ken Watanabe,2067042,292576195.0,average
9,1999,A,139,Drama,8.8,66.0,David Fincher,Brad Pitt,Edward Norton,Meat Loaf,Zach Grenier,1854740,37030102.0,average
15,1990,A,146,"Biography, Crime, Drama",8.7,90.0,Martin Scorsese,Robert De Niro,Ray Liotta,Joe Pesci,Lorraine Bracco,1020727,46836394.0,average



Directors with 11 or more movies:


Unnamed: 0,released_year,certificate,runtime,genre,imdb_rating,meta_score,director,star1,star2,star3,star4,no_of_votes,gross,director_category
7,1993,A,195,"Biography, Drama, History",8.9,94.0,Steven Spielberg,Liam Neeson,Ralph Fiennes,Ben Kingsley,Caroline Goodall,1213505,96898818.0,high
22,1998,R,169,"Drama, War",8.6,91.0,Steven Spielberg,Tom Hanks,Matt Damon,Tom Sizemore,Edward Burns,1235804,216540909.0,high
62,1981,A,115,"Action, Adventure",8.4,85.0,Steven Spielberg,Harrison Ford,Karen Allen,Paul Freeman,John Rhys-Davies,884112,248159971.0,high
122,1989,U,127,"Action, Adventure",8.2,65.0,Steven Spielberg,Harrison Ford,Sean Connery,Alison Doody,Denholm Elliott,692366,197171806.0,high
163,2002,A,141,"Biography, Crime, Drama",8.1,75.0,Steven Spielberg,Leonardo DiCaprio,Tom Hanks,Christopher Walken,Martin Sheen,832846,164615351.0,high


## Star 1 categorization

In [337]:
# Count the movies per first-billed star: one row per name, most frequent first
star1_counts = (
    imdb_df['star1']
    .value_counts()
    .rename_axis('star1')
    .reset_index(name='count')
)
star1_counts

Unnamed: 0,star1,count
0,Tom Hanks,12
1,Al Pacino,10
2,Robert De Niro,10
3,Clint Eastwood,10
4,Leonardo DiCaprio,9
...,...,...
467,Marcello Mastroianni,1
468,Omar Sharif,1
469,Topol,1
470,Timothy Bottoms,1


In [338]:
# Apply categorization
star1_counts['star1_category'] = star1_counts['count'].apply(categorize_count)

# Merge the star1 categories back onto the movie-level DataFrame.
# Any star1_category column left by an earlier run is dropped first;
# otherwise a re-run would produce star1_category_x / star1_category_y
# and the lookups below would raise KeyError.
imdb_df = (
    imdb_df
    .drop(columns='star1_category', errors='ignore')
    .merge(star1_counts[['star1', 'star1_category']], on='star1', how='left')
)

# Create DataFrames for each category
star1_group_1_5 = imdb_df[imdb_df['star1_category'] == 'low']
star1_group_6_10 = imdb_df[imdb_df['star1_category'] == 'average']
star1_group_11_plus = imdb_df[imdb_df['star1_category'] == 'high']

# print the DataFrames
print("Star 1 with 1-5 movies:")
display(star1_group_1_5.head())

print("\nStar 1 with 6-10 movies:")
display(star1_group_6_10.head())

print("\nStar 1 with 11 or more movies:")
display(star1_group_11_plus.head())

Star 1 with 1-5 movies:


Unnamed: 0,released_year,certificate,runtime,genre,imdb_rating,meta_score,director,star1,star2,star3,star4,no_of_votes,gross,director_category,star1_category
0,1994,A,142,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0,low,low
1,1972,A,175,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0,low,low
4,1957,U,96,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0,low,low
5,2003,U,201,"Action, Adventure, Drama",8.9,94.0,Peter Jackson,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,1642758,377845905.0,low,low
6,1994,A,154,"Crime, Drama",8.9,94.0,Quentin Tarantino,John Travolta,Uma Thurman,Samuel L. Jackson,Bruce Willis,1826188,107928762.0,average,low



Star 1 with 6-10 movies:


Unnamed: 0,released_year,certificate,runtime,genre,imdb_rating,meta_score,director,star1,star2,star3,star4,no_of_votes,gross,director_category,star1_category
2,2008,UA,152,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0,average,average
3,1974,A,202,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0,low,average
8,2010,UA,148,"Action, Adventure, Sci-Fi",8.8,74.0,Christopher Nolan,Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot Page,Ken Watanabe,2067042,292576195.0,average,average
12,1966,A,161,Western,8.8,90.0,Sergio Leone,Clint Eastwood,Eli Wallach,Lee Van Cleef,Aldo Giuffrè,688390,6100000.0,low,average
15,1990,A,146,"Biography, Crime, Drama",8.7,90.0,Martin Scorsese,Robert De Niro,Ray Liotta,Joe Pesci,Lorraine Bracco,1020727,46836394.0,average,average



Star 1 with 11 or more movies:


Unnamed: 0,released_year,certificate,runtime,genre,imdb_rating,meta_score,director,star1,star2,star3,star4,no_of_votes,gross,director_category,star1_category
11,1994,UA,142,"Drama, Romance",8.8,82.0,Robert Zemeckis,Tom Hanks,Robin Wright,Gary Sinise,Sally Field,1809221,330252182.0,low,high
22,1998,R,169,"Drama, War",8.6,91.0,Steven Spielberg,Tom Hanks,Matt Damon,Tom Sizemore,Edward Burns,1235804,216540909.0,high,high
23,1999,A,189,"Crime, Drama, Fantasy",8.6,61.0,Frank Darabont,Tom Hanks,Michael Clarke Duncan,David Morse,Bonnie Hunt,1147794,136801374.0,low,high
80,1995,U,81,"Animation, Adventure, Comedy",8.3,95.0,John Lasseter,Tom Hanks,Tim Allen,Don Rickles,Jim Varney,887429,191796233.0,low,high
111,2010,U,103,"Animation, Adventure, Comedy",8.2,92.0,Lee Unkrich,Tom Hanks,Tim Allen,Joan Cusack,Ned Beatty,757032,415004880.0,low,high


## Star 2 categorization

In [340]:
# Count the movies per second-billed star: one row per name, most frequent first
star2_counts = (
    imdb_df['star2']
    .value_counts()
    .rename_axis('star2')
    .reset_index(name='count')
)
star2_counts

Unnamed: 0,star2,count
0,Emma Watson,7
1,Matt Damon,5
2,Kate Winslet,4
3,Julie Delpy,4
4,Brad Pitt,4
...,...,...
594,Mathieu Amalric,1
595,Robert Rodriguez,1
596,Hee Jae,1
597,Edgar Ramírez,1


In [341]:
# Apply categorization
star2_counts['star2_category'] = star2_counts['count'].apply(categorize_count)

# Merge the star2 categories back onto the movie-level DataFrame.
# Any star2_category column left by an earlier run is dropped first;
# otherwise a re-run would produce star2_category_x / star2_category_y
# and the lookups below would raise KeyError.
imdb_df = (
    imdb_df
    .drop(columns='star2_category', errors='ignore')
    .merge(star2_counts[['star2', 'star2_category']], on='star2', how='left')
)

# Create DataFrames for each category
star2_group_1_5 = imdb_df[imdb_df['star2_category'] == 'low']
star2_group_6_10 = imdb_df[imdb_df['star2_category'] == 'average']
star2_group_11_plus = imdb_df[imdb_df['star2_category'] == 'high']

# print the DataFrames
print("Star 2 with 1-5 movies:")
display(star2_group_1_5.head())

print("\nStar 2 with 6-10 movies:")
display(star2_group_6_10.head())

Star 2 with 1-5 movies:


Unnamed: 0,released_year,certificate,runtime,genre,imdb_rating,meta_score,director,star1,star2,star3,star4,no_of_votes,gross,director_category,star1_category,star2_category
0,1994,A,142,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0,low,low,low
1,1972,A,175,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0,low,low,low
2,2008,UA,152,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0,average,average,low
3,1974,A,202,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0,low,average,low
4,1957,U,96,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0,low,low,low



Star 2 with 6-10 movies:


Unnamed: 0,released_year,certificate,runtime,genre,imdb_rating,meta_score,director,star1,star2,star3,star4,no_of_votes,gross,director_category,star1_category,star2_category
151,2011,UA,130,"Adventure, Drama, Fantasy",8.1,85.0,David Yates,Daniel Radcliffe,Emma Watson,Rupert Grint,Michael Gambon,764493,381011219.0,low,average,average
220,2012,UA,103,"Drama, Romance",8.0,67.0,Stephen Chbosky,Logan Lerman,Emma Watson,Ezra Miller,Paul Rudd,462252,17738570.0,low,low,average
338,2004,U,142,"Adventure, Family, Fantasy",7.9,82.0,Alfonso Cuarón,Daniel Radcliffe,Emma Watson,Rupert Grint,Richard Griffiths,552493,249358727.0,low,average,average
389,2019,U,135,"Drama, Romance",7.8,91.0,Greta Gerwig,Saoirse Ronan,Emma Watson,Florence Pugh,Eliza Scanlen,143250,108101214.0,low,low,average
515,2010,A,146,"Adventure, Family, Fantasy",7.7,65.0,David Yates,Daniel Radcliffe,Emma Watson,Rupert Grint,Bill Nighy,479120,295983305.0,low,average,average


## Star 3 categorization

In [343]:
# Count the movies per third-billed star: one row per name, most frequent first
star3_counts = (
    imdb_df['star3']
    .value_counts()
    .rename_axis('star3')
    .reset_index(name='count')
)
star3_counts

Unnamed: 0,star3,count
0,Rupert Grint,5
1,Joe Pesci,4
2,Rachel McAdams,4
3,Jennifer Connelly,4
4,Samuel L. Jackson,4
...,...,...
621,Lola Dueñas,1
622,Craig Bierko,1
623,Saif Ali Khan,1
624,Billy Crudup,1


In [344]:
# Apply categorization
star3_counts['star3_category'] = star3_counts['count'].apply(categorize_count)

# Merge the star3 categories back onto the movie-level DataFrame.
# Any star3_category column left by an earlier run is dropped first;
# otherwise a re-run would produce star3_category_x / star3_category_y
# and the lookups below would raise KeyError.
imdb_df = (
    imdb_df
    .drop(columns='star3_category', errors='ignore')
    .merge(star3_counts[['star3', 'star3_category']], on='star3', how='left')
)

# Create DataFrames for each category
star3_group_1_5 = imdb_df[imdb_df['star3_category'] == 'low']
star3_group_6_10 = imdb_df[imdb_df['star3_category'] == 'average']
star3_group_11_plus = imdb_df[imdb_df['star3_category'] == 'high']

# print the DataFrames
print("Star 3 with 1-5 movies:")
display(star3_group_1_5.head())

Star 3 with 1-5 movies:


Unnamed: 0,released_year,certificate,runtime,genre,imdb_rating,meta_score,director,star1,star2,star3,star4,no_of_votes,gross,director_category,star1_category,star2_category,star3_category
0,1994,A,142,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0,low,low,low,low
1,1972,A,175,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0,low,low,low,low
2,2008,UA,152,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0,average,average,low,low
3,1974,A,202,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0,low,average,low,low
4,1957,U,96,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0,low,low,low,low


## Star 4 categorization

In [346]:
# Count the movies per fourth-billed star: one row per name, most frequent first
star4_counts = (
    imdb_df['star4']
    .value_counts()
    .rename_axis('star4')
    .reset_index(name='count')
)
star4_counts

Unnamed: 0,star4,count
0,Michael Caine,4
1,Mark Ruffalo,3
2,Catherine Keener,3
3,Stellan Skarsgård,2
4,Kevin Bacon,2
...,...,...
666,Anthony Chau-Sang Wong,1
667,Keira Knightley,1
668,Jessica Lange,1
669,Jason Lee,1


In [347]:
# Apply categorization
star4_counts['star4_category'] = star4_counts['count'].apply(categorize_count)

# Merge the star4 categories back onto the movie-level DataFrame.
# Any star4_category column left by an earlier run is dropped first;
# otherwise a re-run would produce star4_category_x / star4_category_y
# and the lookups below would raise KeyError.
imdb_df = (
    imdb_df
    .drop(columns='star4_category', errors='ignore')
    .merge(star4_counts[['star4', 'star4_category']], on='star4', how='left')
)

# Create DataFrames for each category
star4_group_1_5 = imdb_df[imdb_df['star4_category'] == 'low']
star4_group_6_10 = imdb_df[imdb_df['star4_category'] == 'average']
star4_group_11_plus = imdb_df[imdb_df['star4_category'] == 'high']

# print the DataFrames
print("Star 4 with 1-5 movies:")
display(star4_group_1_5.head())

Star 4 with 1-5 movies:


Unnamed: 0,released_year,certificate,runtime,genre,imdb_rating,meta_score,director,star1,star2,star3,star4,no_of_votes,gross,director_category,star1_category,star2_category,star3_category,star4_category
0,1994,A,142,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0,low,low,low,low,low
1,1972,A,175,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0,low,low,low,low,low
2,2008,UA,152,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0,average,average,low,low,low
3,1974,A,202,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0,low,average,low,low,low
4,1957,U,96,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0,low,low,low,low,low


## Cleaned CSV

In [368]:
# Write the DataFrame, including the added *_category columns, without the index
imdb_df.to_csv(path_or_buf='director_star_grouping_cleaned.csv', index=False)