In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read the IMDb dataset from the CSV file referenced by a relative path.
df = pd.read_csv(filepath_or_buffer='imdb_dataset.csv')

# Bare trailing expression so the notebook renders the loaded frame.
df

Unnamed: 0,Title,IMDb Rating,Year,Certificates,Genre,Director,Star Cast,MetaScore,Poster-src,Duration (minutes)
0,End of the Spear,6.8,2005,PG-13,Adventure,Jim Hanon,Louie LeonardoChad AllenJack Guzman,45.0,https://m.media-amazon.com/images/M/MV5BMTYxOT...,108.0
1,Elvira Madigan,7.0,1967,PG,Biography,Bo Widerberg,Pia DegermarkThommy BerggrenLennart Malmer,66.0,https://m.media-amazon.com/images/M/MV5BMmY2Nj...,91.0
2,The Kid Stays in the Picture,7.3,2002,R,Documentary,Nanette Burstein,Robert EvansEddie AlbertPeter Bart,75.0,https://m.media-amazon.com/images/M/MV5BZjhiZm...,93.0
3,It Ain't Over,8.2,2022,PG,Documentary,Sean Mullin,Andy AndresRoger AngellMarty Appel,79.0,https://m.media-amazon.com/images/M/MV5BZWViYW...,99.0
4,Mahler,7.0,1974,PG,Biography,Ken Russell,Robert PowellGeorgina HaleLee Montague,66.0,https://m.media-amazon.com/images/M/MV5BYzY4Mz...,115.0
...,...,...,...,...,...,...,...,...,...,...
3168,The Lord of the Rings: The Return of the King,9.0,2003,PG-13,Action,Peter Jackson,J.R.R. TolkienFran WalshPhilippa Boyens,94.0,https://m.media-amazon.com/images/M/MV5BNzA5ZD...,201.0
3169,Schindler's List,9.0,1993,R,Biography,Steven Spielberg,Liam NeesonRalph FiennesBen Kingsley,95.0,https://m.media-amazon.com/images/M/MV5BNDE4OT...,195.0
3170,The Dark Knight,9.0,2008,PG-13,Action,Christopher Nolan,Jonathan NolanChristopher NolanDavid S. Goyer,84.0,https://m.media-amazon.com/images/M/MV5BMTMxNT...,152.0
3171,The Godfather,9.2,1972,R,Crime,Francis Ford Coppola,Mario PuzoFrancis Ford Coppola,100.0,https://m.media-amazon.com/images/M/MV5BM2MyNj...,175.0


In [3]:
# Derive a "Main Actor" feature from the free-text "Star Cast" column.
# The pattern keeps the first "Capitalised Capitalised" word pair it finds, so
# names that do not have that shape are skipped: the displayed row whose cast
# starts with "J.R.R. Tolkien" ends up with "Fran Walsh". Rows without any
# match get a missing value.
# NOTE(review): the cast strings look like names concatenated without a
# separator - confirm that the first match really is the lead actor.
name_pattern = r'([A-Z][a-z]+ [A-Z][a-z]+)'
df['Main Actor'] = df['Star Cast'].str.extract(name_pattern, expand=False)

# Discard the columns that are not passed on to the models.
unused_columns = ["Poster-src", "Title", "Star Cast"]
df = df.drop(unused_columns, axis=1)

df

Unnamed: 0,IMDb Rating,Year,Certificates,Genre,Director,MetaScore,Duration (minutes),Main Actor
0,6.8,2005,PG-13,Adventure,Jim Hanon,45.0,108.0,Louie Leonardo
1,7.0,1967,PG,Biography,Bo Widerberg,66.0,91.0,Pia Degermark
2,7.3,2002,R,Documentary,Nanette Burstein,75.0,93.0,Robert Evans
3,8.2,2022,PG,Documentary,Sean Mullin,79.0,99.0,Andy Andres
4,7.0,1974,PG,Biography,Ken Russell,66.0,115.0,Robert Powell
...,...,...,...,...,...,...,...,...
3168,9.0,2003,PG-13,Action,Peter Jackson,94.0,201.0,Fran Walsh
3169,9.0,1993,R,Biography,Steven Spielberg,95.0,195.0,Liam Neeson
3170,9.0,2008,PG-13,Action,Christopher Nolan,84.0,152.0,Jonathan Nolan
3171,9.2,1972,R,Crime,Francis Ford Coppola,100.0,175.0,Mario Puzo


In [4]:
# Report how many rows the frame holds after the column clean-up.
size_message = f"The dataset contains: {len(df)} elements"
print(size_message)

The dataset contains: 3173 elements


In [5]:
# Show the dtype of every column; the columns reported as `object` are the
# text columns that the following cell converts to integer codes.
df.dtypes

IMDb Rating           float64
Year                    int64
Certificates           object
Genre                  object
Director               object
MetaScore             float64
Duration (minutes)    float64
Main Actor             object
dtype: object

In [6]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every text column so the tree models can consume it.
#
# One encoder is fitted per column and kept in `label_encoders`. Refitting a
# single shared encoder (as this cell used to do) keeps only the mapping of the
# last column, so encoded values - including the "Certificates" target - could
# not be translated back to their original labels. With the dict, predictions
# can be decoded via `label_encoders["Certificates"].inverse_transform(...)`.
#
# The former `astype('category')` casts were removed: their result was
# overwritten by the encoder output immediately afterwards.
#
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# values; OrdinalEncoder may be the better fit for the feature columns. The
# encoder type is left as-is so the encoding logic is unchanged.
label_encoders = {}
for column in ("Main Actor", "Director", "Certificates", "Genre"):
    # `le` is rebound on every iteration; after the loop it refers to the
    # "Genre" encoder, which is what the name held before this change.
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Confirm that every column is numeric after encoding.
df.dtypes

IMDb Rating           float64
Year                    int64
Certificates            int32
Genre                   int32
Director                int32
MetaScore             float64
Duration (minutes)    float64
Main Actor              int32
dtype: object

In [7]:
# Render the frame after encoding; the text columns now hold integer codes.
df

Unnamed: 0,IMDb Rating,Year,Certificates,Genre,Director,MetaScore,Duration (minutes),Main Actor
0,6.8,2005,6,1,886,45.0,108.0,1244
1,7.0,1967,5,3,211,66.0,91.0,1640
2,7.3,2002,8,6,1382,75.0,93.0,1721
3,8.2,2022,5,6,1766,79.0,99.0,113
4,7.0,1974,5,3,1069,66.0,115.0,1730
...,...,...,...,...,...,...,...,...
3168,9.0,2003,6,0,1529,94.0,201.0,638
3169,9.0,1993,8,3,1863,95.0,195.0,1222
3170,9.0,2008,6,0,348,84.0,152.0,1036
3171,9.2,1972,8,5,589,100.0,175.0,1328


In [8]:
# "Certificates" is the label to predict; every other column is a feature.
target_column = "Certificates"
feature_frame = df.drop(columns=[target_column])
target_series = df[target_column]

# Shuffle with a fixed seed and hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    shuffle=True,
    random_state=42,
    test_size=0.2,
)

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 1000 trees and a fixed seed; it is only constructed here
# and fitted in a separate cell.
rf_settings = {"n_estimators": 1000, "random_state": 42}
rf_model = RandomForestClassifier(**rf_settings)

In [10]:
# Train the random forest on the training split.
rf_model.fit(X=X_train, y=y_train)

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict the held-out split with the fitted random forest.
y_pred = rf_model.predict(X_test)

# Precision, recall and F1 use the 'weighted' average over classes;
# zero_division=0 makes an undefined per-class score count as 0.
weighted_kwargs = {"average": 'weighted', "zero_division": 0}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_kwargs)
recall = recall_score(y_test, y_pred, **weighted_kwargs)
f1 = f1_score(y_test, y_pred, **weighted_kwargs)

# A single print call with a newline separator emits exactly the same text as
# printing each piece on its own.
print(
    "Random Forest \n\n",
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1}",
    "\n\n",
    sep="\n",
)


Random Forest 


Accuracy: 0.7149606299212599
Precision: 0.6971259082105546
Recall: 0.7149606299212599
F1-Score: 0.7015180624545825





In [12]:
from xgboost import XGBClassifier

# Gradient-boosted classifier with the library's default hyper-parameters.
# The seed is pinned to 42, the value already used for the train/test split and
# the random forest, so the model no longer relies on an implicit default seed
# and stays reproducible if randomised options (row/column subsampling) are
# enabled later.
# NOTE(review): with the default settings the seed is not expected to change
# the reported scores - confirm by re-running the evaluation cell.
xgb_model = XGBClassifier(random_state=42)

In [13]:
# Train the XGBoost classifier on the training split.
xgb_model.fit(X=X_train, y=y_train)

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict the held-out split with the fitted XGBoost model.
# Note: this rebinds `y_pred` and the metric variables set by the random
# forest evaluation cell.
y_pred = xgb_model.predict(X_test)

# Options shared by the three class-averaged metrics.
averaging = dict(average='weighted', zero_division=0)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **averaging)
recall = recall_score(y_test, y_pred, **averaging)
f1 = f1_score(y_test, y_pred, **averaging)

# Build the report line by line, then emit it in one go; the joined text is
# identical to printing each line separately.
report_lines = [
    "XGBoost \n",
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1}",
]
print("\n".join(report_lines))

XGBoost 

Accuracy: 0.7275590551181103
Precision: 0.7150715542476899
Recall: 0.7275590551181103
F1-Score: 0.7207547176999116
