In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import joblib

import sys
sys.path.append("..") 
from src.eda_utils import conv_numerico


In [140]:
path = '../data/processed/data_cleaned.csv'
df = pd.read_csv(path)

In [141]:
# Colunas numéricas e categóricas para pré-processamento e drop de colunas nos dados de treino e teste

num_cols = ['Released_Year', 'Runtime', 'Meta_score', 'No_of_Votes','Gross']
cat_cols = ['Certificate']
drop_cols = ['Series_Title', 'Overview', 'Director', 'Star1', 'Star2', 'Star3', 'Star4', 'IMDB_Rating']


In [142]:
def fe_genre (df):
    df[['Genre1', 'Genre2', 'Genre3']] = df['Genre'].str.split(',', expand=True)

    generos = pd.concat([df['Genre1'], 
                        df['Genre2'], 
                        df['Genre3']])

    # Dropando os valores NaN, removendo espaços em branco e pegando os valores únicos
    generos_unicos = generos.dropna().str.strip().unique()
    print(f"Número de gêneros únicos na base: {len(generos_unicos)}\n")
    print(f"Gêneros presentes na base:{generos_unicos}\n")

    # Criando colunas para cada gênero com variável booleana
    for genero in generos_unicos:
        df[genero] = df['Genre'].str.contains(genero, na=False).astype(int)

    # Dropando as colunas 'Genre', 'Genre1', 'Genre2' e 'Genre3'
    df_genres = df.drop(columns=['Genre', 'Genre1', 'Genre2', 'Genre3'])

    # Shape do novo dataframe
    print(f"Shape do dataframe: {df_genres.shape}\n")
    print(f"Colunas do novo dataframe: {df_genres.columns}\n")

    return df_genres

In [143]:
def fe_directors(train_df, test_df):
    top_n = [10, 25, 50]

    # Criando listas com os 50 melhores diretores do treino
    top50_dir_metascore = train_df.groupby('Director')['Meta_score'].mean().sort_values(ascending=False).head(50).index.tolist()
    top50_dir_no_of_votes = train_df.groupby('Director')['No_of_Votes'].mean().sort_values(ascending=False).head(50).index.tolist()
    top50_dir_filmes = train_df['Director'].value_counts().head(50).index.tolist()

    # Função para criar colunas booleanas no dataset
    def criar_colunas_top_n(df, top_lists, top_n):

        for n in top_n:
            df[f'top{n}_dir_metascore'] = df['Director'].isin(top_lists['metascore'][:n]).astype(int)
            df[f'top{n}_dir_no_of_votes'] = df['Director'].isin(top_lists['no_of_votes'][:n]).astype(int)
            df[f'top{n}_dir_filmes'] = df['Director'].isin(top_lists['filmes'][:n]).astype(int)

    # Dicionário com listas top50
    top_lists = {
        'metascore': top50_dir_metascore,
        'no_of_votes': top50_dir_no_of_votes,
        'filmes': top50_dir_filmes
    }

    # Criando as colunas no dataset de treino
    criar_colunas_top_n(train_df, top_lists, top_n)

    # Criando as colunas no dataset de teste
    criar_colunas_top_n(test_df, top_lists, top_n)

    return train_df, test_df

In [144]:
def fe_actors(train_df, test_df):
    # Feature Engineering da coluna 'Actors'

    atores = []

    for col in ['Star1', 'Star2', 'Star3', 'Star4']:
        temp_df = train_df[[col, 'Meta_score', 'No_of_Votes']].copy()
        temp_df.columns = ['Ator', 'Meta_score', 'No_of_Votes']
        atores.append(temp_df)

    atores_df = pd.concat(atores).dropna()

    # Calcular médias e quantidade de filmes por ator
    ator_metascore = atores_df.groupby('Ator')['Meta_score'].mean()
    ator_votes = atores_df.groupby('Ator')['No_of_Votes'].mean()
    ator_filmes = atores_df.groupby('Ator').size()

    # Top atores usando loop
    top_n = [10, 25, 50]

    # Dicionários para guardar os tops
    top_metascore = {}
    top_votes = {}
    top_filmcount = {}

    for n in top_n:
        top_metascore[n] = set(ator_metascore.nlargest(n).index)
        top_votes[n] = set(ator_votes.nlargest(n).index)
        top_filmcount[n] = set(ator_filmes.nlargest(n).index)

    # Função para contar
    def top_atores(row, top_set):
        atores = [row['Star1'], row['Star2'], row['Star3'], row['Star4']]
        return sum(1 for ator in atores if ator in top_set)

    for n in top_n:
        train_df[f'top{n}_atores_metascore'] = train_df.apply(lambda x: top_atores(x, top_metascore[n]), axis=1)
        train_df[f'top{n}_atores_votes'] = train_df.apply(lambda x: top_atores(x, top_votes[n]), axis=1)
        train_df[f'top{n}_atores_filmes'] = train_df.apply(lambda x: top_atores(x, top_filmcount[n]), axis=1)

    for n in top_n:
        test_df[f'top{n}_atores_metascore'] = test_df.apply(lambda x: top_atores(x, top_metascore[n]), axis=1)
        test_df[f'top{n}_atores_votes'] = test_df.apply(lambda x: top_atores(x, top_votes[n]), axis=1)
        test_df[f'top{n}_atores_filmes'] = test_df.apply(lambda x: top_atores(x, top_filmcount[n]), axis=1)

    return train_df, test_df

In [145]:
def preparar_e_salvar_dados(train_df, test_df, drop_cols):
    # Guardando títulos para referência
    train_titles = train_df['Series_Title'].reset_index(drop=True)
    test_titles = test_df['Series_Title'].reset_index(drop=True)

    # Definindo as features e o target
    X_train = train_df.drop(drop_cols, axis=1).reset_index(drop=True)  
    y_train = train_df['IMDB_Rating'].reset_index(drop=True)                 

    X_test = test_df.drop(drop_cols, axis=1).reset_index(drop=True)    
    y_test = test_df['IMDB_Rating'].reset_index(drop=True)                  

    # Verificando o shape dos datasets
    print(f"X_train shape: {X_train.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_test shape: {y_test.shape}")

    # Verificando se há valores nulos nas features
    print(f"\nValores nulos em X_train: {X_train.isnull().sum().sum()}")
    print(f"Valores nulos em X_test: {X_test.isnull().sum().sum()}")

    # Verificando se há valores nulos na target
    print(f"\nValores nulos em y_train: {y_train.isnull().sum()}")
    print(f"Valores nulos em y_test: {y_test.isnull().sum()}")

    # Verificando as colunas
    print(f"Quantidade de colunas em X_train: {X_train.shape[1]}")
    print(f"Colunas em X_train: {X_train.columns.tolist()}")

    X_train.head()

    df_train_model = X_train.copy()
    df_train_model['IMDB_Rating'] = y_train
    df_train_model.to_csv('../data/train/data_train_model.csv', index=False)

    df_test_model = X_test.copy()
    df_test_model['IMDB_Rating'] = y_test
    df_test_model.to_csv('../data/test/data_test_model.csv', index=False)

    return train_titles, test_titles, X_train, y_train, X_test, y_test

In [146]:
# Pipeline de Feature Engineering

def pipeline_feature_engineering(df, drop_cols):
    # Cadeia de transformações
    df_processed = (df.copy()
                   .pipe(fe_genre))
    
    train_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)
    
    # Aplicando transformações nos datasets divididos
    train_df, test_df = fe_directors(train_df.copy(), test_df.copy())
    train_df, test_df = fe_actors(train_df, test_df)
    
    # Preparação final
    results = preparar_e_salvar_dados(train_df, test_df, drop_cols)
    
    return results

In [147]:
# Definindo espaço de busca para otimização de hiperparâmetros

def espaco_busca_lr(trial):
    # Espaço de busca para regressão linear
    return {
        'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]),
    }

def espaco_busca_rf(trial):
    # Espaço de busca para o random forest
    return {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'random_state': 42 
    }

def espaco_busca_xgb(trial):
    # Espaço de busca para o XGBoost
    return {
        'n_estimators': trial.suggest_int('n_estimators', 100, 400, step=50),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step=0.01),
        'random_state': 42,
        'eval_metric': 'rmse'
    }


In [148]:
# Modelos utilizados e seus respectivos espaços de busca

MODEL_CONFIG = {
    "LinearRegression": {
        "classe_modelo": LinearRegression,
        "espaco_busca_modelo": espaco_busca_lr
    },
    "RandomForest": {
        "classe_modelo": RandomForestRegressor,
        "espaco_busca_modelo": espaco_busca_rf
    },
    "XGBoost": {
        "classe_modelo": XGBRegressor,
        "espaco_busca_modelo": espaco_busca_xgb
    }
}

In [149]:
def objective(trial, classe_modelo, espaco_busca, df, drop_cols, cat_cols, num_cols):
    
    # Executando o pipeline completo de feature engineering
    train_titles, test_titles, X_train, y_train, X_test, y_test = pipeline_feature_engineering(df, drop_cols)

    # Definindo o KFold para validação cruzada dos modelos
    kf = KFold(n_splits=3, shuffle=True, random_state=42)

    # Configurando o espaço de busca e o modelo
    params = espaco_busca(trial)
    model = classe_modelo(**params)

    preprocessor = ColumnTransformer(
            transformers=[
                ('num', 'passthrough', num_cols),
                ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), cat_cols)
            ]
        )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # Avaliando o modelo com validação cruzada
    scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

    # Valor de referência a ser otimizado, retorna mse negativo pois cross_val_score retorna valores negativos para métricas de erro
    mse_medio = -scores.mean()
    
    return mse_medio

In [150]:
# Otimização dos hiperparâmetros com Optuna

def otimizacao(trial, classe_modelo, espaco_busca):
    return objective(
        trial,
        classe_modelo=classe_modelo,
        espaco_busca=espaco_busca,
        df=df,
        drop_cols=drop_cols,
        cat_cols=cat_cols,
        num_cols=num_cols
    )

In [151]:
def plot_rf_feature_importance(rf_model, preprocessor, top_n=20):
    """
    Plota a importância das features do Random Forest
    
    Args:
        rf_model: Modelo Random Forest treinado
        preprocessor: Preprocessor do pipeline
        top_n: Número de features mais importantes a mostrar
    """
    
    importance = rf_model.feature_importances_

    # Obtendo os nomes das features processadas
    num_features = preprocessor.transformers_[0][2]  # colunas numéricas
    cat_encoder = preprocessor.named_transformers_['cat']
    cat_features = cat_encoder.get_feature_names_out(preprocessor.transformers_[1][2])  # nomes one-hot
    
    all_feature_names = list(num_features) + list(cat_features)

    # Garantir que os comprimentos batem
    if len(all_feature_names) != len(importance):
        raise ValueError(f"Número de features ({len(all_feature_names)}) "
                         f"≠ número de importâncias ({len(importance)})")

    # Criando DataFrame
    df_importance = pd.DataFrame({'Feature': all_feature_names, 'Importance': importance})
    df_importance = df_importance.sort_values(by='Importance', ascending=False).head(top_n)
    
    # Plot
    plt.figure(figsize=(10,6))
    sns.barplot(data=df_importance, x='Importance', y='Feature', palette='viridis')
    plt.title(f'Top {top_n} Feature Importance - Random Forest')
    plt.tight_layout()
    plt.show()


In [152]:
# Otimização de hiperparâmetros utilizando a função otimizacao e objective no optuna, treinamento e avaliação dos modelos

def treinar_e_otimizar_modelos(MODEL_CONFIG, df_original, drop_cols, num_cols, cat_cols, n_trials=10):

    modelos_treinados = {}
    best_params = {}
    resultados_test = {}
    
    # Executando o pipeline de FE uma vez para obter os dados processados
    train_titles, test_titles, X_train, y_train, X_test, y_test = pipeline_feature_engineering(
        df_original, drop_cols
    )
    print(f"FE concluído. Shape treino: {X_train.shape}, Shape teste: {X_test.shape}")

    for nome_modelo, config in MODEL_CONFIG.items():
        print(f"\nOtimizando {nome_modelo}...")

        # Criando estudo Optuna
        study = optuna.create_study(direction='minimize')
        
        # Função objective modificada que usa os dados já processados
        def objective(trial):
            # Definindo o KFold para validação cruzada dos modelos
            kf = KFold(n_splits=3, shuffle=True, random_state=42)

            # Configurando o espaço de busca e o modelo
            params = config['espaco_busca_modelo'](trial)
            model = config['classe_modelo'](**params)

            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', 'passthrough', num_cols),
                    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), cat_cols)
                ]
            )

            pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('regressor', model)
            ])

            # Avaliando o modelo com validação cruzada
            scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

            # Valor de referência a ser otimizado
            mse_medio = -scores.mean()
            
            return mse_medio
        
        study.optimize(objective, n_trials=n_trials)

        best_params[nome_modelo] = study.best_params
        print(f"Melhores parâmetros {nome_modelo}: {study.best_params}")

        # Treinando modelo final com melhores parâmetros
        modelo_final = config['classe_modelo'](**study.best_params)
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', 'passthrough', num_cols),
                ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), cat_cols)
            ]
        )

        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', modelo_final)
        ])

        pipeline.fit(X_train, y_train)
        modelos_treinados[nome_modelo] = pipeline

        # Avaliando no conjunto de teste
        y_pred = pipeline.predict(X_test)
        resultados_test[nome_modelo] = {
            'R2': r2_score(y_test, y_pred),
            'RMSE': mean_squared_error(y_test, y_pred, squared=False)
        }

        print(f"Métricas no teste {nome_modelo}: {resultados_test[nome_modelo]}")

    # Comparando os resultados dos modelos
    print("\nResultados dos modelos no conjunto de teste:")
    
    for nome_modelo, metrics in resultados_test.items():
        print(f"{nome_modelo}: R2 = {metrics['R2']:.4f}, RMSE = {metrics['RMSE']:.4f}")

    print("\nTreinamento e otimização concluídos.")
    
    # Retornando também os dados processados para uso posterior
    dados_processados = {
        'train_titles': train_titles,
        'test_titles': test_titles,
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

    return modelos_treinados, best_params, resultados_test, dados_processados

In [153]:
modelos, params, resultados, dados = treinar_e_otimizar_modelos(
    MODEL_CONFIG, 
    df,    
    drop_cols,        
    num_cols,       
    cat_cols,       
    n_trials=15)

[I 2025-09-04 16:17:52,091] A new study created in memory with name: no-name-6934493f-6482-48b5-b3b6-877332193b43


Número de gêneros únicos na base: 21

Gêneros presentes na base:['Crime' 'Action' 'Biography' 'Drama' 'Western' 'Comedy' 'Adventure'
 'Animation' 'Horror' 'Mystery' 'Film-Noir' 'Family' 'Romance' 'Sci-Fi'
 'War' 'Music' 'Thriller' 'Musical' 'Fantasy' 'Sport' 'History']

Shape do dataframe: (712, 35)

Colunas do novo dataframe: Index(['Series_Title', 'Released_Year', 'Certificate', 'Runtime',
       'IMDB_Rating', 'Overview', 'Meta_score', 'Director', 'Star1', 'Star2',
       'Star3', 'Star4', 'No_of_Votes', 'Gross', 'Crime', 'Action',
       'Biography', 'Drama', 'Western', 'Comedy', 'Adventure', 'Animation',
       'Horror', 'Mystery', 'Film-Noir', 'Family', 'Romance', 'Sci-Fi', 'War',
       'Music', 'Thriller', 'Musical', 'Fantasy', 'Sport', 'History'],
      dtype='object')

X_train shape: (569, 45)
y_train shape: (569,)
X_test shape: (143, 45)
y_test shape: (143,)

Valores nulos em X_train: 0
Valores nulos em X_test: 0

Valores nulos em y_train: 0
Valores nulos em y_test: 0
Quanti

[I 2025-09-04 16:17:52,163] Trial 0 finished with value: 0.03792708478189042 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 0.03792708478189042.
[I 2025-09-04 16:17:52,188] Trial 1 finished with value: 0.04846320939999391 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.03792708478189042.
[I 2025-09-04 16:17:52,209] Trial 2 finished with value: 0.03792708478189042 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 0.03792708478189042.
[I 2025-09-04 16:17:52,231] Trial 3 finished with value: 0.03792708478189042 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 0.03792708478189042.
[I 2025-09-04 16:17:52,259] Trial 4 finished with value: 0.03792708478189042 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 0.03792708478189042.
[I 2025-09-04 16:17:52,279] Trial 5 finished with value: 0.04846320939999391 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 0.037927084781890

Melhores parâmetros LinearRegression: {'fit_intercept': True}
Métricas no teste LinearRegression: {'R2': 0.5850022623896449, 'RMSE': 0.18644698600676912}

Otimizando RandomForest...


[I 2025-09-04 16:17:56,033] Trial 0 finished with value: 0.03263640809125224 and parameters: {'n_estimators': 700, 'max_depth': 19, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.03263640809125224.
[I 2025-09-04 16:17:56,443] Trial 1 finished with value: 0.03607598971041515 and parameters: {'n_estimators': 100, 'max_depth': 7, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.03263640809125224.
[I 2025-09-04 16:17:57,388] Trial 2 finished with value: 0.032658344679963314 and parameters: {'n_estimators': 200, 'max_depth': 12, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.03263640809125224.
[I 2025-09-04 16:17:57,870] Trial 3 finished with value: 0.032380024995780333 and parameters: {'n_estimators': 100, 'max_depth': 29, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.032380024995780333.
[I 2025-09-04 16:18:00,020] Trial 4 finished with value: 0.037499787554693555 and parameters: {'n_estimators': 600, 'max_depth': 16, 'min_samples_leaf': 10}. Best is trial 3 with

Melhores parâmetros RandomForest: {'n_estimators': 300, 'max_depth': 23, 'min_samples_leaf': 1}


[I 2025-09-04 16:18:19,983] A new study created in memory with name: no-name-4876e726-9a81-42f5-b05a-7526bd619045


Métricas no teste RandomForest: {'R2': 0.5952989932071745, 'RMSE': 0.18411943988337762}

Otimizando XGBoost...


[I 2025-09-04 16:18:20,339] Trial 0 finished with value: 0.03932030834050921 and parameters: {'n_estimators': 150, 'max_depth': 16, 'learning_rate': 0.3}. Best is trial 0 with value: 0.03932030834050921.
[I 2025-09-04 16:18:20,690] Trial 1 finished with value: 0.041654557985533684 and parameters: {'n_estimators': 400, 'max_depth': 13, 'learning_rate': 0.25}. Best is trial 0 with value: 0.03932030834050921.
[I 2025-09-04 16:18:21,144] Trial 2 finished with value: 0.04108051825177656 and parameters: {'n_estimators': 150, 'max_depth': 20, 'learning_rate': 0.17}. Best is trial 0 with value: 0.03932030834050921.
[I 2025-09-04 16:18:21,589] Trial 3 finished with value: 0.04161103658058674 and parameters: {'n_estimators': 250, 'max_depth': 23, 'learning_rate': 0.24000000000000002}. Best is trial 0 with value: 0.03932030834050921.
[I 2025-09-04 16:18:21,866] Trial 4 finished with value: 0.03668218794386154 and parameters: {'n_estimators': 150, 'max_depth': 7, 'learning_rate': 0.05}. Best is tr

Melhores parâmetros XGBoost: {'n_estimators': 250, 'max_depth': 5, 'learning_rate': 0.09}
Métricas no teste XGBoost: {'R2': 0.5277178601208141, 'RMSE': 0.19889930417604357}

Resultados dos modelos no conjunto de teste:
LinearRegression: R2 = 0.5850, RMSE = 0.1864
RandomForest: R2 = 0.5953, RMSE = 0.1841
XGBoost: R2 = 0.5277, RMSE = 0.1989

Treinamento e otimização concluídos.




In [154]:
best_model = modelos['RandomForest']
def comparar_imdb_previsto(best_model, dados_processados, nomes_filmes_col='test_titles'):
    # Dados de teste
    X_test = dados_processados['X_test']
    y_test = dados_processados['y_test']
    test_titles = dados_processados[nomes_filmes_col]

    # Previsões
    y_pred = best_model.predict(X_test)
    
    # Cria o DataFrame
    df_plot = pd.DataFrame({
        'Filme': test_titles,
        'IMDB_Real': y_test,
        'IMDB_Previsto': y_pred
    })
    
    # Arredonda a coluna 'IMDB_Previsto' para 1 casa decimal
    df_plot['IMDB_Previsto'] = df_plot['IMDB_Previsto'].round(1)

    # Exportando para csv
    df_plot.to_csv("../data/processed/valores_previstos.csv", sep=";")
    
    # Exibe o DataFrame
    print(df_plot)

# Chame a função para criar e mostrar o DataFrame com os valores arredondados
comparar_imdb_previsto(best_model, dados, nomes_filmes_col='test_titles')

                       Filme  IMDB_Real  IMDB_Previsto
0    Star Trek Into Darkness        7.7            7.8
1               Kaze tachinu        7.8            7.9
2                  Gully Boy        8.0            8.0
3            The Incredibles        8.0            8.0
4                  Cast Away        7.8            7.8
..                       ...        ...            ...
138           Doctor Zhivago        8.0            7.9
139       Back to the Future        8.5            8.4
140      There Will Be Blood        8.2            8.0
141     (500) Days of Summer        7.7            7.8
142                   WALL·E        8.4            8.2

[143 rows x 3 columns]




In [155]:
# Variaveis do filme Shawshank Redemption

filme = {'Series_Title': 'The Shawshank Redemption',
        'Released_Year': '1994',
        'Certificate': 'A',
        'Runtime': '142 min',
        'Genre': 'Drama',
        'Overview': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
        'Meta_score': 80.0,
        'Director': 'Frank Darabont',
        'Star1': 'Tim Robbins',
        'Star2': 'Morgan Freeman',
        'Star3': 'Bob Gunton',
        'Star4': 'William Sadler',
        'No_of_Votes': 2343110,
        'Gross': '28,341,469'}

In [156]:
# Exportando melhor modelo para .pkl
joblib.dump(best_model, "..\models\RandomForest.pkl")

  joblib.dump(best_model, "..\models\RandomForest.pkl")


['..\\models\\RandomForest.pkl']

In [159]:
# Prevendo IMDB_Rating de filmes

melhor_modelo = joblib.load("../models/RandomForest.pkl")

# Prevendo IMDB_Rating de filmes

melhor_modelo = joblib.load("../models/RandomForest.pkl")

def prever_imdb_rating(filme, melhor_modelo):
    melhor_modelo = modelos['RandomForest']
    
    df_filme = pd.DataFrame([filme])
    df_filme = conv_numerico(df_filme)
    imdb_rating_pred = melhor_modelo.predict(df_filme)
    
    # A linha abaixo foi corrigida para pegar apenas o valor do título
    titulo_filme = df_filme['Series_Title'].iloc[0]
    
    print(f"IMDB previsto para o filme {titulo_filme} é: {imdb_rating_pred[0]:.1f}")
    

    return df_filme

prever_imdb_rating(filme,melhor_modelo)

IMDB previsto para o filme The Shawshank Redemption é: 8.8


Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,The Shawshank Redemption,1994,A,142,Drama,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
