#### Importando bibliotecas necessárias

In [25]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, f1_score
from sklearn.preprocessing import Binarizer

#### Lendo dados pré-processados

In [26]:
df = pd.read_csv('db_polished/db_final.tsv', delimiter='\t')

#### Filtrando as categorias relevantes

In [27]:
categories = ['actor', 'actress', 'writer', 'director']
df = df.loc[df['category'].isin(categories)]

#### Calculando a medianas das avaliações para cada escritor, diretor e ator

In [28]:
writer_avg_rating = df.loc[df['category'] == 'writer'].groupby('primaryName')['averageRating'].median().reset_index()
director_avg_rating = df.loc[df['category'] == 'director'].groupby('primaryName')['averageRating'].median().reset_index()
actor_avg_rating = df.loc[df['category'].isin(['actor', 'actress'])].groupby('primaryName')['averageRating'].median().reset_index()

#### Renomeando colunas para facilitar os merges

In [29]:
writer_avg_rating.rename(columns={'averageRating': 'writer_avg'}, inplace=True)
director_avg_rating.rename(columns={'averageRating': 'director_avg'}, inplace=True)
actor_avg_rating.rename(columns={'averageRating': 'actor_avg'}, inplace=True)

#### Mergendo dos dados originais com as médiaiys de avaliações

In [None]:
movies = df.drop_duplicates(subset=['originalTitle'])
movies = movies.merge(writer_avg_rating, on='primaryName', how='left')
movies = movies.merge(director_avg_rating, on='primaryName', how='left')
movies = movies.merge(actor_avg_rating, on='primaryName', how='left')

#### Preenchendo NaNs com 0 para evitar problemas durante o treinamento

In [None]:
movies['writer_avg'] = movies['writer_avg'].fillna(0)
movies['director_avg'] = movies['director_avg'].fillna(0)
movies['actor_avg'] = movies['actor_avg'].fillna(0)

#### Selecionando as colunas submissas para o modelo  

In [None]:
X = movies[['writer_avg', 'director_avg', 'actor_avg']]
y = movies['averageRating']

#### Dividindo os dados em treino e teste, com teste em 20% e treino em 80%

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Treinando o modelo de Regressão Linear; Realizando a previsão

In [None]:
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

y_pred_lr = model_lr.predict(X_test)

#### Treinando o modelo de Árvore de Decisão; Realizando a previsão

In [32]:
model_dt = DecisionTreeRegressor(random_state=42)
model_dt.fit(X_train, y_train)

y_pred_dt = model_dt.predict(X_test)

#### Binarização das previsões para calcular métricas de classificação

In [33]:
binarizer = Binarizer(threshold=5.0)
y_test_bin = binarizer.fit_transform(y_test.values.reshape(-1, 1))
y_pred_lr_bin = binarizer.transform(y_pred_lr.reshape(-1, 1))
y_pred_dt_bin = binarizer.transform(y_pred_dt.reshape(-1, 1))

#### Avaliando os modelos

In [34]:
mse_lr = mean_squared_error(y_test, y_pred_lr)
mse_dt = mean_squared_error(y_test, y_pred_dt)

accuracy_lr = accuracy_score(y_test_bin, y_pred_lr_bin)
accuracy_dt = accuracy_score(y_test_bin, y_pred_dt_bin)

precision_lr = precision_score(y_test_bin, y_pred_lr_bin)
precision_dt = precision_score(y_test_bin, y_pred_dt_bin)

f1_lr = f1_score(y_test_bin, y_pred_lr_bin)
f1_dt = f1_score(y_test_bin, y_pred_dt_bin)

print(f'Modelo Linear Regression - Mean Squared Error: {mse_lr}')
print(f'Modelo Decision Tree - Mean Squared Error: {mse_dt}')

print(f'Modelo Linear Regression - Accuracy: {accuracy_lr}')
print(f'Modelo Decision Tree - Accuracy: {accuracy_dt}')

print(f'Modelo Linear Regression - Precision: {precision_lr}')
print(f'Modelo Decision Tree - Precision: {precision_dt}')

print(f'Modelo Linear Regression - F1 Score: {f1_lr}')
print(f'Modelo Decision Tree - F1 Score: {f1_dt}')

Modelo Linear Regression - Mean Squared Error: 1.752959228766583
Modelo Decision Tree - Mean Squared Error: 1.130011950246099
Modelo Linear Regression - Accuracy: 0.903784585816641
Modelo Decision Tree - Accuracy: 0.9204744051264433
Modelo Linear Regression - Precision: 0.903784585816641
Modelo Decision Tree - Precision: 0.9277228592541301
Modelo Linear Regression - F1 Score: 0.9494609763624666
Modelo Decision Tree - F1 Score: 0.9574120884395098


#### Funções para prever a avaliação de um novo filme

In [None]:
def get_avg_rating(names, rating_dict):
    ratings = [rating_dict[name] for name in names if name in rating_dict]
    return sum(ratings) / len(ratings) if ratings else rating_dict.median()

def prever_classificacao(writer, directors, actors, model):
    writer_avg = get_avg_rating(writer, writer_avg_rating.set_index('primaryName')['writer_avg'])
    director_avg = get_avg_rating(directors, director_avg_rating.set_index('primaryName')['director_avg'])
    actor_avg = get_avg_rating(actors, actor_avg_rating.set_index('primaryName')['actor_avg'])
    previsao = model.predict([[writer_avg, director_avg, actor_avg]])
    return previsao[0]

#### Exemplo de uso

In [45]:
writer = ['Joan Crawford']
directors = []
actors = []

print(f'Previsão de classificação (Regressão Linear): +{prever_classificacao(writer, directors, actors, model_lr):.2f}')
print(f'Previsão de classificação (Árvore de Decisão): +{prever_classificacao(writer, directors, actors, model_dt):.2f}')



Previsão de classificação (Regressão Linear): +7.32
Previsão de classificação (Árvore de Decisão): +6.79


