In [61]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so it also hides pandas
# SettingWithCopyWarning and library deprecation notices -- consider narrowing
# it to specific categories once the notebook runs cleanly.
warnings.simplefilter('ignore')

In [62]:
import ast
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd


In [63]:
# DB libraries
import psycopg2
from postgres_data import host
from postgres_data import password
from postgres_data import user
from sqlalchemy import create_engine

In [64]:
# ML libraries
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [65]:
# Build the PostgreSQL connection URL from the credentials imported from
# postgres_data (user, password, host); the port and database name are fixed.
connection_url = f'postgresql://{user}:{password}@{host}:5432/dfph644rcv92i9'

# Engine used by the following cell to open a connection.
alchemyEngine = create_engine(connection_url)

In [66]:
# Connect to the PostgreSQL server only for the duration of the query.
# The `with` block closes the connection even when the read raises, which the
# previous explicit connect()/close() pair did not guarantee.
with alchemyEngine.connect() as dbConnection:
    # Read the whole `meta` table into a DataFrame.
    df = pd.read_sql("select * from meta", dbConnection)

In [67]:
# Keep only the columns used for the ML table.
# `.copy()` makes ml_table an independent frame: later cells drop and re-add
# columns on it, and doing that on a bare selection of `df` is the pattern
# pandas flags with SettingWithCopyWarning.
ml_table = df[['movie_id','title','release_date','movie_popularity', 
            'vote_average', 'vote_count','budget','revenue', 
            'ratings','studios','genres','actor_popularity_mean',
            'director_popularity_mean']].copy()
ml_table

Unnamed: 0,movie_id,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,studios,genres,actor_popularity_mean,director_popularity_mean
0,671,Harry Potter and the Philosopher's Stone,2001-11-16,268.472,7.9,21429,125000000,976475550,PG,"['Warner Bros. Pictures', ' Heyday Films', ' O...","['Adventure', 'Fantasy']",2.993965,2.566
1,557,Spider-Man,2002-05-01,480.954,7.2,14421,139000000,821708551,PG-13,"['Other', ' Columbia Pictures', ' Sony Picture...","['Fantasy', 'Action']",2.387022,2.914
2,672,Harry Potter and the Chamber of Secrets,2002-11-13,246.027,7.7,17294,100000000,876688482,PG,"['Warner Bros. Pictures', ' Heyday Films', ' O...","['Adventure', 'Fantasy']",2.696712,2.566
3,673,Harry Potter and the Prisoner of Azkaban,2004-05-31,225.882,8.0,17001,130000000,789804554,PG,"['Warner Bros. Pictures', ' Other', ' Heyday F...","['Adventure', 'Fantasy']",3.523069,3.333
4,674,Harry Potter and the Goblet of Fire,2005-11-16,244.428,7.8,16341,150000000,895921036,PG-13,"['Warner Bros. Pictures', ' Heyday Films', ' O...","['Adventure', 'Fantasy', 'Family']",3.234944,2.695
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,610253,Halloween Kills,2021-10-14,616.978,6.9,1251,20000000,127000000,R,"['Universal Pictures', ' Other', ' Other', ' O...","['Horror', 'Thriller']",3.464000,2.385
70,576845,Last Night in Soho,2021-10-21,685.843,7.5,458,43000000,19000000,R,"['Other', ' Other', ' Other', ' Other', ' Othe...","['Horror', 'Mystery', 'Thriller']",2.869283,7.468
71,524434,Eternals,2021-11-03,1746.171,7.1,1170,200000000,336000000,PG-13,['Marvel Studios'],"['Action', 'Adventure', 'Science Fiction']",5.096703,2.397
72,425909,Ghostbusters: Afterlife,2021-11-11,544.722,7.5,113,75000000,63000000,PG-13,"['Columbia Pictures', ' Other', ' Other', ' Ot...","['Comedy', 'Fantasy']",4.924763,4.999


In [68]:
def _parse_list_cell(value):
    """Convert one ``studios``/``genres`` cell into a list of clean strings.

    The cells displayed above look like the text form of a Python list, e.g.
    ``"['Warner Bros. Pictures', ' Heyday Films']"``.  The previous code sliced
    off the first/last character and split on ``','``, which left the quote
    characters and leading spaces inside every item, broke names containing a
    comma, and turned missing values into junk (``str(None)[1:-1] == 'on'``).

    Parameters
    ----------
    value : str, list, tuple, None or float
        Raw cell value.  Lists/tuples are accepted as-is so the cell can be
        re-run on an already-parsed column.

    Returns
    -------
    list of str
        Items with surrounding whitespace removed; empty items are dropped.
        Missing values (``None``/NaN) yield an empty list.
    """
    if isinstance(value, (list, tuple)):
        items = value
    elif value is None or (isinstance(value, float) and value != value):
        # None or NaN: nothing to parse.
        return []
    else:
        text = str(value).strip()
        try:
            items = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # Not a valid Python literal: fall back to a plain comma split and
            # peel off any brackets/quotes that surround each piece.
            items = [piece.strip().strip("'\"")
                     for piece in text.strip('[]').split(',')]
        if not isinstance(items, (list, tuple)):
            # A bare scalar literal (e.g. "'Drama'") becomes a one-item list.
            items = [items]

    cleaned = [str(item).strip() for item in items]
    return [item for item in cleaned if item]


# One parsed list per row, in row order.
studio_array = [_parse_list_cell(cell) for cell in ml_table['studios']]
genre_array = [_parse_list_cell(cell) for cell in ml_table['genres']]

# Replace both columns on a new frame (no in-place mutation) and pin the
# column order explicitly.
ml_table = (
    ml_table
    .assign(studios=studio_array, genres=genre_array)
    .reindex(columns=['movie_id','title','release_date','movie_popularity',
                      'vote_average', 'vote_count','budget','revenue',
                      'ratings','studios','genres','actor_popularity_mean',
                      'director_popularity_mean'])
)


In [69]:
def _clean_unique(tokens):
    """Return the distinct, cleaned names from one row's list, in first-seen order.

    Each token has surrounding whitespace and surrounding quote characters
    removed before comparison.  Previously only the quote characters were
    removed, so ``'Other'`` and ``'  Other'`` were treated as different names
    and both survived de-duplication; apostrophes inside a name were also
    deleted.  Tokens that are empty after cleaning are dropped.

    Parameters
    ----------
    tokens : iterable of str
        The items of one ``studios`` or ``genres`` cell.

    Returns
    -------
    list of str
    """
    unique = []
    for token in tokens:
        # strip spaces -> strip wrapping quotes -> strip spaces that were
        # inside the quotes (e.g. " ' Heyday Films'" -> "Heyday Films").
        word = str(token).strip().strip("'\"").strip()
        if word and word not in unique:
            unique.append(word)
    return unique


studio_revised = [_clean_unique(cell) for cell in ml_table['studios']]
genre_revised = [_clean_unique(cell) for cell in ml_table['genres']]

# Drop the old columns and append the cleaned ones at the end (studios, then
# genres), building a new frame instead of mutating in place.
ml_table = (
    ml_table
    .drop(columns=['studios', 'genres'])
    .assign(studios=studio_revised, genres=genre_revised)
)

ml_table.head()

Unnamed: 0,movie_id,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_popularity_mean,director_popularity_mean,studios,genres
0,671,Harry Potter and the Philosopher's Stone,2001-11-16,268.472,7.9,21429,125000000,976475550,PG,2.993965,2.566,"[Warner Bros. Pictures, Heyday Films, Other]","[Adventure, Fantasy]"
1,557,Spider-Man,2002-05-01,480.954,7.2,14421,139000000,821708551,PG-13,2.387022,2.914,"[Other, Columbia Pictures, Sony Pictures, ...","[Fantasy, Action]"
2,672,Harry Potter and the Chamber of Secrets,2002-11-13,246.027,7.7,17294,100000000,876688482,PG,2.696712,2.566,"[Warner Bros. Pictures, Heyday Films, Other]","[Adventure, Fantasy]"
3,673,Harry Potter and the Prisoner of Azkaban,2004-05-31,225.882,8.0,17001,130000000,789804554,PG,3.523069,3.333,"[Warner Bros. Pictures, Other, Heyday Films]","[Adventure, Fantasy]"
4,674,Harry Potter and the Goblet of Fire,2005-11-16,244.428,7.8,16341,150000000,895921036,PG-13,3.234944,2.695,"[Warner Bros. Pictures, Heyday Films, Other]","[Adventure, Fantasy, Family]"


In [71]:
# Write the prepared table to disk; the path is relative to the notebook's
# working directory.
output_csv = '../Tables/Ml_Test_pgd_checked.csv'
ml_table.to_csv(output_csv)