In [1]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides pandas warnings (for example
# SettingWithCopyWarning) that later cells may raise — consider narrowing the
# filter to specific categories once the notebook runs cleanly.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter


In [3]:
# DB libraries
import psycopg2
from postgres_data import host
from postgres_data import password
from postgres_data import user
from sqlalchemy import create_engine

In [4]:
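# Disabled alternative data source: read the table from a local CSV file
# (dropping its last two rows) rather than querying PostgreSQL.
# file_path = Path('../Tables/meta_ml.csv')
# df = pd.read_csv(file_path)[:-2]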

In [4]:
# ML libraries
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [5]:
# Build the SQLAlchemy engine used to reach the PostgreSQL database.
# The user, password and host values come from the local postgres_data module;
# the port and database name are fixed in the URL.
connection_url = f'postgresql://{user}:{password}@{host}:5432/dfph644rcv92i9'
alchemyEngine = create_engine(connection_url)

In [6]:
# Load the entire "meta" table from PostgreSQL into a DataFrame.
# The connection is opened in a `with` block so that it is closed even when the
# query raises; previously a failure inside read_sql skipped close() and left
# the connection open.
with alchemyEngine.connect() as dbConnection:
    df = pd.read_sql("select * from meta", dbConnection)

In [7]:
# Keep only the columns used for the machine-learning table.
# .copy() makes ml_table an independent frame: later cells add and replace
# columns on it, and without the explicit copy pandas treats those writes as
# modifications of a slice of df (SettingWithCopyWarning).
ml_table = df[['movie_id','title','release_date','movie_popularity', 
            'vote_average', 'vote_count','budget','revenue', 
            'ratings','studios','genres','actor_popularity_mean',
            'director_popularity_mean']].copy()
ml_table

Unnamed: 0,movie_id,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,studios,genres,actor_popularity_mean,director_popularity_mean
0,244,King Kong,1933-03-15,19.367,7.6,1017,672000,10000000,NR,['RKO Radio Pictures'],"['Adventure', 'Horror', 'Science Fiction']",1.102000,1.3365
1,408,Snow White and the Seven Dwarfs,1938-02-04,76.361,7.1,5935,1488423,184925486,G,['Walt Disney Productions'],"['Fantasy', 'Animation', 'Family']",1.185000,1.3800
2,630,The Wizard of Oz,1939-08-15,40.364,7.6,4340,2777000,33754967,G,['Metro-Goldwyn-Mayer'],"['Adventure', 'Fantasy', 'Family']",1.233611,1.2040
3,770,Gone with the Wind,1940-02-16,23.816,8.0,3115,4000000,402352579,G,"['Other', ' Metro-Goldwyn-Mayer']","['Drama', 'War', 'Romance']",1.045947,1.2040
4,10895,Pinocchio,1940-02-23,82.431,7.1,4632,2600000,84300000,G,"['Walt Disney Productions', ' RKO Radio Pictur...","['Animation', 'Family']",1.116071,1.5470
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1900,576845,Last Night in Soho,2021-10-21,772.004,7.5,615,43000000,19000000,R,"['Focus Features', ' Film4 Productions', ' Wor...","['Horror', 'Mystery', 'Thriller']",2.733038,6.8020
1901,524434,Eternals,2021-11-03,1339.598,7.1,1293,200000000,368000000,PG-13,['Marvel Studios'],"['Action', 'Adventure', 'Science Fiction', 'Dr...",4.528703,2.5580
1902,425909,Ghostbusters: Afterlife,2021-11-11,394.382,7.5,197,75000000,115000000,PG-13,"['Columbia Pictures', ' Bron Studios', ' The M...","['Comedy', 'Fantasy', 'Adventure', 'Science Fi...",4.422395,3.2570
1903,460458,Resident Evil: Welcome to Raccoon City,2021-11-24,449.280,6.0,77,40000000,5500000,R,"['Constantin Film', ' Other', ' Davis Films', ...","['Horror', 'Action', 'Science Fiction']",3.155138,2.7540


In [8]:
# Turn the stringified lists in 'studios' and 'genres' into Python lists.
# For each value: take its string form, drop the first and last character
# (the surrounding brackets for values shaped like "['A', ' B']"), then split
# on commas. The pieces are NOT cleaned here — they still carry their quote
# characters and any leading space.
studio_split = [str(studio)[1:-1] for studio in ml_table['studios']]
studio_array = [entry.split(',') for entry in studio_split]

genre_split = [str(genre)[1:-1] for genre in ml_table['genres']]
genre_array = [entry.split(',') for entry in genre_split]

# assign() returns a new frame with both columns replaced, so nothing is
# mutated in place (no drop(inplace=True) / re-add on a frame derived from df).
# reindex() pins the column order expected by the rest of the notebook.
ml_table = (
    ml_table
    .assign(studios=studio_array, genres=genre_array)
    .reindex(columns=['movie_id','title','release_date','movie_popularity',
                      'vote_average', 'vote_count','budget','revenue',
                      'ratings','studios','genres','actor_popularity_mean',
                      'director_popularity_mean'])
)


In [9]:
def normalize_labels(labels):
    """Return the cleaned, de-duplicated, alphabetically sorted names in `labels`.

    Each name has every single-quote character and every space removed, so
    "'RKO Radio Pictures'" becomes "RKORadioPictures". Duplicates that appear
    after cleaning are collapsed and the result is sorted once at the end.

    Parameters
    ----------
    labels : iterable of str
        Raw name fragments for one row.

    Returns
    -------
    list of str
        Unique cleaned names in ascending order.
    """
    cleaned = {label.replace('\'', '').replace(' ', '') for label in labels}
    return sorted(cleaned)


# Studios and genres get exactly the same treatment.
# NOTE: 'Other' entries are kept. The genre loop used to carry a
# `word != ' Other'` guard, but names contain no spaces once they are cleaned,
# so that guard could never exclude anything and has been removed.
studio_revised = [normalize_labels(row) for row in ml_table['studios']]
genre_revised = [normalize_labels(row) for row in ml_table['genres']]

# Drop the raw columns and append the cleaned ones without mutating in place.
# 'studios' and 'genres' therefore end up as the last two columns, in that order.
ml_table = (
    ml_table
    .drop(columns=['studios', 'genres'])
    .assign(studios=studio_revised, genres=genre_revised)
)

ml_table.head(10)

Unnamed: 0,movie_id,title,release_date,movie_popularity,vote_average,vote_count,budget,revenue,ratings,actor_popularity_mean,director_popularity_mean,studios,genres
0,244,King Kong,1933-03-15,19.367,7.6,1017,672000,10000000,NR,1.102,1.3365,[RKORadioPictures],"[Adventure, Horror, ScienceFiction]"
1,408,Snow White and the Seven Dwarfs,1938-02-04,76.361,7.1,5935,1488423,184925486,G,1.185,1.38,[WaltDisneyProductions],"[Animation, Family, Fantasy]"
2,630,The Wizard of Oz,1939-08-15,40.364,7.6,4340,2777000,33754967,G,1.233611,1.204,[Metro-Goldwyn-Mayer],"[Adventure, Family, Fantasy]"
3,770,Gone with the Wind,1940-02-16,23.816,8.0,3115,4000000,402352579,G,1.045947,1.204,"[Metro-Goldwyn-Mayer, Other]","[Drama, Romance, War]"
4,10895,Pinocchio,1940-02-23,82.431,7.1,4632,2600000,84300000,G,1.116071,1.547,"[RKORadioPictures, WaltDisneyProductions]","[Animation, Family]"
5,11360,Dumbo,1941-10-31,60.442,7.0,3957,812000,1600000,G,1.088143,1.885,[WaltDisneyProductions],"[Animation, Family]"
6,3170,Bambi,1942-08-14,68.742,7.0,4487,858000,267447150,G,1.442786,1.38,[WaltDisneyProductions],"[Animation, Drama, Family]"
7,289,Casablanca,1942-11-26,18.97,8.2,3989,878000,10462500,PG,0.998286,1.683,[WarnerBros.Pictures],"[Drama, Romance]"
8,1585,It's a Wonderful Life,1946-12-20,34.456,8.3,3101,3180000,9644124,PG,1.017784,2.223,"[Other, RKORadioPictures]","[Drama, Family, Fantasy]"
9,11224,Cinderella,1950-02-22,82.937,7.0,5421,2900000,263591415,G,1.354783,1.252333,"[RKORadioPictures, WaltDisneyPictures, WaltDis...","[Animation, Family, Fantasy, Romance]"


In [10]:
# Write the cleaned table to disk as CSV, leaving out the DataFrame index.
output_csv = '../Tables/Ml_Test_pgs_checked.csv'
ml_table.to_csv(output_csv, index=False)