# Goals:
- Global Goal: Study characteristics of successful movies in the US
- Local Goal: Create a clean dataset of movies from the MySQL database that can be used for clustering and later ML models

# Deliverables
- Movies dataset with clusters for analysis

# Imports

In [70]:
import json
from pathlib import Path
from urllib.parse import quote

import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
from sqlalchemy.types import *
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer

# MySQL Connection

In [4]:
# Load MySQL credentials from a JSON file that contains a "PWD" key.
# The path is resolved relative to the current user's home directory
# (~/.secret/MySQL.json) instead of one machine's absolute path, so the
# notebook runs for any user who keeps the file in the same place.
# Adjust the relative part below if your credentials live elsewhere.
credentials_path = Path.home() / '.secret' / 'MySQL.json'
with open(credentials_path) as f:
    login = json.load(f)
PWD = login['PWD']

In [5]:
# SQLAlchemy URL for the local `movies` database, connecting as `root`
# through the pymysql driver.
# The password is percent-encoded (safe='' also encodes '/') so that
# characters with a meaning inside a URL -- '@', ':', '/', '%', spaces --
# cannot truncate or corrupt the connection string when it is parsed.
connection = f"mysql+pymysql://root:{quote(PWD, safe='')}@localhost/movies"

In [8]:
# Create the SQLAlchemy engine for the connection URL; it is the handle that
# pd.read_sql uses in the cells below.
engine = create_engine(connection)

In [9]:
# Ensure the target database is present: report it when found, create it otherwise.
if database_exists(connection):
    print('The database already exists')
else:
    create_database(connection)

The database already exists


In [10]:
# List the tables available through the engine as a sanity check on the
# connection; the resulting DataFrame is displayed as the cell output.
q = '''
SHOW TABLES; '''

pd.read_sql(q, engine)

Unnamed: 0,Tables_in_movies
0,genres
1,title_basics
2,title_genres
3,title_ratings
4,tmdb_data


# Create full dataset

In [83]:
# Read the four gzip-compressed CSV extracts stored under Data/:
# title basics, alternate titles (akas), ratings and the combined TMDB results.
basics_df = pd.read_csv('Data/basics.csv.gz')
akas_df = pd.read_csv('Data/akas.csv.gz')
ratings_df = pd.read_csv('Data/ratings.csv.gz')
# Only the TMDB file is parsed with an explicit '\n' record terminator.
tmdb_df = pd.read_csv('Data/tmdb_results_combined.csv.gz', lineterminator='\n')

In [84]:
# Remove the columns that are not carried into the working frame.
# 'Unnamed: 0' is presumably the index written out by an earlier to_csv -- confirm.
basics_cols_to_drop = ['Unnamed: 0', 'originalTitle', 'isAdult', 'titleType']
basics_df = basics_df.drop(columns=basics_cols_to_drop)

In [85]:
# Keep only the identifier plus the financial and certification fields from TMDB.
tmdb_keep_cols = ['imdb_id', 'revenue', 'budget', 'certification']
tmdb_df = tmdb_df.loc[:, tmdb_keep_cols]

In [86]:
# Rename the TMDB identifier column to 'tconst' so it matches the key used by
# the other frames when they are joined later.
tmdb_df = tmdb_df.rename(columns={'imdb_id': 'tconst'})

In [87]:
# Drop the leftover 'Unnamed: 0' column from the ratings frame.
ratings_df = ratings_df.drop(columns=['Unnamed: 0'])

## Multi-label binarization of genres

In [88]:
# Turn each comma-separated genre string into a list of genre labels
# (one list per title); this does not compute the set of unique genres.
basics_df = basics_df.assign(genres=basics_df['genres'].str.split(','))

In [89]:
# Preview the first rows to check that `genres` now holds lists of labels.
basics_df.head()

Unnamed: 0,tconst,primaryTitle,startYear,endYear,runtimeMinutes,genres
0,tt0035423,Kate & Leopold,2001.0,,118,"[Comedy, Fantasy, Romance]"
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,,70,[Drama]
2,tt0069049,The Other Side of the Wind,2018.0,,122,[Drama]
3,tt0088751,The Naked Monster,2005.0,,100,"[Comedy, Horror, Sci-Fi]"
4,tt0096056,Crime and Punishment,2002.0,,126,[Drama]


In [90]:
# Binarizer used in the next cell to expand the per-title genre lists into
# one 0/1 indicator column per genre.
mlb = MultiLabelBinarizer()

In [91]:
# One-hot encode the genre lists and append the indicator columns.
# `pop` removes the 'genres' column from basics_df as it is read, so the
# list column is replaced by the indicator columns rather than kept alongside.
genre_lists = basics_df.pop('genres')
genre_matrix = mlb.fit_transform(genre_lists)
genre_indicators = pd.DataFrame(
    genre_matrix,
    index=basics_df.index,
    columns=mlb.classes_,  # column names are the genre labels learned by the fit
)
basics_df = basics_df.join(genre_indicators)

In [92]:
# Display the frame to verify the genre indicator columns were appended.
basics_df

Unnamed: 0,tconst,primaryTitle,startYear,endYear,runtimeMinutes,Action,Adult,Adventure,Animation,Biography,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,tt0035423,Kate & Leopold,2001.0,,118,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,,70,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,tt0069049,The Other Side of the Wind,2018.0,,122,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,tt0088751,The Naked Monster,2005.0,,100,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,tt0096056,Crime and Punishment,2002.0,,126,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82692,tt9914942,Life Without Sara Amat,2019.0,,74,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82693,tt9915872,The Last White Witch,2019.0,,97,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82694,tt9916170,The Rehearsal,2019.0,,51,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82695,tt9916190,Safeguard,2020.0,,95,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


## Combining dataframes

In [93]:
# Index the basics frame by title id so the joins below align on 'tconst'.
basics_df = basics_df.set_index('tconst')

In [94]:
# Index the TMDB frame by title id so it can be joined on 'tconst'.
tmdb_df = tmdb_df.set_index('tconst')

In [95]:
# Give the akas identifier column the same name ('tconst') as the other frames.
akas_df = akas_df.rename(columns={'titleId': 'tconst'})

In [96]:
# Drop the leftover 'Unnamed: 0' column from the akas frame.
akas_df = akas_df.drop(columns=['Unnamed: 0'])

In [97]:
# Index the akas frame by title id, matching the other frames.
akas_df = akas_df.set_index('tconst')

In [98]:
# Index the ratings frame by title id so it can be joined on 'tconst'.
ratings_df = ratings_df.set_index('tconst')

In [99]:
# Inner join on the shared 'tconst' index: only titles present in both the
# basics frame and the TMDB frame are kept.
df = basics_df.join(tmdb_df, how='inner')

In [100]:
# NOTE(review): the akas join is disabled, yet akas_df is still loaded, renamed
# and indexed in earlier cells -- either re-enable this join or remove those steps.
#df = df.join(akas_df, how='inner')

In [101]:
# Inner join with the ratings frame on the 'tconst' index: titles without a
# ratings row are dropped from df.
df = df.join(ratings_df, how='inner')

In [102]:
# Preview the combined frame after the joins.
df.head()

Unnamed: 0_level_0,primaryTitle,startYear,endYear,runtimeMinutes,Action,Adult,Adventure,Animation,Biography,Comedy,...,Sport,Talk-Show,Thriller,War,Western,revenue,budget,certification,averageRating,numVotes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0035423,Kate & Leopold,2001.0,,118,0,0,0,0,0,1,...,0,0,0,0,0,76019048.0,48000000.0,PG-13,6.4,84809
tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,,70,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,,6.4,161
tt0069049,The Other Side of the Wind,2018.0,,122,0,0,0,0,0,0,...,0,0,0,0,0,0.0,12000000.0,R,6.7,7374
tt0088751,The Naked Monster,2005.0,,100,0,0,0,0,0,1,...,0,0,0,0,0,0.0,350000.0,,5.3,327
tt0096056,Crime and Punishment,2002.0,,126,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,,5.6,821


# Cleaning

In [103]:
# Remove the 'endYear' column (it is empty in the rows previewed above).
df = df.drop(columns=['endYear'])

In [104]:
# Build a profiling report of the combined frame; the bare `report`
# expression on the last line renders it as the cell output.
report = ProfileReport(df)
report

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



## Duplicates