In [37]:
#import general dependencies
import pandas as pd
import plotly.express as px
import hvplot.pandas
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [38]:
#read in the data
#may need to request the data from a postgres database
df = pd.read_csv('../../Movie_Rec_Data/movies_merged.csv')

In [39]:
#check data shape, dtypes, etc.
df.shape

(45413, 36)

In [40]:
df.dtypes

index                      int64
adult                       bool
budget                     int64
id                         int64
imdb_id                    int64
original_language         object
popularity               float64
poster_path               object
release_date              object
revenue                    int64
runtime                  float64
status                    object
title                     object
video                       bool
vote_average             float64
vote_count                 int64
genre_action             float64
genre_adventure          float64
genre_animation          float64
genre_comedy             float64
genre_crime              float64
genre_documentary        float64
genre_drama              float64
genre_family             float64
genre_fantasy            float64
genre_foreign            float64
genre_history            float64
genre_horror             float64
genre_music              float64
genre_mystery            float64
genre_roma

In [41]:
df.head()

Unnamed: 0,index,adult,budget,id,imdb_id,original_language,popularity,poster_path,release_date,revenue,...,genre_history,genre_horror,genre_music,genre_mystery,genre_romance,genre_science_fiction,genre_tv_movie,genre_thriller,genre_war,genre_western
0,4,False,0,11862,113041,en,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,1995-02-10,76578911,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8,False,35000000,9091,114576,en,5.23158,/eoWvKD60lT95Ss1MYNgVExpo5iU.jpg,1995-12-22,64350171,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,9,False,58000000,710,113189,en,14.686036,/5c0ovjT41KnYIHYuF4AWsTe3sKh.jpg,1995-11-16,352194034,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,10,False,62000000,9087,112346,en,6.318445,/lymPNGLZgPHuqM29rKMGV46ANij.jpg,1995-11-17,107879496,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,11,False,0,12110,112896,en,5.430331,/xve4cgfYItnOhtzLYoTwTVy5FGr.jpg,1995-12-22,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
#drop unhelpful columns without too much exploration
del df['index']
del df['poster_path']

In [43]:
#finding all values in the video column that are true. presumably these are tv shows
tv_shows = df[df.video]

In [44]:
# turns out, most of these ranged from documentaries to video game cinematics. 
tv_shows[['video', 'title']]

Unnamed: 0,video,title
3542,True,Love Hina Spring Special - I Wish Your Dream
4627,True,Yes: 9012 live
4878,True,Tank on the Moon
4918,True,Step Up Love Story
5117,True,911 in Plane Site
...,...,...
44961,True,Devo: The Complete Truth About De-Evolution
45071,True,The Godfather Trilogy: 1972-1990
45261,True,Salad Fingers
45280,True,The End of a Vacation


In [45]:
# converting tv shows to an array of row numbers
tv_shows = tv_shows.index

In [46]:
# dropping rows that contain documentaries or other non movies
df.drop(tv_shows, axis=0)

Unnamed: 0,adult,budget,id,imdb_id,original_language,popularity,release_date,revenue,runtime,status,...,genre_history,genre_horror,genre_music,genre_mystery,genre_romance,genre_science_fiction,genre_tv_movie,genre_thriller,genre_war,genre_western
0,False,0,11862,113041,en,8.387519,1995-02-10,76578911,106.0,Released,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,False,35000000,9091,114576,en,5.231580,1995-12-22,64350171,106.0,Released,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,False,58000000,710,113189,en,14.686036,1995-11-16,352194034,130.0,Released,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,False,62000000,9087,112346,en,6.318445,1995-11-17,107879496,106.0,Released,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,False,0,12110,112896,en,5.430331,1995-12-22,0,88.0,Released,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45408,False,0,106807,135571,fr,0.225432,1900-01-01,0,2.0,Released,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45409,False,0,276895,3054038,en,0.011025,1981-01-01,0,58.0,Released,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45410,False,0,404604,5690142,hi,1.559596,2017-07-07,0,146.0,Released,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
45411,False,0,30840,102797,en,5.683753,1991-05-13,0,104.0,Released,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [47]:
df.drop(['id', 'imdb_id', 'release_date', 'video'], axis=1, inplace=True)

In [48]:
df.columns

Index(['adult', 'budget', 'original_language', 'popularity', 'revenue',
       'runtime', 'status', 'title', 'vote_average', 'vote_count',
       'genre_action', 'genre_adventure', 'genre_animation', 'genre_comedy',
       'genre_crime', 'genre_documentary', 'genre_drama', 'genre_family',
       'genre_fantasy', 'genre_foreign', 'genre_history', 'genre_horror',
       'genre_music', 'genre_mystery', 'genre_romance',
       'genre_science_fiction', 'genre_tv_movie', 'genre_thriller',
       'genre_war', 'genre_western'],
      dtype='object')

In [49]:
df.status.value_counts()

Released           44970
Rumored              227
Post Production       98
In Production         20
Planned               15
Canceled               2
Name: status, dtype: int64

In [50]:
df= df[df.status == 'Released']
df[['status','title']].head(20)

Unnamed: 0,status,title
0,Released,Father of the Bride Part II
1,Released,Sudden Death
2,Released,GoldenEye
3,Released,The American President
4,Released,Dracula: Dead and Loving It
5,Released,Ace Ventura: When Nature Calls
6,Released,Copycat
7,Released,Leaving Las Vegas
8,Released,Dangerous Minds
9,Released,Twelve Monkeys


In [51]:
df['adult'] = df['adult'].astype('object')

In [52]:
df

Unnamed: 0,adult,budget,original_language,popularity,revenue,runtime,status,title,vote_average,vote_count,...,genre_history,genre_horror,genre_music,genre_mystery,genre_romance,genre_science_fiction,genre_tv_movie,genre_thriller,genre_war,genre_western
0,False,0,en,8.387519,76578911,106.0,Released,Father of the Bride Part II,5.7,173,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,False,35000000,en,5.231580,64350171,106.0,Released,Sudden Death,5.5,174,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,False,58000000,en,14.686036,352194034,130.0,Released,GoldenEye,6.6,1194,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,False,62000000,en,6.318445,107879496,106.0,Released,The American President,6.5,199,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,False,0,en,5.430331,0,88.0,Released,Dracula: Dead and Loving It,5.7,210,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45408,False,0,fr,0.225432,0,2.0,Released,The Fat and Lean Wrestling Match,6.5,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45409,False,0,en,0.011025,0,58.0,Released,Deep Hearts,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45410,False,0,hi,1.559596,0,146.0,Released,Mom,6.6,14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
45411,False,0,en,5.683753,0,104.0,Released,Robin Hood,5.7,26,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [53]:
df.columns

Index(['adult', 'budget', 'original_language', 'popularity', 'revenue',
       'runtime', 'status', 'title', 'vote_average', 'vote_count',
       'genre_action', 'genre_adventure', 'genre_animation', 'genre_comedy',
       'genre_crime', 'genre_documentary', 'genre_drama', 'genre_family',
       'genre_fantasy', 'genre_foreign', 'genre_history', 'genre_horror',
       'genre_music', 'genre_mystery', 'genre_romance',
       'genre_science_fiction', 'genre_tv_movie', 'genre_thriller',
       'genre_war', 'genre_western'],
      dtype='object')

In [54]:
df_encoded = pd.get_dummies(df, columns=['adult', 'original_language'])

In [55]:
df_encoded

Unnamed: 0,budget,popularity,revenue,runtime,status,title,vote_average,vote_count,genre_action,genre_adventure,...,original_language_tl,original_language_tr,original_language_uk,original_language_ur,original_language_uz,original_language_vi,original_language_wo,original_language_xx,original_language_zh,original_language_zu
0,0,8.387519,76578911,106.0,Released,Father of the Bride Part II,5.7,173,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,35000000,5.231580,64350171,106.0,Released,Sudden Death,5.5,174,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,58000000,14.686036,352194034,130.0,Released,GoldenEye,6.6,1194,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,62000000,6.318445,107879496,106.0,Released,The American President,6.5,199,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0,5.430331,0,88.0,Released,Dracula: Dead and Loving It,5.7,210,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45408,0,0.225432,0,2.0,Released,The Fat and Lean Wrestling Match,6.5,6,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
45409,0,0.011025,0,58.0,Released,Deep Hearts,0.0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
45410,0,1.559596,0,146.0,Released,Mom,6.6,14,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
45411,0,5.683753,0,104.0,Released,Robin Hood,5.7,26,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
df_encoded.drop(['title', 'status'], axis=1, inplace=True)

In [57]:
df_encoded.isnull().count()

budget                  44970
popularity              44970
revenue                 44970
runtime                 44970
vote_average            44970
                        ...  
original_language_vi    44970
original_language_wo    44970
original_language_xx    44970
original_language_zh    44970
original_language_zu    44970
Length: 117, dtype: int64

In [58]:
df_encoded = df_encoded.fillna(0)

In [59]:
df_encoded.head()

Unnamed: 0,budget,popularity,revenue,runtime,vote_average,vote_count,genre_action,genre_adventure,genre_animation,genre_comedy,...,original_language_tl,original_language_tr,original_language_uk,original_language_ur,original_language_uz,original_language_vi,original_language_wo,original_language_xx,original_language_zh,original_language_zu
0,0,8.387519,76578911,106.0,5.7,173,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,35000000,5.23158,64350171,106.0,5.5,174,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,58000000,14.686036,352194034,130.0,6.6,1194,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,62000000,6.318445,107879496,106.0,6.5,199,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,0,5.430331,0,88.0,5.7,210,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#using NearestNeighbors to help optimize our dbscans model
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=5)
nbrs = neigh.fit(df_encoded)

distances, indices = nbrs.kneighbors()

In [None]:
# Plotting K-distance Graph to find our epsilon value
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.figure(figsize=(20,10))
plt.plot(distances)
plt.title('K-distance Graph',fontsize=20)
plt.xlabel('Data Points sorted by distance',fontsize=14)
plt.ylabel('Epsilon',fontsize=14)
plt.show()

In [None]:
#Potentially use sklearn's dbscan model to find outliers before clustering with kmeans
from sklearn.cluster import DBSCAN

db_model = DBSCAN(eps=0.75, min_samples=2, algorithm='auto')
db_model.fit(df_encoded)

In [None]:
# outliers dataframe
# outliers = df_encoded[db_model.labels_ == -1]
df_encoded['group'] = db_model.labels_

In [None]:
# Visualizing DBSCAN
import plotly.express as px

fig = px.scatter_3d(
    df_encoded,
    x='vote_count',
    y='popularity',
    z='vote_average',
    color='group',
    symbol='group'
)
fig.show()


In [60]:
from sklearn.mixture import GaussianMixture
gm = GaussianMixture(n_components=4, max_iter=200, n_init=5, init_params='random').fit(df_encoded)

In [None]:
df_encoded['gm_weights'] = gm_means_

In [None]:
#import our dependencies
from sklearn.cluster import KMeans
#determine our clusters using an elbow plot
inertia = []
clusters = list(range(1,21))

# Calculate the inertia for the range of K values
for i in clusters:
    model = KMeans(n_clusters=i, random_state=0)
    model.fit(df_encoded)
    inertia.append(model.inertia_)

# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"clusters": clusters, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="clusters", y="inertia", title="Elbow Curve", xticks=clusters)

In [None]:
#build function for running a Kmeans model
#instantiate the classifier
model = KMeans(n_clusters = 7, init='k-means++', max_iter=600, n_init=15)

#fit our scaled data
model = model.fit(df_encoded)

#make predictions
predictions = model.predict(df_encoded)

#create a column in our dataframe for our classifications
df_encoded["k_group"] = model.labels_

In [None]:
df_encoded.hvplot.scatter(x="revenue", y="popularity", by="k_group")

In [None]:
#analyze labels, see if we need to change clusters
df_encoded['k_group'].head(20)

In [None]:
df_encoded.columns

In [None]:
#plot our clusters
import plotly.express as px
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    df_encoded,
    x='revenue',
    y='popularity',
    z='vote_average',
    color='k_group',
    symbol='k_group'
)
fig.show()

In [None]:
df['k_group'] = model.labels_

In [None]:
df

In [None]:
df.to_csv('../Resources/movie_df_with_labels.csv')