In [3]:
import spacy
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [4]:
movie_data =  pd.read_csv('../dataset/movieDescriptionDataSet.tsv', sep='\t')
print (len(movie_data))

2982


In [5]:
movie_data.head()


Unnamed: 0,Title,Description,Genre,Rating,Link,Date
0,Aquaman,The film reveals the origin story of half-huma...,"ACTION,FANTASY,SCIENCE FICTION,ADVENTURE,ROMANCE",66.0,/movie/297802,"December 21, 2018"
1,Venom,Eddie Brock is a reporter—investigating people...,SCIENCE FICTION,65.0,/movie/335983,"October 5, 2018"
2,Mortal Engines,Set in a world many thousands of years in the ...,"SCIENCE FICTION,ACTION,ADVENTURE",63.0,/movie/428078,"December 14, 2018"
3,Fantastic Beasts: The Crimes of Grindelwald,Gellert Grindelwald has escaped imprisonment a...,"FAMILY,FANTASY,ADVENTURE",70.0,/movie/338952,"November 16, 2018"
4,Creed II,Follows Adonis Creed's life inside and outside...,"DRAMA,ACTION",70.0,/movie/480530,"November 21, 2018"


In [6]:
#Individual Data
movie_titles = movie_data['Title'].tolist();
movie_synopses = movie_data['Description'].tolist()
movie_genre = movie_data['Genre'].tolist()

In [7]:
#Normalization
from normalization import normalize_corpus
norm_movie_synopses , num_words_init, num_words_final= normalize_corpus(movie_synopses, 
                                                                        lemmatize=True, 
                                                                        only_text_chars=True)

In [9]:
genreSet = set()
genreCount ={}
i =0
for genre in movie_genre:
    
    genres = genre.split(',')
    
    for gen in genres:
        genreSet.add(gen)
        
        if genreCount.get(gen) == None:
            genreCount[gen] = 1
        else:
            genreCount[gen] = genreCount[gen] + 1
    i += 1
    
    
sorted_by_value = sorted(genreCount.items(), key=lambda kv: kv[1] , reverse=True)
genreDescriptionDf = pd.DataFrame.from_records(sorted_by_value, columns = ["Genre"," # Movies"])

genreDescriptionDf.head(30)

Unnamed: 0,Genre,# Movies
0,DRAMA,1194
1,COMEDY,915
2,THRILLER,899
3,ACTION,891
4,ADVENTURE,689
5,SCIENCE FICTION,451
6,CRIME,450
7,ROMANCE,439
8,FAMILY,437
9,FANTASY,435


In [11]:
summarizedData = [('Original', num_words_init),
         ('Final', num_words_final)]

df = pd.DataFrame.from_items(summarizedData)


df['Reduction %'] = 100 - df['Final'] * 100 / df['Original']
df['Reduction'] = df['Original'] - df['Final']

df.head(19)

#DATA REDUCTION

Unnamed: 0,Original,Final,Reduction %,Reduction
0,48,21,56.25,27
1,70,35,50.0,35
2,60,32,46.666667,28
3,93,52,44.086022,41
4,29,17,41.37931,12
5,72,38,47.222222,34
6,22,13,40.909091,9
7,32,19,40.625,13
8,56,32,42.857143,24
9,103,52,49.514563,51


In [12]:
totalOriginal = df['Original'].sum()
totalFinal = df['Final'].sum()

summarizedData = [(totalOriginal, totalFinal, 100 - totalFinal * 100 / totalOriginal , totalOriginal - totalFinal)]
summarizedDataLabels = ['Original', 'Final', 'Reduction %', 'Reduction']

df = pd.DataFrame.from_records(summarizedData, columns=summarizedDataLabels)

df.head()

Unnamed: 0,Original,Final,Reduction %,Reduction
0,144732,80163,44.612802,64569


In [13]:
#VECTORIZER
vectorizer = TfidfVectorizer(max_features=3000)
feature_matrix = vectorizer.fit_transform(norm_movie_synopses).astype(float)
feature_names = vectorizer.get_feature_names()

num_clusters = 19

print (feature_matrix.shape)
print (feature_names[:30])

(2982, 3000)
['1930s', '1940s', '1950s', '1960s', '1970s', '1980s', '19th', 'abandon', 'abduct', 'ability', 'able', 'aboard', 'abuse', 'academy', 'accept', 'accident', 'accidentally', 'accompany', 'accomplish', 'account', 'accuse', 'ace', 'achieve', 'across', 'act', 'action', 'activist', 'activity', 'actor', 'actress']


In [14]:
def k_means(feature_matrix, num_clusters=5):
    km = KMeans(n_clusters=num_clusters,
                max_iter=10000)
    km.fit(feature_matrix)
    clusters = km.labels_
    return km, clusters

In [15]:
%matplotlib inline

In [16]:
km_obj, clusters = k_means(feature_matrix=feature_matrix,
                           num_clusters=num_clusters)
movie_data['Cluster'] = clusters
c = Counter(clusters)
print (c.items())

dict_items([(4, 117), (11, 486), (8, 169), (0, 107), (18, 103), (7, 146), (16, 97), (9, 245), (10, 96), (12, 86), (1, 289), (5, 265), (17, 64), (15, 97), (13, 146), (6, 173), (3, 93), (14, 186), (2, 17)])


In [17]:
def get_cluster_data(clustering_obj, movie_data, 
                     feature_names, num_clusters,
                     topn_features=10):

    cluster_details = {}  
    # get cluster centroids
    ordered_centroids = clustering_obj.cluster_centers_.argsort()[:, ::-1]
    # get key features for each cluster
    # get movies belonging to each cluster
    for cluster_num in range(num_clusters):
        cluster_details[cluster_num] = {}
        
        cluster_details[cluster_num]['cluster_num'] = cluster_num
  
        key_features = [feature_names[index] 
                        for index 
                        in ordered_centroids[cluster_num, :topn_features]]
        
        
        movies = movie_data[movie_data['Cluster'] == cluster_num]['Title'].values.tolist()        
        genres = movie_data[movie_data['Cluster'] == cluster_num]['Genre'].values.tolist()
        
        cluster_details[cluster_num]['key_features'] = key_features
        cluster_details[cluster_num]['movies'] = movies
        cluster_details[cluster_num]['genres'] = genres
    
    return cluster_details
        
       
    
def print_cluster_data(cluster_data):
    # print cluster details
    for cluster_num, cluster_details in cluster_data.items():
        
        print ('Cluster {} details:'.format(cluster_num))
        print ('-'*20)
        print ('Key features: (', cluster_details['key_features'] , ')')
        print ('Movies in this cluster: (', len(cluster_details['movies']) , ')')
        print (', '.join(cluster_details['movies'][:30]))
        
        print ('Movie genres:')
        
        s = list()
        for genre in cluster_details['genres']:
            #print (genre.split(','))
            for aa in genre.split(','):
                s.append(aa)
                
        counter=Counter(s)  
        
        #print ('|| '.join(cluster_details['genres']))
        print ('-'*20)
        #print (s)
        print(counter)
        #print (', '.join(s))
        
        print ('='*40)

        
def print_cluster_data_report(cluster_data):
    # print cluster details
    clusterDetail = []
    clusterDetail2 = []
    for cluster_num, cluster_details in cluster_data.items():
        
        clusterDict= {}
        #clusterDict['# MOVIES'] = len(cluster_details['movies'])
        
        for nn in genreSet:
            clusterDict[nn] = 0
        clusterDetail.append(clusterDict)
        
        s = list()
        for genreList in cluster_details['genres']:
            #print (genre.split(','))
            for genre in genreList.split(','):
                #Adding Genres
                clusterDict[genre] = clusterDict[genre] + 1
                
        clusterDict2 = {}
        ff = sorted(clusterDict.items(), key=lambda kv: kv[1] , reverse=True)
        
        
        
        clusterDetail2.append(ff[0] + ff[1] + (len(cluster_details['movies']),(cluster_num +1),cluster_details['key_features']))
        
        
        
    #print ( clusterDetail)
    df = pd.DataFrame(clusterDetail)
    df = df[['DRAMA', 'COMEDY', 'THRILLER', 'ACTION', 'ADVENTURE', 'SCIENCE FICTION', 'CRIME', 'ROMANCE', 'FAMILY', 'FANTASY', 'HORROR', 'ANIMATION', 'MYSTERY', 'HISTORY', 'WAR', 'MUSIC', 'WESTERN', 'TV MOVIE', 'DOCUMENTARY']]

    
    df2 = pd.DataFrame(clusterDetail2 , columns =["Genre 1" ,"Freq 1", "Genre 2" ,"Freq 2","# Movies", "Cluster","Key Features"])
    
    
    return df , df2



cluster_data =  get_cluster_data(clustering_obj=km_obj,
                                 movie_data=movie_data,
                                 feature_names=feature_names,
                                 num_clusters=num_clusters,
                                 topn_features=5)         

df , df2 = print_cluster_data_report(cluster_data) 



df.head(20)


Unnamed: 0,DRAMA,COMEDY,THRILLER,ACTION,ADVENTURE,SCIENCE FICTION,CRIME,ROMANCE,FAMILY,FANTASY,HORROR,ANIMATION,MYSTERY,HISTORY,WAR,MUSIC,WESTERN,TV MOVIE,DOCUMENTARY
0,50,34,21,14,30,10,11,20,29,22,9,14,7,0,2,3,0,2,1
1,117,89,112,130,49,30,113,20,30,28,23,35,31,8,3,7,3,1,2
2,6,4,4,1,2,0,0,1,3,1,1,0,0,0,0,0,0,0,3
3,38,17,30,37,26,44,1,16,4,9,5,8,5,7,0,0,4,0,0
4,41,34,42,47,38,24,12,13,4,7,7,8,9,7,4,2,3,0,2
5,94,56,142,162,72,44,54,8,22,12,16,15,32,27,34,5,8,2,1
6,85,75,46,34,22,14,38,31,19,14,16,10,18,0,1,8,3,1,0
7,42,35,39,73,60,33,20,10,25,45,22,19,12,6,4,2,6,0,0
8,97,53,42,32,31,14,16,40,28,20,12,21,16,4,2,5,1,3,2
9,55,61,46,127,110,114,12,15,49,51,18,50,14,14,19,5,2,3,2


In [18]:

df2['% Most Relevant'] = df2['Freq 1'] * 100 / df2['# Movies']
df2['% Second Most Relevant'] = df2['Freq 2'] * 100 / df2['# Movies']


df2 = df2[["Genre 1","Genre 2","Cluster","# Movies" , "Freq 1", "Freq 2", '% Most Relevant','% Second Most Relevant',"Key Features"]]
#df2 = df2.sort_index(axis=1)
df2 = df2.sort_values('% Most Relevant',ascending=False)

df2.head(20)

Unnamed: 0,Genre 1,Genre 2,Cluster,# Movies,Freq 1,Freq 2,% Most Relevant,% Second Most Relevant,Key Features
15,THRILLER,CRIME,16,97,70,49,72.164948,50.515464,"[murder, investigate, detective, killer, case]"
13,DRAMA,COMEDY,14,146,94,35,64.383562,23.972603,"[story, true, base, life, love]"
5,ACTION,THRILLER,6,265,162,142,61.132075,53.584906,"[mission, agent, team, american, kill]"
17,COMEDY,FAMILY,18,64,39,33,60.9375,51.5625,"[christmas, santa, eve, claus, holiday]"
16,COMEDY,DRAMA,17,97,56,39,57.731959,40.206186,"[school, high, student, friend, teacher]"
8,DRAMA,COMEDY,9,169,97,53,57.39645,31.360947,"[life, change, family, new, live]"
9,ACTION,SCIENCE FICTION,10,245,127,114,51.836735,46.530612,"[world, planet, war, earth, alien]"
7,ACTION,ADVENTURE,8,146,73,60,50.0,41.09589,"[face, vampire, time, must, race]"
6,DRAMA,COMEDY,7,173,85,75,49.132948,43.352601,"[old, year, friend, best, ex]"
10,DRAMA,THRILLER,11,96,47,31,48.958333,32.291667,"[town, small, new, mother, family]"


In [214]:
#plot_clusters(num_clusters=num_clusters, 
#              feature_matrix=feature_matrix,
#              cluster_data=cluster_data, 
#              movie_data=movie_data,
#              plot_size=(16,8))  

In [302]:
from sklearn.cluster import AffinityPropagation
              
def affinity_propagation(feature_matrix):
    
    sim = feature_matrix * feature_matrix.T
    sim = sim.todense()
    ap = AffinityPropagation()
    ap.fit(sim)
    clusters = ap.labels_          
    return ap, clusters

In [303]:
ap_obj, clusters = affinity_propagation(feature_matrix=feature_matrix)
movie_data['Cluster'] = clusters

In [304]:
c = Counter(clusters)

In [305]:
total_clusters = len(c)
print ('Total Clusters:', total_clusters)

Total Clusters: 234


In [308]:
cluster_data =  get_cluster_data(clustering_obj=ap_obj,
                                 movie_data=movie_data,
                                 feature_names=feature_names,
                                 num_clusters=total_clusters,
                                 topn_features=5)         

df , df2 = print_cluster_data2(cluster_data) 


df = df[['DRAMA', 'COMEDY', 'THRILLER', 'ACTION', 'ADVENTURE', 'SCIENCE FICTION', 'CRIME', 'ROMANCE', 'FAMILY', 'FANTASY', 'HORROR', 'ANIMATION', 'MYSTERY', 'HISTORY', 'WAR', 'MUSIC', 'WESTERN', 'TV MOVIE', 'DOCUMENTARY']]

df.head(20)

Unnamed: 0,DRAMA,COMEDY,THRILLER,ACTION,ADVENTURE,SCIENCE FICTION,CRIME,ROMANCE,FAMILY,FANTASY,HORROR,ANIMATION,MYSTERY,HISTORY,WAR,MUSIC,WESTERN,TV MOVIE,DOCUMENTARY
0,2,4,1,7,6,3,0,1,0,4,1,1,0,0,0,0,0,0,0
1,2,0,8,7,8,1,0,0,0,0,1,0,0,1,1,0,1,0,0
2,1,9,0,0,1,0,1,1,9,4,0,6,1,0,0,1,0,1,0
3,1,2,0,0,1,0,0,0,1,0,0,1,0,1,0,0,3,0,0
4,24,12,19,12,9,8,7,10,6,2,2,6,4,0,0,1,1,3,0
5,1,1,1,8,7,6,0,1,1,5,1,1,0,0,0,0,0,0,0
6,2,1,7,8,7,12,0,1,1,0,2,2,0,0,0,0,0,0,0
7,5,2,4,5,5,4,1,1,0,1,1,0,0,1,0,0,0,0,0
8,1,0,2,1,3,1,0,1,1,0,0,1,0,0,0,0,0,0,0
9,1,4,2,2,4,3,0,1,0,0,2,0,0,0,0,0,0,0,0


In [319]:
df2['% Most Relevant'] = df2['Freq'] * 100 / df2['# Movies']

df2 = df2[["Genre","Cluster","# Movies" , "Freq", '% Most Relevant',"Key Features"]]
#df2 = df2.sort_index(axis=1)
df2 = df2.sort_values('% Most Relevant',ascending=False)

df2.head(50)

Unnamed: 0,Genre,Cluster,# Movies,Freq,% Most Relevant,Key Features
121,FAMILY,122,6,6,100.0,"[inventor, involved, marcus, impending, sullivan]"
106,DRAMA,107,6,6,100.0,"[hack, ominous, roger, explode, manhunt]"
136,ADVENTURE,137,1,1,100.0,"[boxing, embrace, country, hacker, assassin]"
30,ACTION,31,4,4,100.0,"[check, colorado, award, egg, lie]"
226,ACTION,227,3,3,100.0,"[wade, derek, lifetime, sorcery, accidentally]"
129,COMEDY,130,2,2,100.0,"[lilo, square, share, sense, program]"
139,COMEDY,140,3,3,100.0,"[mastermind, sniper, tiger, nazi, lifetime]"
3,WESTERN,4,3,3,100.0,"[almost, ray, locate, idyllic, joes]"
25,ADVENTURE,26,5,5,100.0,"[brilliant, depression, film, christine, intend]"
65,ADVENTURE,66,6,6,100.0,"[disease, and, dragon, angel, surgeon]"


In [320]:

df2.tail(50)

Unnamed: 0,Genre,Cluster,# Movies,Freq,% Most Relevant,Key Features
177,DRAMA,178,9,5,55.555556,"[protect, trio, sorcery, square, goo]"
58,ACTION,59,20,11,55.0,"[deranged, truth, bilbo, expose, gift]"
148,DRAMA,149,11,6,54.545455,"[mystery, create, accidentally, transformation..."
197,DRAMA,198,11,6,54.545455,"[shape, toward, serious, intergalactic, commit..."
72,ACTION,73,11,6,54.545455,"[dystopian, competitive, agent, league, clash]"
175,THRILLER,176,11,6,54.545455,"[profound, pawn, slip, nice, value]"
40,ADVENTURE,41,13,7,53.846154,"[conquer, civil, forced, whereabouts, cousin]"
55,ADVENTURE,56,15,8,53.333333,"[debt, gift, brand, superhero, castle]"
128,ADVENTURE,129,15,8,53.333333,"[letter, boyfriend, korean, pole, soul]"
76,ADVENTURE,77,17,9,52.941176,"[edward, blade, emily, unstable, gu]"
