In [3]:
import spacy
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [4]:
movie_data =  pd.read_csv('../dataset/movieDescriptionDataSet.tsv', sep='\t')
print (len(movie_data))

2982


In [5]:
movie_data.head()


Unnamed: 0,Title,Description,Genre,Rating,Link,Date
0,Aquaman,The film reveals the origin story of half-huma...,"ACTION,FANTASY,SCIENCE FICTION,ADVENTURE,ROMANCE",66.0,/movie/297802,"December 21, 2018"
1,Venom,Eddie Brock is a reporter—investigating people...,SCIENCE FICTION,65.0,/movie/335983,"October 5, 2018"
2,Mortal Engines,Set in a world many thousands of years in the ...,"SCIENCE FICTION,ACTION,ADVENTURE",63.0,/movie/428078,"December 14, 2018"
3,Fantastic Beasts: The Crimes of Grindelwald,Gellert Grindelwald has escaped imprisonment a...,"FAMILY,FANTASY,ADVENTURE",70.0,/movie/338952,"November 16, 2018"
4,Creed II,Follows Adonis Creed's life inside and outside...,"DRAMA,ACTION",70.0,/movie/480530,"November 21, 2018"


In [6]:
#Individual Data
movie_titles = movie_data['Title'].tolist();
movie_synopses = movie_data['Description'].tolist()
movie_genre = movie_data['Genre'].tolist()

In [7]:
#Normalization
from normalization import normalize_corpus
norm_movie_synopses , num_words_init, num_words_final= normalize_corpus(movie_synopses, 
                                                                        lemmatize=True, 
                                                                        only_text_chars=True)

In [9]:
genreSet = set()
genreCount ={}
i =0
for genre in movie_genre:
    
    genres = genre.split(',')
    
    for gen in genres:
        genreSet.add(gen)
        
        if genreCount.get(gen) == None:
            genreCount[gen] = 1
        else:
            genreCount[gen] = genreCount[gen] + 1
    i += 1
    
    
sorted_by_value = sorted(genreCount.items(), key=lambda kv: kv[1] , reverse=True)
genreDescriptionDf = pd.DataFrame.from_records(sorted_by_value, columns = ["Genre"," # Movies"])

genreDescriptionDf.head(30)

Unnamed: 0,Genre,# Movies
0,DRAMA,1194
1,COMEDY,915
2,THRILLER,899
3,ACTION,891
4,ADVENTURE,689
5,SCIENCE FICTION,451
6,CRIME,450
7,ROMANCE,439
8,FAMILY,437
9,FANTASY,435


In [31]:
summarizedData = [('Original', num_words_init),
         ('Final', num_words_final)]

df = pd.DataFrame.from_items(summarizedData)


df['Reduction %'] = 100 - df['Final'] * 100 / df['Original']
df['Reduction'] = df['Original'] - df['Final']

df.head(20)

#DATA REDUCTION

Unnamed: 0,Original,Final,Reduction %,Reduction
0,48,21,56.25,27
1,70,35,50.0,35
2,60,32,46.666667,28
3,93,52,44.086022,41
4,29,17,41.37931,12
5,72,38,47.222222,34
6,22,13,40.909091,9
7,32,19,40.625,13
8,56,32,42.857143,24
9,103,52,49.514563,51


In [32]:
totalOriginal = df['Original'].sum()
totalFinal = df['Final'].sum()

summarizedData = [(totalOriginal, totalFinal, 100 - totalFinal * 100 / totalOriginal , totalOriginal - totalFinal)]
summarizedDataLabels = ['Original', 'Final', 'Reduction %', 'Reduction']

df = pd.DataFrame.from_records(summarizedData, columns=summarizedDataLabels)

df.head()

Unnamed: 0,Original,Final,Reduction %,Reduction
0,144732,80163,44.612802,64569


In [58]:
#VECTORIZER
vectorizer = TfidfVectorizer(max_features=5000)
feature_matrix = vectorizer.fit_transform(norm_movie_synopses).astype(float)
feature_names = vectorizer.get_feature_names()

num_clusters = 19

print (feature_matrix.shape)
print (feature_names[:30])

(2982, 5000)
['14th', '17th', '1890s', '18th', '1920s', '1930s', '1940s', '1950s', '1960s', '1970s', '1980s', '1990s', '19th', '20th', '21st', '22nd', '50s', '60s', 'aaron', 'abandon', 'abandoned', 'abbey', 'abduct', 'abigail', 'ability', 'able', 'aboard', 'abroad', 'absence', 'absent']


In [56]:
def k_means(feature_matrix, num_clusters=5):
    km = KMeans(n_clusters=num_clusters,
                max_iter=10000)
    km.fit(feature_matrix)
    clusters = km.labels_
    return km, clusters

In [15]:
%matplotlib inline

In [77]:
km_obj, clusters = k_means(feature_matrix=feature_matrix,
                           num_clusters=num_clusters)
movie_data['Cluster'] = clusters
c = Counter(clusters)
print (c.items())

dict_items([(16, 96), (0, 651), (12, 119), (1, 171), (6, 140), (10, 190), (4, 144), (15, 118), (2, 122), (18, 331), (8, 62), (9, 71), (17, 136), (7, 178), (11, 151), (14, 96), (13, 149), (5, 17), (3, 40)])


In [63]:
def get_cluster_data(clustering_obj, movie_data, 
                     feature_names, num_clusters,
                     topn_features=10):

    cluster_details = {}  
    # get cluster centroids
    ordered_centroids = clustering_obj.cluster_centers_.argsort()[:, ::-1]
    # get key features for each cluster
    # get movies belonging to each cluster
    for cluster_num in range(num_clusters):
        cluster_details[cluster_num] = {}
        
        cluster_details[cluster_num]['cluster_num'] = cluster_num
  
        key_features = [feature_names[index] 
                        for index 
                        in ordered_centroids[cluster_num, :topn_features]]
        
        
        movies = movie_data[movie_data['Cluster'] == cluster_num]['Title'].values.tolist()        
        genres = movie_data[movie_data['Cluster'] == cluster_num]['Genre'].values.tolist()
        
        cluster_details[cluster_num]['key_features'] = key_features
        cluster_details[cluster_num]['movies'] = movies
        cluster_details[cluster_num]['genres'] = genres
    
    return cluster_details
        
       
    
def print_cluster_data(cluster_data):
    # print cluster details
    for cluster_num, cluster_details in cluster_data.items():
        
        print ('Cluster {} details:'.format(cluster_num))
        print ('-'*20)
        print ('Key features: (', cluster_details['key_features'] , ')')
        print ('Movies in this cluster: (', len(cluster_details['movies']) , ')')
        print (', '.join(cluster_details['movies'][:30]))
        
        print ('Movie genres:')
        
        s = list()
        for genre in cluster_details['genres']:
            #print (genre.split(','))
            for aa in genre.split(','):
                s.append(aa)
                
        counter=Counter(s)  
        
        #print ('|| '.join(cluster_details['genres']))
        print ('-'*20)
        #print (s)
        print(counter)
        #print (', '.join(s))
        
        print ('='*40)

        
def print_cluster_data_report(cluster_data):
    # print cluster details
    clusterDetail = []
    clusterDetail2 = []
    for cluster_num, cluster_details in cluster_data.items():
        
        clusterDict= {}
        #clusterDict['# MOVIES'] = len(cluster_details['movies'])
        
        for nn in genreSet:
            clusterDict[nn] = 0
        clusterDetail.append(clusterDict)
        
        s = list()
        for genreList in cluster_details['genres']:
            #print (genre.split(','))
            for genre in genreList.split(','):
                #Adding Genres
                clusterDict[genre] = clusterDict[genre] + 1
                
        clusterDict2 = {}
        ff = sorted(clusterDict.items(), key=lambda kv: kv[1] , reverse=True)
        
        
        
        clusterDetail2.append(ff[0] + ff[1] + (len(cluster_details['movies']),(cluster_num +1),cluster_details['key_features']))
        
        
        
    #print ( clusterDetail)
    df = pd.DataFrame(clusterDetail)
    df = df[['DRAMA', 'COMEDY', 'THRILLER', 'ACTION', 'ADVENTURE', 'SCIENCE FICTION', 'CRIME', 'ROMANCE', 'FAMILY', 'FANTASY', 'HORROR', 'ANIMATION', 'MYSTERY', 'HISTORY', 'WAR', 'MUSIC', 'WESTERN', 'TV MOVIE', 'DOCUMENTARY']]

    
    df2 = pd.DataFrame(clusterDetail2 , columns =["Genre 1" ,"Freq 1", "Genre 2" ,"Freq 2","# Movies", "Cluster","Key Features"])
    
    df2['% Most Relevant'] = df2['Freq 1'] * 100 / df2['# Movies']
    df2['% Second Most Relevant'] = df2['Freq 2'] * 100 / df2['# Movies']


    df2 = df2[["Genre 1","Genre 2","Cluster","# Movies" , "Freq 1", "Freq 2", '% Most Relevant','% Second Most Relevant',"Key Features"]]
    #df2 = df2.sort_index(axis=1)
    df2 = df2.sort_values('% Most Relevant',ascending=False)
    
    return df , df2



cluster_data =  get_cluster_data(clustering_obj=km_obj,
                                 movie_data=movie_data,
                                 feature_names=feature_names,
                                 num_clusters=num_clusters,
                                 topn_features=5)         

df , df2 = print_cluster_data_report(cluster_data) 



df.head(20)


Unnamed: 0,DRAMA,COMEDY,THRILLER,ACTION,ADVENTURE,SCIENCE FICTION,CRIME,ROMANCE,FAMILY,FANTASY,HORROR,ANIMATION,MYSTERY,HISTORY,WAR,MUSIC,WESTERN,TV MOVIE,DOCUMENTARY
0,112,67,65,131,98,51,18,19,30,67,29,33,14,40,38,3,9,2,0
1,97,73,16,13,24,14,8,87,30,27,8,24,10,5,4,8,0,1,1
2,101,48,166,171,56,37,110,8,8,17,14,8,37,14,12,1,9,1,0
3,77,46,51,22,21,10,23,17,28,18,38,14,19,5,0,3,3,2,1
4,50,65,20,18,19,7,10,33,20,13,10,11,6,1,0,11,0,3,0
5,60,40,40,17,8,12,15,46,7,10,18,5,11,1,2,3,3,1,0
6,141,201,100,74,76,34,50,61,85,51,52,59,41,6,4,25,4,7,9
7,56,49,32,44,72,26,7,19,55,59,23,43,12,5,4,3,4,2,0
8,6,4,4,1,2,0,0,1,3,1,1,0,0,0,0,0,0,0,3
9,35,22,14,15,4,10,11,9,2,4,8,5,10,7,3,3,1,0,3


In [64]:
df2.head(20)

Unnamed: 0,Genre 1,Genre 2,Cluster,# Movies,Freq 1,Freq 2,% Most Relevant,% Second Most Relevant,Key Features
17,SCIENCE FICTION,ACTION,18,142,106,84,74.647887,59.15493,"[earth, planet, alien, space, race]"
13,DRAMA,THRILLER,14,115,84,23,73.043478,20.0,"[story, true, base, life, man]"
16,COMEDY,FAMILY,17,47,29,29,61.702128,61.702128,"[christmas, santa, eve, claus, pole]"
2,ACTION,THRILLER,3,279,171,166,61.290323,59.498208,"[agent, team, criminal, crime, prison]"
15,THRILLER,CRIME,16,142,87,77,61.267606,54.225352,"[murder, detective, investigate, los, angeles]"
1,DRAMA,ROMANCE,2,170,97,87,57.058824,51.176471,"[love, life, fall, meet, beautiful]"
14,ADVENTURE,ACTION,15,135,77,54,57.037037,40.0,"[embark, journey, world, evil, must]"
4,COMEDY,DRAMA,5,121,65,50,53.719008,41.322314,"[school, high, girl, student, friend]"
3,DRAMA,THRILLER,4,152,77,51,50.657895,33.552632,"[family, home, life, old, year]"
11,DRAMA,THRILLER,12,85,43,32,50.588235,37.647059,"[daughter, father, wife, teenage, save]"


In [65]:
from sklearn.cluster import AffinityPropagation
              
def affinity_propagation(feature_matrix):
    
    sim = feature_matrix * feature_matrix.T
    sim = sim.todense()
    ap = AffinityPropagation()
    ap.fit(sim)
    clusters = ap.labels_          
    return ap, clusters

In [66]:
ap_obj, clusters = affinity_propagation(feature_matrix=feature_matrix)
movie_data['Cluster'] = clusters

In [67]:
c = Counter(clusters)
total_clusters = len(c)
print ('Total Clusters:', total_clusters)

Total Clusters: 275


In [68]:
cluster_data =  get_cluster_data(clustering_obj=ap_obj,
                                 movie_data=movie_data,
                                 feature_names=feature_names,
                                 num_clusters=total_clusters,
                                 topn_features=5)         

df_afinity_propagation , df2_afinity_propagation = print_cluster_data_report(cluster_data) 



df_afinity_propagation.head(20)

Unnamed: 0,DRAMA,COMEDY,THRILLER,ACTION,ADVENTURE,SCIENCE FICTION,CRIME,ROMANCE,FAMILY,FANTASY,HORROR,ANIMATION,MYSTERY,HISTORY,WAR,MUSIC,WESTERN,TV MOVIE,DOCUMENTARY
0,0,3,4,15,14,7,2,1,1,6,0,2,1,0,0,0,0,0,0
1,1,2,0,7,6,3,0,0,0,4,0,1,0,0,0,0,0,0,0
2,1,1,7,7,8,1,0,0,1,0,0,1,0,1,1,0,0,0,0
3,0,4,0,0,0,0,0,1,5,3,0,2,0,0,0,0,0,0,0
4,3,1,1,0,0,0,0,1,0,1,2,0,0,0,0,1,1,0,0
5,1,2,0,2,3,1,0,0,1,0,0,1,0,1,0,0,4,0,0
6,0,1,0,4,5,0,0,3,2,4,0,2,0,1,0,0,0,0,0
7,14,6,10,7,4,5,0,6,4,0,1,3,5,0,0,0,0,3,0
8,7,2,5,6,4,4,1,1,0,1,1,0,0,1,0,0,0,0,0
9,2,1,2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0


In [69]:
df2_afinity_propagation.head(50)

Unnamed: 0,Genre 1,Genre 2,Cluster,# Movies,Freq 1,Freq 2,% Most Relevant,% Second Most Relevant,Key Features
0,ACTION,ADVENTURE,1,15,15,14,100.0,93.333333,"[1980s, adolescent, aspiring, choice, guidance]"
68,THRILLER,HORROR,69,6,6,5,100.0,83.333333,"[changing, facility, cover, challenge, mist]"
198,ACTION,SCIENCE FICTION,199,1,1,1,100.0,100.0,"[home, asian, border, lt, familys]"
86,HORROR,COMEDY,87,4,4,1,100.0,25.0,"[communicate, month, dan, claim, haddonfield]"
84,THRILLER,DOCUMENTARY,85,1,1,0,100.0,0.0,"[lola, detective, amidst, castle, family]"
83,DRAMA,CRIME,84,4,4,4,100.0,100.0,"[collector, band, celeste, employ, dream]"
200,SCIENCE FICTION,ADVENTURE,201,4,4,3,100.0,75.0,"[hope, misery, malicious, comfort, aim]"
150,COMEDY,DOCUMENTARY,151,1,1,0,100.0,0.0,"[eye, laurie, joy, hong, mib]"
213,ACTION,SCIENCE FICTION,214,3,3,3,100.0,100.0,"[impulse, dickens, darcy, connect, interweave]"
71,ADVENTURE,SCIENCE FICTION,72,6,6,6,100.0,100.0,"[chi, circus, aid, log, air]"


In [320]:

df2.tail(50)

Unnamed: 0,Genre,Cluster,# Movies,Freq,% Most Relevant,Key Features
177,DRAMA,178,9,5,55.555556,"[protect, trio, sorcery, square, goo]"
58,ACTION,59,20,11,55.0,"[deranged, truth, bilbo, expose, gift]"
148,DRAMA,149,11,6,54.545455,"[mystery, create, accidentally, transformation..."
197,DRAMA,198,11,6,54.545455,"[shape, toward, serious, intergalactic, commit..."
72,ACTION,73,11,6,54.545455,"[dystopian, competitive, agent, league, clash]"
175,THRILLER,176,11,6,54.545455,"[profound, pawn, slip, nice, value]"
40,ADVENTURE,41,13,7,53.846154,"[conquer, civil, forced, whereabouts, cousin]"
55,ADVENTURE,56,15,8,53.333333,"[debt, gift, brand, superhero, castle]"
128,ADVENTURE,129,15,8,53.333333,"[letter, boyfriend, korean, pole, soul]"
76,ADVENTURE,77,17,9,52.941176,"[edward, blade, emily, unstable, gu]"
