In [25]:
# -*- coding: utf-8 -*-
"""
@author:Abhishek Mukherjee
Email: abhi0787@gmail.com
Email: amukher3@rocket.utoledo.edu

"""
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint


In [26]:
df=pd.read_csv("C:/Users/abhi0/Downloads/netflix_titles.csv")

def dataCleaninig(df):
    #Original show categories(non-unique):
    #A list to iterate over: 
    showCat=list(df['listed_in'])
    #Description for the shows
    showDes=list(df['description'])
    Idx=[]

    #Number of original categories(unique): 
    oriCat= list(df['listed_in'].unique())
    numOriCat=len(oriCat)
    
    #A histogram for the count of
    #different categories;
    #plt.hist(showCat,bins=numOriCat)

    #Counting the categories:
    print(df['listed_in'].value_counts())

    #Major categories to be included: 
    #Children/Family/Kids,Comedy,International,korean,British,Spanish,Sci-Fi/Fantasy,
    #Thriller,Dramas,Horror,Romantic,Musical/Music,Anime,Documentaries,Science and Nature,
    #Mysteries,crime,Cult,Talk shows,sports,Classic,LGBTQ,Independent,Teen,Docuseries,
    #Faith & Spirituality. 

    #Categories to be included: 
    categories=['Documentaries',
                'Comedy|Comedies',
                'International',
                'Independent',
                'Children|children|Childrens|childrens|Family|family|Kids|kids',
                'Korean|korean',
                'British|british',
                'Spanish|spanish',
                'Sci-Fi|sci-fi',
                'Fantasy|fantasy',
                'Thriller|thriller',
                'Dramas|dramas',
                'Horror|horror',
                'Romantic|romantic',
                'Musical|musical|Music|music',
                'Anime|anime',
                'Science and Nature|science and nature',
                'Mysteries|mysteries',
                'crime|Crime',
                'Cult|cult',
                'Talk shows',
                'sports|Sports',
                'Classic|classic',
                'LGBTQ',
                'Teen|teen',
                'Docuseries|docuseries',
                'Faith & Spirituality']

    labeledCategories=['Documentaries',
                       'Comedy',
                       'International',
                       'Independent',
                       'Children and Kids',
                       'Korean',
                       'British',
                       'Spanish',
                       'Sci-Fi',
                       'Fantasy',
                       'Thriller',
                       'Dramas',
                       'Horror',
                       'Romantic',
                       'Musical',
                       'Anime',
                       'Science and Nature',
                       'Mysteries',
                       'Crime',
                       'Cult',
                       'Talk shows',
                       'Sports',
                       'Classic',
                       'LGBTQ',
                       'Teen',
                       'Docuseries',
                       'Faith & Spirituality']

    for i in range(len(showCat)):
        Flag='False'  
        for j in range(len(categories)):
            if(re.search(categories[j],showCat[i])):
                Idx.append(j)
                showCat[i]=labeledCategories[j]
                Flag='True'
        if Flag!='True':
            showCat[i]='Misc. Category'
        
    ##plottting the histogram after reducing the labels:
    #plt.figure()        
    #plt.hist(showCat,bins=len(set(showCat)))    
    
    showDes=pd.Series(showDes)
    showCat=pd.Series(showCat)

    df_Final=pd.concat([showDes,showCat],axis=1)

    df_Final.columns=['Description','Category']
    
    
    return df_Final,labeledCategories

df_Final,labeledCategories=dataCleaninig(df)


Documentaries                                                       299
Stand-Up Comedy                                                     273
Dramas, International Movies                                        248
Dramas, Independent Movies, International Movies                    186
Comedies, Dramas, International Movies                              174
                                                                   ... 
Reality TV, Science & Nature TV                                       1
Children & Family Movies, Documentaries, Sports Movies                1
Action & Adventure, Cult Movies, Dramas                               1
Action & Adventure, Children & Family Movies, Independent Movies      1
Action & Adventure, Comedies, Music & Musicals                        1
Name: listed_in, Length: 461, dtype: int64


In [27]:


label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df_Final['Category'])

# one hot encode
encoded_labels = to_categorical(integer_encoded)
print(encoded_labels)
print(type(encoded_labels))


#Labels encoded finally in the labels
#due to intersection of words some labels
#would be taken in the other category. 
#Hence, we start with the category with 
#highest number. 


numCat_final=len(df_Final['Category'].unique())
print(numCat_final)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<class 'numpy.ndarray'>
22


In [28]:

docs = df_Final['Description']
labels = encoded_labels

X_train, X_test , y_train, y_test = train_test_split(docs, labels , test_size = 0.20)

vocab_size = 500

X_train = [one_hot(d, vocab_size,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',lower=True, split=' ') for d in X_train]
X_test = [one_hot(d, vocab_size,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',lower=True, split=' ') for d in X_test]


print(X_train[1])

[190, 173, 440, 183, 360, 308, 302, 119, 442, 492, 46, 355, 258, 119, 60, 175, 334, 401, 456, 471, 372, 289]


In [29]:
max_length = 100
X_train = pad_sequences(X_train, maxlen=max_length, padding='pre')
X_test = pad_sequences(X_test, maxlen=max_length, padding='pre')
print(len(labeledCategories))

27


In [30]:
model = Sequential()
model.add(Embedding(vocab_size, 300, input_length=max_length))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(numCat_final, activation='softmax'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

mdl_chk=\
ModelCheckpoint('C:/Users/abhi0/OneDrive/Documents/show_category._prediction_from_show_description/BestModel.h5',\
                monitor='val_loss',\
                verbose=1,\
                save_best_only=True,\
                save_weights_only=True)

print(model.summary())

model.fit(X_train, y_train, epochs=10,validation_split=0.3, callbacks=[mdl_chk],verbose=1)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 300)          150000    
_________________________________________________________________
flatten_3 (Flatten)          (None, 30000)             0         
_________________________________________________________________
dense_7 (Dense)              (None, 512)               15360512  
_________________________________________________________________
dense_8 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_9 (Dense)              (None, 22)                5654      
Total params: 15,647,494
Trainable params: 15,647,494
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 3490 samples, validate on 1497 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.12656, saving model to C:/Users/abhi0/OneDrive/Documents/show_category._prediction_from_show_description/BestModel.h5
Epoch 2/10

Epoch 00002: val_loss did not improve from 0.12656
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.12656
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.12656
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.12656
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.12656
Epoch 7/10

Epoch 00007: val_loss did not improve from 0.12656
Epoch 8/10

Epoch 00008: val_loss did not improve from 0.12656
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.12656
Epoch 10/10

Epoch 00010: val_loss did not improve from 0.12656


<keras.callbacks.callbacks.History at 0x14c2e9390c8>

'' was not found in history, as a file, url, nor in the user namespace.
