In [1]:
import pandas as pd
import numpy as np

import nltk
import re
import csv

import warnings
warnings.filterwarnings('ignore', message=r'Label not .* is present in all training examples.')

pd.set_option('display.max_colwidth', 300)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import f1_score, accuracy_score

In [3]:
data = pd.read_csv('../input/netflix-shows/netflix_titles_nov_2019.csv')
data.head(1)

data = data[['title', 'description', 'listed_in']]

In [4]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    no_stpwrd_txt = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stpwrd_txt)

data['description'] = data['description'].apply(lambda x: remove_stopwords(x))
data.head(1)

Unnamed: 0,title,description,listed_in
0,Chocolate,"Brought together meaningful meals past present, doctor chef reacquainted begin working hospice ward.","International TV Shows, Korean TV Shows, Romantic TV Shows"


In [5]:
data.listed_in = data.listed_in.str.split(',')
data.head()

Unnamed: 0,title,description,listed_in
0,Chocolate,"Brought together meaningful meals past present, doctor chef reacquainted begin working hospice ward.","[International TV Shows, Korean TV Shows, Romantic TV Shows]"
1,Guatemala: Heart of the Mayan World,"From Sierra de las Minas Esquipulas, explore Guatemala's cultural geological wealth, including ancient Mayan cities natural wonders.","[Documentaries, International Movies]"
2,The Zoya Factor,"A goofy copywriter unwittingly convinces Indian cricket team she’s lucky mascot, dismay superstition-shunning captain.","[Comedies, Dramas, International Movies]"
3,Atlantics,"Arranged marry rich man, young Ada crushed true love goes missing sea migration attempt – miracle reunites them.","[Dramas, Independent Movies, International Movies]"
4,Chip and Potato,"Lovable pug Chip starts kindergarten, makes new friends tries new things – little help Potato, secret mouse pal.",[Kids' TV]


In [6]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(data['listed_in'])

y = multilabel_binarizer.transform(data['listed_in'])

In [7]:
x_train, x_val, ytrain, yval = train_test_split(data['description'],
                                                y, test_size=0.2)

In [8]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, max_features=5000)
xtrain = tfidf_vectorizer.fit_transform(x_train)
xval = tfidf_vectorizer.transform(x_val)

In [9]:
lr = DecisionTreeClassifier()
classifier = OneVsRestClassifier(lr)
classifier.fit(xtrain, ytrain)

OneVsRestClassifier(estimator=DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     presort=False,
                                                     random_state=None,
                                                     splitter='best'),
                    n_jobs=None)

In [10]:
y_pred = classifier.predict(xval)
f1_score(yval, y_pred, average="micro"), accuracy_score(yval, y_pred)

(0.2201352366641623, 0.023972602739726026)

In [11]:
pred_prob = classifier.predict_proba(xval)
t = 0.3
predp = (pred_prob >= t).astype(int)
f1_score(yval, predp, average="micro"), accuracy_score(yval, predp)

(0.2201352366641623, 0.023972602739726026)

In [12]:
def predict(m):
    m = remove_stopwords(m)
    m_vec = tfidf_vectorizer.transform([m])
    pred_prob = classifier.predict_proba(m_vec)
    t = 0.3
    predp = (pred_prob >= t).astype(int)
    #m_pred = classifier.predict(m_vec)
    return multilabel_binarizer.inverse_transform(predp)

In [13]:
title = []
actual_list = []
predicted_list = []
for i in range(10):
    k = x_val.sample(1).index[0]
    title.append(data['title'][k])
    predicted_list.append(predict(x_val[k]))
    actual_list.append(data['listed_in'][k])

dicts = {'Title': title,'Actual-Show-Category':actual_list,
        'Predicted Show Category': predicted_list}
df = pd.DataFrame(data=dicts)
df.head(10)

Unnamed: 0,Title,Actual-Show-Category,Predicted Show Category
0,Nature: Animals With Cameras,"[Docuseries, Science & Nature TV]",[()]
1,Albion: The Enchanted Stallion,[Children & Family Movies],"[( International Movies, Music & Musicals, Action & Adventure)]"
2,Wadi,"[International TV Shows, TV Dramas]","[( International Movies, Romantic Movies, TV Comedies)]"
3,Defiance,"[Action & Adventure, Dramas]","[( Dramas,)]"
4,The Good Catholic,"[Comedies, Dramas, Romantic Movies]","[( Dramas, International TV Shows, TV Comedies, TV Dramas, Comedies, Crime TV Shows, International TV Shows)]"
5,The Republic of Imbaba,"[Dramas, International Movies]","[( International Movies, Dramas)]"
6,The Judgement,"[Crime TV Shows, International TV Shows, TV Dramas]","[(Dramas,)]"
7,JoJo's Bizarre Adventure,"[Anime Series, International TV Shows]","[( International Movies, Thrillers, Dramas, International TV Shows)]"
8,One by Two,"[Comedies, Dramas, International Movies]","[(Comedies,)]"
9,Meghnadbodh Rohoshyo,"[Dramas, International Movies]","[( Comedies,)]"
