In [1]:
import pandas as pd
df=pd.read_csv('Overview_genres.csv')

In [2]:
df.head()

Unnamed: 0,imdb_id,overview,tagline,title,Genres
0,tt0083866,After a gentle alien becomes stranded on Earth...,He is afraid. He is alone. He is three million...,E.T. the Extra-Terrestrial,"Family,Sci-Fi"
1,tt0116629,"On July 2, a giant alien mothership enters orb...",Earth. Take a good look. It might be your last.,Independence Day,"Action,Adventure,Sci-Fi"
2,tt0133093,"Set in the 22nd century, The Matrix tells the ...",Welcome to the Real World.,The Matrix,"Action,Sci-Fi"
3,tt0234215,Six months after the events depicted in The Ma...,Free your mind.,The Matrix Reloaded,"Action,Sci-Fi"
4,tt0242653,The human city of Zion defends itself against ...,Everything that has a beginning has an end.,The Matrix Revolutions,"Action,Sci-Fi"


In [3]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [4]:
# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
df['overview'] = [entry.lower() for entry in df['overview']]
df['Genres'] = [entry.lower() for entry in df['Genres']]
# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
df['overview']= [word_tokenize(entry) for entry in df['overview']]
df['Genres']= [word_tokenize(entry) for entry in df['Genres']]
# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(df['overview']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    df.loc[index,'text_final'] = str(Final_words)

In [5]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df['text_final'],df['Genres'],test_size=0.25)

In [6]:
mlb = MultiLabelBinarizer()
Train_Y = pd.DataFrame(mlb.fit_transform(Train_Y))
Test_Y = pd.DataFrame(mlb.fit_transform(Test_Y))

In [7]:
Train_Y

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,1,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
3,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
7,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,1,0,0,0


In [8]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

Train_X_Tfidf

<7826x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 174966 stored elements in Compressed Sparse Row format>

In [9]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [10]:
print(Train_Y.shape)
print(Test_Y.shape)

(7826, 23)
(2609, 23)


In [11]:
#prediction=OneVsRestClassifier(MultinomialNB(
                    #fit_prior=True, class_prior=None)).fit(Train_X_Tfidf, Train_Y).predict(Test_X_Tfidf)
prediction = OneVsRestClassifier(LinearSVC(random_state=3)).fit(Train_X_Tfidf, Train_Y).predict(Test_X_Tfidf)
#prediction.score(Test_X_Tfidf,Test_Y)
print(prediction.shape)

(2609, 23)


In [12]:
def custom_score(prediction, Test_Y):
    B=prediction+Test_Y
    
    Global = np.count_nonzero(B == 2)/np.count_nonzero(Test_Y == 1)*100
    
    a = np.zeros([1,B.shape[0]])
    b = np.zeros([1,B.shape[0]])
    
    for i in range(len(B)):
        a[0,i] = np.count_nonzero(B[i] == 2)
        b[0,i] = np.count_nonzero(Test_Y[i] == 1)
        
    return [Global, 100*np.mean(np.divide(a,b))]

In [13]:
custom_score(prediction, Test_Y.values)

[57.8538283062645, 59.256420084323494]