In [1]:
import numpy as np 
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
stemmer = PorterStemmer()
tfidf_vectorizer = TfidfVectorizer()

In [3]:
dataset = pd.read_csv("D:\Datasets\movies_genre.csv\movies_genre.csv")

In [4]:
dataset.head(5)

Unnamed: 0.1,Unnamed: 0,genre_ids,overview,title,Western,Romance,Horror,TV Movie,Fantasy,Drama,...,Music,History,Thriller,Comedy,Crime,Mystery,Action,Animation,Family,Adventure
0,0,"['Drama', 'Crime']","Spanning the years 1945 to 1955, a chronicle o...",The Godfather,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,1,"['Action', 'Adventure', 'Animation', 'Science ...","After reuniting with Gwen Stacy, Brooklyn’s fu...",Spider-Man: Across the Spider-Verse,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
2,2,"['Drama', 'Crime']",Framed in the 1940s for the double murder of h...,The Shawshank Redemption,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,3,"['Drama', 'Crime']",In the continuing saga of the Corleone crime f...,The Godfather Part II,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,4,"['Comedy', 'Drama', 'Romance']","Raj is a rich, carefree, happy-go-lucky second...",Dilwale Dulhania Le Jayenge,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0


In [5]:
print(dataset['genre_ids'].unique().sum())

['Drama', 'Crime']['Action', 'Adventure', 'Animation', 'Science Fiction']['Comedy', 'Drama', 'Romance']['Drama', 'History', 'War']['Animation', 'Family', 'Fantasy']['Drama']['Romance', 'Animation', 'Drama']['Comedy', 'Thriller', 'Drama']['Fantasy', 'Drama', 'Crime']['Drama', 'Action', 'Crime', 'Thriller']['Comedy', 'Fantasy']['Thriller', 'Crime']['Western']['Adventure', 'Fantasy', 'Action']['Action', 'Adventure', 'Animation', 'Drama']['Drama', 'Romance']['Action', 'Drama']['Romance', 'Animation']['Romance']['Comedy', 'Drama']['Animation', 'Drama', 'War']['Animation', 'Family', 'Adventure', 'Fantasy']['Horror', 'Drama', 'Thriller']['Animation', 'Fantasy', 'Action']['Drama', 'Thriller', 'Comedy']['Family', 'Drama']['Romance', 'Drama']['Romance', 'Comedy']['Action', 'Drama', 'History']['Animation', 'Comedy', 'Romance']['Adventure', 'Drama', 'Science Fiction']['Fantasy', 'Animation', 'Adventure']['Animation', 'Drama']['Adventure', 'Action', 'Science Fiction']['Romance', 'Animation', 'Fanta

In [6]:
for i in dataset['genre_ids']:
    print(i)

['Drama', 'Crime']
['Action', 'Adventure', 'Animation', 'Science Fiction']
['Drama', 'Crime']
['Drama', 'Crime']
['Comedy', 'Drama', 'Romance']
['Drama', 'History', 'War']
['Animation', 'Family', 'Fantasy']
['Drama']
['Romance', 'Animation', 'Drama']
['Comedy', 'Thriller', 'Drama']
['Fantasy', 'Drama', 'Crime']
['Drama', 'Action', 'Crime', 'Thriller']
['Comedy', 'Fantasy']
['Thriller', 'Crime']
['Comedy', 'Drama', 'Romance']
['Western']
['Adventure', 'Fantasy', 'Action']
['Action', 'Adventure', 'Animation', 'Drama']
['Drama', 'Crime']
['Drama', 'Romance']
['Action', 'Drama']
['Romance', 'Animation']
['Romance']
['Comedy', 'Drama']
['Animation', 'Drama', 'War']
['Animation', 'Family', 'Adventure', 'Fantasy']
['Horror', 'Drama', 'Thriller']
['Animation', 'Fantasy', 'Action']
['Romance']
['Drama', 'Crime']
['Drama', 'Thriller', 'Comedy']
['Family', 'Drama']
['Romance', 'Drama']
['Drama', 'Crime']
['Drama']
['Romance', 'Comedy']
['Action', 'Drama', 'History']
['Animation', 'Comedy', 'Roman

In [7]:
genre = []

for i in dataset['genre_ids']:
    i = re.sub(r'[\[\]\'\']','',i)
    i = i.split(', ')
    genre.append(i)

In [8]:
dataset_1 = pd.DataFrame(columns=['title_overview','Western','Romance','Horror','TV Movie','Fantasy','Drama','Science Fiction','War','Music','History','Thriller','Comedy','Crime','Mystery','Action','Animation','Family','Adventure'])

In [10]:
movie_tits = []

for i,j in zip(dataset['title'],dataset['overview']):
    text = str(i) +'|'+ str(j)
    text = re.sub('[^a-zA-Z]'," ",text)
    text = text.split(' ')
    text = [txt for txt in text if txt not in set(stopwords.words('english')) and txt !='']
    # text = [stemmer.stem(txt) for txt in text]
    text = ' '.join(text)
    text = text.lower()
    movie_tits.append(text)

In [11]:
for i,movie in enumerate(movie_tits):
    dataset_1.loc[i,'title_overview'] = movie

for i, gen in enumerate(genre):
    dataset_1.loc[i, gen] = 1

In [12]:
dataset_1.drop(columns=[''],inplace=True)

In [13]:
dataset_1.fillna(0, inplace=True)

In [14]:
dataset_1.head()

Unnamed: 0,title_overview,Western,Romance,Horror,TV Movie,Fantasy,Drama,Science Fiction,War,Music,History,Thriller,Comedy,Crime,Mystery,Action,Animation,Family,Adventure
0,the godfather spanning years chronicle fiction...,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
1,spider man across spider verse after reuniting...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1
2,the shawshank redemption framed double murder ...,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3,the godfather part ii in continuing saga corle...,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
4,dilwale dulhania le jayenge raj rich carefree ...,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


In [15]:
dataset_1.to_csv('D:\Datasets\movies_genre_1.csv')