In [44]:
import pandas as pd
import nltk
import json
import string
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Antonio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
pd.set_option('mode.chained_assignment',None)

## Carregando o Dataset

In [4]:
data_set = pd.read_csv("../../dados/movies_metadata.csv")

  data_set = pd.read_csv("../../dados/movies_metadata.csv")


In [5]:
data_set = data_set[["genres", "overview", "original_title", "original_language"]]

In [6]:
movies = data_set.query("original_language == 'en'")

## Acessando o apenas o primeiro genero

In [7]:
def extrair_primeiro_genero( texto ):
    # print(texto)
    txt = texto.replace("'", '"')
    lista = json.loads(txt)
    if len(lista):
        return lista[0]['name']
    else:
        return "Unknown"

In [8]:
movies["genero"] = movies["genres"].apply(extrair_primeiro_genero)

In [9]:
lista_generos = list(movies["genero"].unique())

In [10]:
def extrair_genero_id( genero ):
    global lista_generos
    num_genero = 11
    try:
        num_genero = lista_generos.index(genero)
    except: 
        pass
    return num_genero

In [11]:
movies["genero_id"] = movies["genero"].apply(extrair_genero_id)

## Limpar o texto de descrição do filme (overview)

### Remover os caracteres com \n \r

In [12]:
movies["overview"].dropna(inplace=True)

In [13]:
trans = str.maketrans('\n\r\t', '   ')
movies["overview_no_enter"] = movies["overview"].str.translate(trans)

In [14]:
movies

Unnamed: 0,genres,overview,original_title,original_language,genero,genero_id,overview_no_enter
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",Toy Story,en,Animation,0,"Led by Woody, Andy's toys live happily in his ..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,Jumanji,en,Adventure,1,When siblings Judy and Peter discover an encha...
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Grumpier Old Men,en,Romance,2,A family wedding reignites the ancient feud be...
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale,en,Comedy,3,"Cheated on, mistreated and stepped on, the wom..."
4,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Father of the Bride Part II,en,Comedy,3,Just when George Banks has recovered from his ...
...,...,...,...,...,...,...,...
45459,"[{'id': 878, 'name': 'Science Fiction'}]",It's the year 3000 AD. The world's most danger...,Caged Heat 3000,en,Science Fiction,9,It's the year 3000 AD. The world's most danger...
45460,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...","Yet another version of the classic epic, with ...",Robin Hood,en,Drama,7,"Yet another version of the classic epic, with ..."
45463,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...","When one of her hits goes wrong, a professiona...",Betrayal,en,Action,4,"When one of her hits goes wrong, a professiona..."
45464,[],"In a small town live two brothers, one a minis...",Satana likuyushchiy,en,Unknown,11,"In a small town live two brothers, one a minis..."


### Remover contrações textuais do inglês

In [15]:
contraction_dict = {
                    "aren't": "are not",
                    "can't": "can not",
                    "could've": "could have",
                    "couldn't": "could not",
                    "daren't": "dare not",
                    "didn't": "did not",
                    "doesn't": "does not",
                    "don't": "do not",
                    "hadn't": "had not",
                    "hasn't": "has not",
                    "haven't": "have not",
                    "he's": "he is",
                    "how'd": "how had",
                    "how're": "how are",
                    "how's": "how is",
                    "how've": "how have",
                    "i'd": "i had",
                    "i'm": "i am",
                    "i've": "i have",
                    "isn't": "is+ not",
                    "it's": "it is",
                    "might've": "might have",
                    "mightn't": "might not",
                    "must've": "must have",
                    "mustn't": "must not",
                    "needn't": "need not",
                    "oughtn't": "ought not",
                    "shan't": "shall not",
                    "she'd": "she had",
                    "she's": "she is",
                    "should've": "should have",
                    "shouldn't": "should not",
                    "that'd": "that had",
                    "thats's": "that is",
                    "there'd": "there had",
                    "there's": "there is",
                    "they'd": "they had",
                    "they're": "you are",
                    "they've": "they have",
                    "wasn't": "was+ not",
                    "we'd": "we had",
                    "we're": "we are",
                    "we've": "we have",
                    "weren't": "were not",
                    "what'd": "what had",
                    "what're": "what are",
                    "what's": "what is",
                    "what've": "what have",
                    "when'd": "when had",
                    "when're": "when are",
                    "when's": "when is",
                    "when've": "when have",
                    "where'd": "where had",
                    "where're": "where are",
                    "where's": "where is",
                    "where've": "where have",
                    "who'd": "who had",
                    "who're": "who are",
                    "who's": "who is",
                    "who've": "who have",
                    "why'd": "why had",
                    "why're": "why are",
                    "why's": "why is",
                    "why've": "why have",
                    "would've": "would have",
                    "wouldn't": "would not",
                    "you're": "you are",
                    "you've": "you have",
                    "'cause": "because", 
                    "ain't": "is not", 
                    "aren't": "are not",
                    "can't": "cannot", 
                    "could've": "could have",
                    "he's": "he is",
                    "how'll": "how will",
                    "i'll": "i will",
                    "it'll": "it will",
                    "it's": "it is", 
                    "she'll": "she will",
                    "she's": "she is",
                    "that'll": "that will",
                    "there'll": "there will",
                    "they'll": "they will",
                    "they're": "they are",
                    "we'll": "we will",
                    "we're": "we are",
                    "what'll": "what will",
                    "when'll": "when will",
                    "where'll": "where will",
                    "who'll": "who will",
                    "yo're": "you are",
                    "you'll": "you will"
}

In [16]:
def remove_contractions( text ):
    # print(text)
    # print(type(text))
    if (type(text) == str):
        text = text.lower()
        # print(text)
        for chave in list(contraction_dict.keys()):
            valor = contraction_dict[chave]
            text = text.replace(chave, valor)
        text = text.replace("\'s", " have")        
        return text
    else:
        return ""

In [17]:
movies["overview_no_contraction"] = movies["overview_no_enter"].apply(remove_contractions)

In [18]:
movies

Unnamed: 0,genres,overview,original_title,original_language,genero,genero_id,overview_no_enter,overview_no_contraction
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",Toy Story,en,Animation,0,"Led by Woody, Andy's toys live happily in his ...","led by woody, andy have toys live happily in h..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,Jumanji,en,Adventure,1,When siblings Judy and Peter discover an encha...,when siblings judy and peter discover an encha...
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Grumpier Old Men,en,Romance,2,A family wedding reignites the ancient feud be...,a family wedding reignites the ancient feud be...
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale,en,Comedy,3,"Cheated on, mistreated and stepped on, the wom...","cheated on, mistreated and stepped on, the wom..."
4,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Father of the Bride Part II,en,Comedy,3,Just when George Banks has recovered from his ...,just when george banks has recovered from his ...
...,...,...,...,...,...,...,...,...
45459,"[{'id': 878, 'name': 'Science Fiction'}]",It's the year 3000 AD. The world's most danger...,Caged Heat 3000,en,Science Fiction,9,It's the year 3000 AD. The world's most danger...,it is the year 3000 ad. the world have most da...
45460,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...","Yet another version of the classic epic, with ...",Robin Hood,en,Drama,7,"Yet another version of the classic epic, with ...","yet another version of the classic epic, with ..."
45463,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...","When one of her hits goes wrong, a professiona...",Betrayal,en,Action,4,"When one of her hits goes wrong, a professiona...","when one of her hits goes wrong, a professiona..."
45464,[],"In a small town live two brothers, one a minis...",Satana likuyushchiy,en,Unknown,11,"In a small town live two brothers, one a minis...","in a small town live two brothers, one a minis..."


### Remover as pontuções

In [19]:
translator = str.maketrans('', '', string.punctuation)
movies["overview_no_punctuation"]  = movies["overview_no_contraction"] .str.translate(translator)

In [20]:
movies

Unnamed: 0,genres,overview,original_title,original_language,genero,genero_id,overview_no_enter,overview_no_contraction,overview_no_punctuation
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",Toy Story,en,Animation,0,"Led by Woody, Andy's toys live happily in his ...","led by woody, andy have toys live happily in h...",led by woody andy have toys live happily in hi...
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,Jumanji,en,Adventure,1,When siblings Judy and Peter discover an encha...,when siblings judy and peter discover an encha...,when siblings judy and peter discover an encha...
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Grumpier Old Men,en,Romance,2,A family wedding reignites the ancient feud be...,a family wedding reignites the ancient feud be...,a family wedding reignites the ancient feud be...
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale,en,Comedy,3,"Cheated on, mistreated and stepped on, the wom...","cheated on, mistreated and stepped on, the wom...",cheated on mistreated and stepped on the women...
4,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Father of the Bride Part II,en,Comedy,3,Just when George Banks has recovered from his ...,just when george banks has recovered from his ...,just when george banks has recovered from his ...
...,...,...,...,...,...,...,...,...,...
45459,"[{'id': 878, 'name': 'Science Fiction'}]",It's the year 3000 AD. The world's most danger...,Caged Heat 3000,en,Science Fiction,9,It's the year 3000 AD. The world's most danger...,it is the year 3000 ad. the world have most da...,it is the year 3000 ad the world have most dan...
45460,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...","Yet another version of the classic epic, with ...",Robin Hood,en,Drama,7,"Yet another version of the classic epic, with ...","yet another version of the classic epic, with ...",yet another version of the classic epic with e...
45463,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...","When one of her hits goes wrong, a professiona...",Betrayal,en,Action,4,"When one of her hits goes wrong, a professiona...","when one of her hits goes wrong, a professiona...",when one of her hits goes wrong a professional...
45464,[],"In a small town live two brothers, one a minis...",Satana likuyushchiy,en,Unknown,11,"In a small town live two brothers, one a minis...","in a small town live two brothers, one a minis...",in a small town live two brothers one a minist...


In [21]:
stopwords = nltk.corpus.stopwords.words('english')

In [22]:
def remove_stop_words( text ):
    nova_lista = []
    palavras = text.split(" ")
    for palavra in palavras:
        if palavra not in stopwords:
            nova_lista.append(palavra)
    return " ".join(nova_lista)

In [23]:
movies["overview_no_punctuation"][0]

'led by woody andy have toys live happily in his room until andy have birthday brings buzz lightyear onto the scene afraid of losing his place in andy have heart woody plots against buzz but when circumstances separate buzz and woody from their owner the duo eventually learns to put aside their differences'

In [24]:
movies["overview_no_stops"] = movies["overview_no_punctuation"].apply(remove_stop_words)

In [25]:
def stemmer_func( texto ):
    # print("Texto Recebido: (Tipo) ", type(texto))
    if isinstance(texto, str):
        lista_palavras = texto.split(" ")
        # print("Lista Palavras: ", lista_palavras)
        nova_lista = []
        for palavra in lista_palavras:
            stemmed = stemmer.stem( palavra )
            nova_lista.append(stemmed)
        return " ".join(nova_lista)
    else:
        return ""

In [26]:
stemmer = nltk.PorterStemmer()
movies["overview_stemmed"] = movies["overview_no_stops"].apply(stemmer_func)

In [31]:
def pre_processa( texto ):
    novo_texto = texto.lower().split(" ")
    return str(novo_texto)

In [32]:
textos = movies["overview_stemmed"].apply(pre_processa)
textos

0        ['led', 'woodi', 'andi', 'toy', 'live', 'happi...
1        ['sibl', 'judi', 'peter', 'discov', 'enchant',...
2        ['famili', 'wed', 'reignit', 'ancient', 'feud'...
3        ['cheat', 'mistreat', 'step', 'women', 'hold',...
4        ['georg', 'bank', 'recov', 'daughter', 'wed', ...
                               ...                        
45459    ['year', '3000', 'ad', 'world', 'danger', 'wom...
45460    ['yet', 'anoth', 'version', 'classic', 'epic',...
45463    ['one', 'hit', 'goe', 'wrong', 'profession', '...
45464    ['small', 'town', 'live', 'two', 'brother', 'o...
45465    ['50', 'year', 'decriminalis', 'homosexu', 'uk...
Name: overview_stemmed, Length: 32269, dtype: object

In [39]:
train_X, test_X, train_Y, test_Y = train_test_split(textos, movies["genero"], test_size=0.25, random_state=50)

In [40]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(train_Y)
Test_Y = Encoder.fit_transform(test_Y)

In [35]:
vetorizador = TfidfVectorizer(max_features=5000)
vetorizador.fit(textos)
x_train_bow = vetorizador.transform(train_X)
x_test_bow = vetorizador.transform(test_X)

In [36]:
# vetorizador = TfidfVectorizer()
# bow = vetorizador.fit_transform(textos)
# vocabulario = vetorizador.get_feature_names_out()
# bow_matrix = pd.DataFrame.sparse.from_spmatrix(bow, columns=vetorizador.get_feature_names_out())

In [37]:
vetorizador.vocabulary_

{'led': 2632,
 'woodi': 4942,
 'andi': 272,
 'toy': 4571,
 'live': 2692,
 'happili': 2074,
 'room': 3852,
 'birthday': 543,
 'bring': 638,
 'buzz': 693,
 'onto': 3204,
 'scene': 3931,
 'afraid': 178,
 'lose': 2722,
 'place': 3392,
 'heart': 2120,
 'plot': 3411,
 'circumst': 865,
 'separ': 3999,
 'owner': 3256,
 'duo': 1442,
 'eventu': 1607,
 'learn': 2629,
 'put': 3586,
 'asid': 359,
 'differ': 1301,
 'sibl': 4076,
 'judi': 2502,
 'peter': 3356,
 'discov': 1327,
 'enchant': 1534,
 'board': 573,
 'game': 1903,
 'open': 3206,
 'door': 1385,
 'magic': 2768,
 'world': 4948,
 'unwittingli': 4719,
 'invit': 2401,
 'alan': 205,
 'adult': 163,
 'trap': 4596,
 'insid': 2351,
 '26': 94,
 'year': 4978,
 'hope': 2204,
 'freedom': 1865,
 'finish': 1759,
 'prove': 3556,
 'riski': 3814,
 'three': 4512,
 'find': 1756,
 'run': 3871,
 'giant': 1939,
 'evil': 1618,
 'monkey': 2987,
 'terrifi': 4472,
 'creatur': 1100,
 'famili': 1689,
 'wed': 4872,
 'ancient': 269,
 'feud': 1735,
 'neighbor': 3081,
 'fish

In [41]:
print(x_train_bow)

  (0, 4965)	0.10512108856916083
  (0, 4925)	0.10274733408881594
  (0, 4874)	0.11507076667457654
  (0, 4772)	0.10213174127147731
  (0, 4680)	0.14375015480198933
  (0, 4627)	0.08686678416911767
  (0, 4530)	0.07030753800009272
  (0, 4517)	0.24144160788486246
  (0, 4514)	0.11376424022318302
  (0, 4500)	0.0862476730797607
  (0, 4469)	0.13565975726040685
  (0, 4423)	0.14882574116821012
  (0, 4386)	0.1322965892294699
  (0, 4372)	0.12789760301384373
  (0, 4345)	0.11050213765745678
  (0, 4298)	0.20094743786217448
  (0, 4245)	0.1531425804643667
  (0, 4186)	0.08146429022074346
  (0, 4126)	0.1441258087502679
  (0, 4110)	0.11043554354533942
  (0, 4103)	0.11899043822448764
  (0, 4079)	0.10166143904920417
  (0, 4058)	0.11246641761041065
  (0, 3975)	0.08777235840609802
  (0, 3960)	0.09131183826855627
  :	:
  (24199, 2023)	0.09570169922981667
  (24199, 1963)	0.13652706840782736
  (24199, 1765)	0.09100686118661386
  (24199, 1716)	0.11149344629269466
  (24199, 1553)	0.1297215272671445
  (24199, 1488)	0.1

In [45]:
# fit the training dataset on the NB classifier
Naive = MultinomialNB()
Naive.fit(x_train_bow,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(x_test_bow)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  42.03024293505206


In [None]:
# model = LinearSVC()
# model = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
# model.fit(x_train_bow, Train_Y)

In [None]:
# acuracia = model.score(Test_Y, test_Y)

In [None]:
# acuracia

In [None]:
# pd.concat([pd.Series(movies["genero"].unique()), pd.Series(movies["genero_id"].unique())], axis=1)

In [None]:
# text = "hounted house in the mid night ghost and souls"

In [None]:
# vetorizador = TfidfVectorizer(lowercase=False, max_features=200, vocabulary=vocabulario)
# bow = vetorizador.fit_transform([text])
# bow_matrix = pd.DataFrame.sparse.from_spmatrix(bow, columns=vetorizador.get_feature_names_out())

In [None]:
# model.predict(bow_matrix)

In [None]:
# movies["genero_category"] = movies["genero"].factorize()[0]


In [None]:
# classes = pd.DataFrame(movies["genero"].unique()).values
# classes

In [None]:
# movies["genero"].merge(movies[])

In [None]:
# classes_id = pd.DataFrame(movies[["genero"], ["genero_category"]].unique()).values
# classes_id

In [None]:
# classes_grouped = movies.groupby('genero').count().sort_values(ascending=True, by="genero_category")

In [None]:
# classes_grouped.plot.barh(y='genero_category')