In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importación librerías
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.model_selection import train_test_split,  GridSearchCV

In [3]:
# Carga de datos de archivo .csv
dataTraining = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', encoding='UTF-8', index_col=0)
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', encoding='UTF-8', index_col=0)

In [4]:
# Visualización datos de entrenamiento
dataTraining.head()

Unnamed: 0,year,title,plot,genres,rating
3107,2003,Most,most is the story of a single father who takes...,"['Short', 'Drama']",8.0
900,2008,How to Be a Serial Killer,a serial killer decides to teach the secrets o...,"['Comedy', 'Crime', 'Horror']",5.6
6724,1941,A Woman's Face,"in sweden , a female blackmailer with a disfi...","['Drama', 'Film-Noir', 'Thriller']",7.2
4704,1954,Executive Suite,"in a friday afternoon in new york , the presi...",['Drama'],7.4
2582,1990,Narrow Margin,"in los angeles , the editor of a publishing h...","['Action', 'Crime', 'Thriller']",6.6


In [5]:
#Se renombra la base de Training a Df para el desarrollo del proyect
df = dataTraining

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7895 entries, 3107 to 215
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    7895 non-null   int64  
 1   title   7895 non-null   object 
 2   plot    7895 non-null   object 
 3   genres  7895 non-null   object 
 4   rating  7895 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 370.1+ KB


In [7]:
# busqueda valorres perdidos o NA
df.isna().any()

year      False
title     False
plot      False
genres    False
rating    False
dtype: bool

In [8]:
df['genres'].value_counts()

['Drama']                                                  429
['Comedy']                                                 368
['Comedy', 'Drama', 'Romance']                             306
['Comedy', 'Romance']                                      291
['Comedy', 'Drama']                                        287
                                                          ... 
['Drama', 'Music', 'Mystery', 'Romance', 'Thriller']         1
['Biography', 'Western']                                     1
['Adventure', 'Drama', 'Horror', 'Mystery', 'Thriller']      1
['Action', 'Drama', 'Western']                               1
['Adventure', 'Comedy', 'Romance', 'War']                    1
Name: genres, Length: 1336, dtype: int64

In [9]:
df['rating'].describe()

count    7895.000000
mean        6.402812
std         1.078260
min         1.200000
25%         5.800000
50%         6.500000
75%         7.200000
max         9.300000
Name: rating, dtype: float64

In [10]:
df['genres'] = df['genres'].map(lambda x: eval(x))
mlb = MultiLabelBinarizer()
genres = df['genres']

labels = mlb.fit_transform(genres)
label_classes = mlb.classes_

In [11]:
labels

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0]])

In [12]:
label_classes

array(['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
       'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
       'Romance', 'Sci-Fi', 'Short', 'Sport', 'Thriller', 'War',
       'Western'], dtype=object)

In [13]:
label_data = pd.DataFrame(labels, columns=label_classes)

In [14]:
label_data

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Musical,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7890,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7891,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7892,0,1,0,0,1,0,0,0,0,1,...,1,0,0,1,0,0,0,0,0,0
7893,0,1,1,0,0,0,0,1,1,1,...,0,0,0,0,1,0,0,0,0,0


In [15]:
val = {}
for x in label_classes :
    val.update({x:label_data[x].value_counts()[1]})

In [16]:
sorted_val = sorted(val.items(), key=lambda kv: kv[1], reverse=True)

In [17]:
sorted_val

[('Drama', 3965),
 ('Comedy', 3046),
 ('Thriller', 2024),
 ('Romance', 1892),
 ('Crime', 1447),
 ('Action', 1303),
 ('Adventure', 1024),
 ('Horror', 954),
 ('Mystery', 759),
 ('Sci-Fi', 723),
 ('Fantasy', 707),
 ('Family', 682),
 ('Documentary', 419),
 ('Biography', 373),
 ('War', 348),
 ('Music', 341),
 ('History', 273),
 ('Musical', 271),
 ('Sport', 261),
 ('Animation', 260),
 ('Western', 237),
 ('Film-Noir', 168),
 ('Short', 92),
 ('News', 7)]

In [18]:
val_pd = pd.DataFrame.from_dict(sorted_val, orient='columns')
val_pd.rename(columns={0: "Genre", 1: "Count"}, inplace = True) 

In [19]:
val_pd

Unnamed: 0,Genre,Count
0,Drama,3965
1,Comedy,3046
2,Thriller,2024
3,Romance,1892
4,Crime,1447
5,Action,1303
6,Adventure,1024
7,Horror,954
8,Mystery,759
9,Sci-Fi,723


In [20]:
#Preprocesamiento
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\afval\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\afval\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
import spacy, re
nlp = spacy.load('es_core_news_sm')
wordnet_lemmatizer = WordNetLemmatizer()

def clean_plot(plot):      
    letters_only = re.sub("[^a-zA-Z]", " ", plot) 
    words = letters_only.lower().split()                             
    stops = set(stopwords.words("english"))   
    meaningful_words = [wordnet_lemmatizer.lemmatize(w) for w in words if not w in stops]   
    return( " ".join( meaningful_words ))

df['plot'] = df['plot'].apply(clean_plot)

In [22]:
def lemmatize_text(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc]
    return ' '.join(lemmas)

df['plot'] = df['plot'].apply(lemmatize_text)

In [23]:
# Crear la matriz de características TF-IDF
vectorizer = TfidfVectorizer(max_features = 3250, stop_words ='english', smooth_idf=True, use_idf= True, sublinear_tf=True, norm='l1', analyzer='word')
X = vectorizer.fit_transform(df['plot'])
#vectorizer = TfidfVectorizer(max_features = 3250, stop_words ='english', smooth_idf=True, use_idf= True, sublinear_tf=True, norm='l1', analyzer='word')

In [24]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train_genres, y_test_genres = train_test_split(X, labels, test_size=0.33, random_state=42)

In [25]:
mlp = MLPClassifier(activation='relu', hidden_layer_sizes=(100), random_state=42, verbose = True, max_iter = 80)
mlp.fit(X_train, y_train_genres)

Iteration 1, loss = 16.39840712
Iteration 2, loss = 14.59531188
Iteration 3, loss = 11.58437961
Iteration 4, loss = 8.88365265
Iteration 5, loss = 7.60522088
Iteration 6, loss = 7.18321756
Iteration 7, loss = 7.03405167
Iteration 8, loss = 6.95392829
Iteration 9, loss = 6.89283086
Iteration 10, loss = 6.83380256
Iteration 11, loss = 6.76837413
Iteration 12, loss = 6.69558312
Iteration 13, loss = 6.61616207
Iteration 14, loss = 6.52854764
Iteration 15, loss = 6.43518732
Iteration 16, loss = 6.33898043
Iteration 17, loss = 6.23883096
Iteration 18, loss = 6.13807461
Iteration 19, loss = 6.03698520
Iteration 20, loss = 5.93819555
Iteration 21, loss = 5.84145516
Iteration 22, loss = 5.74715910
Iteration 23, loss = 5.65563719
Iteration 24, loss = 5.56595572
Iteration 25, loss = 5.48133823
Iteration 26, loss = 5.39749113
Iteration 27, loss = 5.31697607
Iteration 28, loss = 5.23830824
Iteration 29, loss = 5.16228842
Iteration 30, loss = 5.08895268
Iteration 31, loss = 5.01801722
Iteration 32, 

In [26]:
# Realizar predicciones en el conjunto de prueba
y_pred_genre = mlp.predict_proba(X_test)

# Impresión del desempeño del modelo
roc_auc_score(y_test_genres, y_pred_genre, average='macro')

0.8603788513042

In [27]:
# transformación variables predictoras X del conjunto de test
X_test_dtm = vectorizer.transform(dataTesting['plot'])

cols = ['p_Action', 'p_Adventure', 'p_Animation', 'p_Biography', 'p_Comedy', 'p_Crime', 'p_Documentary', 'p_Drama', 'p_Family',
        'p_Fantasy', 'p_Film-Noir', 'p_History', 'p_Horror', 'p_Music', 'p_Musical', 'p_Mystery', 'p_News', 'p_Romance',
        'p_Sci-Fi', 'p_Short', 'p_Sport', 'p_Thriller', 'p_War', 'p_Western']

# Predicción del conjunto de test
y_pred_test_genres = mlp.predict_proba(X_test_dtm)

In [28]:
# Guardar predicciones en formato exigido en la competencia de kaggle
res = pd.DataFrame(y_pred_test_genres, index=dataTesting.index, columns=cols)
res.to_csv(r'C:\Users\afval\Documents\MIAD_ML\MIAD_ML_NLP_2023\Semana 7\pred_genres_text_RF_8.csv', index_label='ID')
res.head()

Unnamed: 0,p_Action,p_Adventure,p_Animation,p_Biography,p_Comedy,p_Crime,p_Documentary,p_Drama,p_Family,p_Fantasy,...,p_Musical,p_Mystery,p_News,p_Romance,p_Sci-Fi,p_Short,p_Sport,p_Thriller,p_War,p_Western
1,0.002684,0.010425,0.001408,0.023297,0.632021,0.013638,0.001015,0.742912,0.005141,0.122748,...,0.048782,0.103871,0.00025,0.975682,0.004772,0.002478,0.002893,0.077308,0.003301,0.004201
4,0.010808,0.000905,0.000475,0.109648,0.623568,0.499465,0.031756,0.839369,0.000734,0.001118,...,0.011114,0.009057,0.000579,0.076232,0.00161,0.004622,0.018581,0.039676,0.006191,0.022862
5,0.036551,0.003965,0.000479,0.054188,0.026632,0.735602,0.014204,0.897442,0.001149,0.003491,...,0.004343,0.340881,0.000493,0.085832,0.011707,0.002417,0.005764,0.564132,0.01659,0.013099
6,0.056562,0.031414,0.002093,0.041003,0.035638,0.016105,0.011186,0.833266,0.005127,0.01612,...,0.006296,0.071854,0.000421,0.129151,0.091761,0.002101,0.007533,0.228564,0.090867,0.01779
7,0.013945,0.006209,0.002289,0.010404,0.166656,0.022013,0.021439,0.198405,0.003122,0.032866,...,0.004515,0.100033,0.000337,0.034585,0.42207,0.003923,0.00115,0.16426,0.003127,0.006298
