In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importación librerías
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [3]:
# Carga de datos de archivo .csv
dataTraining = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip', encoding='UTF-8', index_col=0)
dataTesting = pd.read_csv('https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip', encoding='UTF-8', index_col=0)

In [4]:
# Visualización datos de entrenamiento
dataTraining.head()

Unnamed: 0,year,title,plot,genres,rating
3107,2003,Most,most is the story of a single father who takes...,"['Short', 'Drama']",8.0
900,2008,How to Be a Serial Killer,a serial killer decides to teach the secrets o...,"['Comedy', 'Crime', 'Horror']",5.6
6724,1941,A Woman's Face,"in sweden , a female blackmailer with a disfi...","['Drama', 'Film-Noir', 'Thriller']",7.2
4704,1954,Executive Suite,"in a friday afternoon in new york , the presi...",['Drama'],7.4
2582,1990,Narrow Margin,"in los angeles , the editor of a publishing h...","['Action', 'Crime', 'Thriller']",6.6


In [5]:
#Se renombra la base de Training a Df para el desarrollo del proyect
df = dataTraining

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7895 entries, 3107 to 215
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    7895 non-null   int64  
 1   title   7895 non-null   object 
 2   plot    7895 non-null   object 
 3   genres  7895 non-null   object 
 4   rating  7895 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 370.1+ KB


In [7]:
# busqueda valorres perdidos o NA
df.isna().any()

year      False
title     False
plot      False
genres    False
rating    False
dtype: bool

In [8]:
df['genres'].value_counts()

['Drama']                                                               429
['Comedy']                                                              368
['Comedy', 'Drama', 'Romance']                                          306
['Comedy', 'Romance']                                                   291
['Comedy', 'Drama']                                                     287
                                                                       ... 
['Animation', 'Adventure', 'Comedy', 'Family', 'Fantasy', 'Musical']      1
['Comedy', 'Family', 'Music', 'Musical']                                  1
['Adventure', 'Thriller']                                                 1
['Comedy', 'Fantasy', 'Drama']                                            1
['Animation', 'Adventure', 'Drama', 'Family', 'Fantasy', 'Sci-Fi']        1
Name: genres, Length: 1336, dtype: int64

In [9]:
df['rating'].describe()

count    7895.000000
mean        6.402812
std         1.078260
min         1.200000
25%         5.800000
50%         6.500000
75%         7.200000
max         9.300000
Name: rating, dtype: float64

In [10]:
df['genres'] = df['genres'].map(lambda x: eval(x))
mlb = MultiLabelBinarizer()
genres = df['genres']

labels = mlb.fit_transform(genres)
label_classes = mlb.classes_

In [11]:
labels

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0]])

In [12]:
label_classes

array(['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
       'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
       'Romance', 'Sci-Fi', 'Short', 'Sport', 'Thriller', 'War',
       'Western'], dtype=object)

In [13]:
label_data = pd.DataFrame(labels, columns=label_classes)

In [14]:
label_data

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Musical,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7890,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7891,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7892,0,1,0,0,1,0,0,0,0,1,...,1,0,0,1,0,0,0,0,0,0
7893,0,1,1,0,0,0,0,1,1,1,...,0,0,0,0,1,0,0,0,0,0


In [15]:
val = {}
for x in label_classes :
    val.update({x:label_data[x].value_counts()[1]})

In [16]:
sorted_val = sorted(val.items(), key=lambda kv: kv[1], reverse=True)

In [17]:
sorted_val

[('Drama', 3965),
 ('Comedy', 3046),
 ('Thriller', 2024),
 ('Romance', 1892),
 ('Crime', 1447),
 ('Action', 1303),
 ('Adventure', 1024),
 ('Horror', 954),
 ('Mystery', 759),
 ('Sci-Fi', 723),
 ('Fantasy', 707),
 ('Family', 682),
 ('Documentary', 419),
 ('Biography', 373),
 ('War', 348),
 ('Music', 341),
 ('History', 273),
 ('Musical', 271),
 ('Sport', 261),
 ('Animation', 260),
 ('Western', 237),
 ('Film-Noir', 168),
 ('Short', 92),
 ('News', 7)]

In [18]:
val_pd = pd.DataFrame.from_dict(sorted_val, orient='columns')
val_pd.rename(columns={0: "Genre", 1: "Count"}, inplace = True) 

In [19]:
val_pd

Unnamed: 0,Genre,Count
0,Drama,3965
1,Comedy,3046
2,Thriller,2024
3,Romance,1892
4,Crime,1447
5,Action,1303
6,Adventure,1024
7,Horror,954
8,Mystery,759
9,Sci-Fi,723


In [20]:
#Preprocesamiento
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\afval\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\afval\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
import spacy, re
nlp = spacy.load('es_core_news_sm')
wordnet_lemmatizer = WordNetLemmatizer()

def clean_plot(plot):      
    letters_only = re.sub("[^a-zA-Z]", " ", plot) 
    words = letters_only.lower().split()                             
    stops = set(stopwords.words("english"))   
    meaningful_words = [wordnet_lemmatizer.lemmatize(w) for w in words if not w in stops]   
    return( " ".join( meaningful_words ))

df['plot'] = df['plot'].apply(clean_plot)

In [27]:
def lemmatize_text(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc]
    return ' '.join(lemmas)

df['plot'] = df['plot'].apply(lemmatize_text)

In [57]:
# Crear la matriz de características TF-IDF
vectorizer = TfidfVectorizer(max_features = 3250, stop_words ='english', smooth_idf=True, use_idf= True, sublinear_tf=True, norm='l1', analyzer='word')
X = vectorizer.fit_transform(df['plot'])
#vectorizer = TfidfVectorizer(max_features = 3250, stop_words ='english', smooth_idf=True, use_idf= True, sublinear_tf=True, norm='l1', analyzer='word')

In [58]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train_genres, y_test_genres = train_test_split(X, labels, test_size=0.33, random_state=42)

In [59]:
mlp = MLPClassifier(activation='relu', hidden_layer_sizes=(100), random_state=42, verbose = True, max_iter = 80)
mlp.fit(X_train, y_train_genres)

Iteration 1, loss = 16.39921847
Iteration 2, loss = 14.58717236
Iteration 3, loss = 11.54104305
Iteration 4, loss = 8.81932221
Iteration 5, loss = 7.56616314
Iteration 6, loss = 7.17548975
Iteration 7, loss = 7.03696358
Iteration 8, loss = 6.96199452
Iteration 9, loss = 6.90619453
Iteration 10, loss = 6.85582784
Iteration 11, loss = 6.80092222
Iteration 12, loss = 6.74019858
Iteration 13, loss = 6.67341266
Iteration 14, loss = 6.59591744
Iteration 15, loss = 6.50807902
Iteration 16, loss = 6.41366245
Iteration 17, loss = 6.31278720
Iteration 18, loss = 6.20920019
Iteration 19, loss = 6.10435972
Iteration 20, loss = 6.00082003
Iteration 21, loss = 5.89905077
Iteration 22, loss = 5.79965657
Iteration 23, loss = 5.70321720
Iteration 24, loss = 5.60890072
Iteration 25, loss = 5.52051110
Iteration 26, loss = 5.43368146
Iteration 27, loss = 5.35090457
Iteration 28, loss = 5.27039010
Iteration 29, loss = 5.19301396
Iteration 30, loss = 5.11864952
Iteration 31, loss = 5.04708423
Iteration 32, 

In [60]:
# Realizar predicciones en el conjunto de prueba
y_pred_genre = mlp.predict_proba(X_test)

# Impresión del desempeño del modelo
roc_auc_score(y_test_genres, y_pred_genre, average='macro')

0.8603483917067614

In [143]:
# transformación variables predictoras X del conjunto de test
X_test_dtm = vectorizer.transform(dataTesting['plot'])

cols = ['p_Action', 'p_Adventure', 'p_Animation', 'p_Biography', 'p_Comedy', 'p_Crime', 'p_Documentary', 'p_Drama', 'p_Family',
        'p_Fantasy', 'p_Film-Noir', 'p_History', 'p_Horror', 'p_Music', 'p_Musical', 'p_Mystery', 'p_News', 'p_Romance',
        'p_Sci-Fi', 'p_Short', 'p_Sport', 'p_Thriller', 'p_War', 'p_Western']

# Predicción del conjunto de test
y_pred_test_genres = mlp.predict_proba(X_test_dtm)

In [65]:
# Guardar predicciones en formato exigido en la competencia de kaggle
res = pd.DataFrame(y_pred_test_genres, index=dataTesting.index, columns=cols)
res.to_csv(r'C:\Users\afval\Documents\MIAD_ML\MIAD_ML_NLP_2023\Semana 7\pred_genres_text_RF_7.csv', index_label='ID')
res.head()

Unnamed: 0,p_Action,p_Adventure,p_Animation,p_Biography,p_Comedy,p_Crime,p_Documentary,p_Drama,p_Family,p_Fantasy,...,p_Musical,p_Mystery,p_News,p_Romance,p_Sci-Fi,p_Short,p_Sport,p_Thriller,p_War,p_Western
1,0.005466,0.01485,0.001931,0.019259,0.4768,0.012091,0.00275,0.525034,0.007663,0.21767,...,0.073507,0.152863,0.000339,0.98108,0.006442,0.002764,0.001071,0.070699,0.00311,0.004702
4,0.002365,0.000155,7.6e-05,0.120105,0.386929,0.523275,0.008392,0.943161,7.6e-05,8.8e-05,...,0.002817,0.00379,0.000226,0.022294,0.000186,0.001297,0.015079,0.0577,0.008657,0.015159
5,0.018564,0.001337,0.000288,0.066645,0.025571,0.766762,0.017019,0.888047,0.000611,0.001949,...,0.003354,0.215315,0.000565,0.091859,0.00634,0.001776,0.004652,0.430475,0.007712,0.005762
6,0.023116,0.021593,0.001166,0.023622,0.022662,0.006396,0.011398,0.85979,0.003316,0.010224,...,0.005859,0.04775,0.000465,0.082935,0.0606,0.002268,0.004601,0.188634,0.056691,0.010856
7,0.002083,0.001555,0.001241,0.004288,0.131733,0.012011,0.016499,0.235905,0.002023,0.025313,...,0.005115,0.058153,0.000549,0.023811,0.353568,0.003937,0.001438,0.09199,0.001398,0.002557


In [208]:
import spacy

# Cargar el modelo de idioma de spaCy
nlp = spacy.load('es_core_news_sm')

# Función de lematización
def lemmatize_text(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc]
    return ' '.join(lemmas)

# Ejemplo de texto
texto = "Los gatos están saltando en el jardín."

# Lematizar el texto
texto_lemmatizado = lemmatize_text(texto)

# Imprimir el texto lematizado
print(texto_lemmatizado)

el gato estar saltar en el jardín .
