# Prediccion del Genero de una Pelicula

## Solucion Bag of Words

In [1]:
import pandas as pd

In [2]:
# Download the training and test splits of the movie dataset. Both are Parquet
# files fetched over HTTPS from GitHub and parsed with the pyarrow engine.
TRAIN_URL = 'https://github.com/amiune/freecodingtour/raw/main/cursos/espanol/deeplearning/data/train.parquet'
TEST_URL = 'https://github.com/amiune/freecodingtour/raw/main/cursos/espanol/deeplearning/data/test.parquet'

df_train, df_test = (
    pd.read_parquet(url, engine='pyarrow') for url in (TRAIN_URL, TEST_URL)
)

In [5]:
# Distinct genre labels found in the training data, in order of first appearance.
genres_list = df_train["genre"].unique().tolist()
genres_list

['fantasy',
 'horror',
 'family',
 'scifi',
 'action',
 'crime',
 'adventure',
 'mystery',
 'romance',
 'thriller']

### Concateno el titulo y la sinopsis 

In [9]:
# Model input: lower-cased title and lower-cased synopsis joined by one space.
# Missing values are replaced with "" first. Without that, a NaN in either
# column turns the whole concatenation into NaN, and NaN is not a valid text
# document for the vectorizers fitted later in the notebook.
df_train["text"] = (
    df_train["movie_name"].fillna("").str.lower()
    + " "
    + df_train["synopsis"].fillna("").str.lower()
)
df_train.head()

Unnamed: 0,id,movie_name,synopsis,genre,text
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy,super me a young scriptwriter starts bringing ...
1,50185,Entity Project,A director and her friends renting a haunted h...,horror,entity project a director and her friends rent...
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family,behavioral family therapy for serious psychiat...
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi,blood glacier scientists working in the austri...
4,2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action,apat na anino buy day - four men widely - apar...


# Entrenar y evaluar distintos modelos

### Divido el dataset en entrenamiento y evaluacion

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for validation. The split is stratified on
# the genre column and uses a fixed seed so it is repeatable.
texts, genres = df_train["text"], df_train["genre"]
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    genres,
    test_size=0.30,
    stratify=genres,
    random_state=42,
)

In [11]:
# Label-based lookup: shows the training text whose row label is 0. After the
# shuffled split this is not necessarily the first row of X_train, and the
# lookup raises KeyError if label 0 was assigned to the validation part.
X_train.loc[0]

'super me a young scriptwriter starts bringing valuable objects back from his short nightmares of being chased by a demon. selling them makes him rich.'

### Creo el bag of words utilizando CountVectorizer

In [12]:
# Disabled alternative: build the bag of words with CountVectorizer instead of
# the TfidfVectorizer that the notebook actually uses. Nothing in this cell runs.
#from sklearn.feature_extraction.text import CountVectorizer
#vectorizer = CountVectorizer(max_features=8000, stop_words='english')
#X_train_vect = vectorizer.fit_transform(X_train)

### Creo el bag of words utilizando TfidfVectorizer

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the training texts only (vocabulary capped at 8000
# features, English stop words removed) and encode those same texts with it.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=8000,
)
X_train_vect = vectorizer.fit_transform(X_train)

In [14]:
# The encoded texts are stored as a sparse matrix. For a list of scikit-learn
# classifiers that accept sparse input, see:
# https://datascience.stackexchange.com/questions/73311/which-of-the-scikit-learn-classification-algorithms-accept-sparse-matrices
# Inspect row 0 of the encoded training matrix (positional, unlike X_train[0]).
X_train_vect[0, :]

<1x8000 sparse matrix of type '<class 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>

### Creo un modelo de regresion logistica y lo evaluo

In [15]:
# Disabled alternative classifier: logistic regression fitted on the same
# vectorized training data. Kept for reference only; nothing in this cell runs.
#from sklearn.linear_model import LogisticRegression
#clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train_vect, y_train)
#clf.predict(X_train_vect[:2, :])

### Creo un modelo de Multinomial Naive Bayes y lo evaluo

In [16]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier (smoothing parameter alpha=10) on the
# vectorized training texts, then sanity-check it on the first two training rows.
clf = MultinomialNB(alpha=10)
clf.fit(X_train_vect, y_train)
clf.predict(X_train_vect[:2])

array(['scifi', 'crime'], dtype='<U9')

In [17]:
from sklearn.metrics import accuracy_score

# Accuracy on the same rows the classifier was fitted on.
# NOTE: the variable keeps its original spelling ("tain") so that any other
# cell referring to it continues to work.
y_tain_pred = clf.predict(X_train_vect)
train_accuracy = accuracy_score(y_train, y_tain_pred)
print("Train accuracy: ", train_accuracy)

Train accuracy:  0.4617724867724868


In [18]:
# Encode the held-out texts with the already-fitted vectorizer (transform only,
# no refit) and measure accuracy on them.
X_val_vect = vectorizer.transform(X_val)
y_val_pred = clf.predict(X_val_vect)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation accuracy: ", val_accuracy)

Validation accuracy:  0.3738888888888889


# Entrenar de nuevo con toda la base de datos

In [19]:
# Refit the same pipeline (TF-IDF with 8000 features, then MultinomialNB with
# alpha=10) on every labelled row. This rebinds `vectorizer` and `clf`, so the
# versions fitted on the 70% split are no longer available after this cell.
vectorizer = TfidfVectorizer(stop_words='english', max_features=8000)
df_train_vect = vectorizer.fit_transform(df_train["text"])

clf = MultinomialNB(alpha=10)
clf.fit(df_train_vect, df_train["genre"])
clf.predict(df_train_vect[:2])

array(['horror', 'horror'], dtype='<U9')

# Predecir el dataset de test y crear el archivo a enviar

In [20]:
# Same preprocessing as the training data: lower-cased title and lower-cased
# synopsis joined by one space. Missing values are replaced with "" first.
# Without that, a NaN in either column turns the whole concatenation into NaN,
# and NaN is not a valid text document for the fitted vectorizer.
df_test["text"] = (
    df_test["movie_name"].fillna("").str.lower()
    + " "
    + df_test["synopsis"].fillna("").str.lower()
)
df_test.head()

Unnamed: 0,id,movie_name,synopsis,genre,text
0,16863,A Death Sentence,"12 y.o. Ida's dad'll die without a DKK1,500,00...",action,a death sentence 12 y.o. ida's dad'll die with...
1,48456,Intermedio,A group of four teenage friends become trapped...,action,intermedio a group of four teenage friends bec...
2,41383,30 Chua Phai Tet,A guy left his home for 12 years till he came ...,action,30 chua phai tet a guy left his home for 12 ye...
3,84007,Paranoiac,A man long believed dead returns to the family...,action,paranoiac a man long believed dead returns to ...
4,40269,Ordinary Happiness,"After a deadly accident, Paolo comes back on E...",action,"ordinary happiness after a deadly accident, pa..."


In [21]:
# Encode the test texts with the already-fitted vectorizer (no refit here).
X_test = vectorizer.transform(df_test["text"])

In [22]:
# Predicted genre label for every row of the encoded test set.
pred = clf.predict(X_test)

In [23]:
# Submission table: the test ids plus one predicted genre per row.
df_submission = df_test[["id"]].copy()
df_submission["genre"] = pred

In [24]:
# Write the submission to a CSV file in the working directory, without the
# DataFrame index column.
df_submission.to_csv("submission2.1.csv", index=False)

In [31]:
# Per-class probability estimates for every test row (one column per class).
test_probs = clf.predict_proba(X_test)

In [32]:
# Probabilities predicted for the first test row, one value per class.
test_probs[0]

array([0.12604317, 0.09757201, 0.16063046, 0.13848384, 0.07640207,
       0.0736028 , 0.07532464, 0.08137083, 0.07549634, 0.09507384])

In [33]:
# Class labels known to the classifier; the export cell below pairs the label at
# position j with column j of test_probs.
clf.classes_

array(['action', 'adventure', 'crime', 'family', 'fantasy', 'horror',
       'mystery', 'romance', 'scifi', 'thriller'], dtype='<U9')

In [34]:
# Export the predicted probabilities: the movie id followed by one column per
# class, where the label at position i of clf.classes_ names column i of
# test_probs.
class_columns = {
    label: test_probs[:, col] for col, label in enumerate(clf.classes_)
}
df_probs = pd.DataFrame({"id": df_test["id"], **class_columns})
df_probs.to_csv("multinomialnb_probs.csv", index=False)

# Fin: [Volver al contenido del curso](https://www.freecodingtour.com/cursos/espanol/deeplearning/deeplearning.html)