## Lectura de archivo

In [1]:
#@title Conectar con el drive
from google.colab import drive 
drive.mount('/content/drive') 

Mounted at /content/drive


In [2]:
#@title Librería y lectura de archivo
import pandas as pd # Importo la librería
path = "/content/drive/MyDrive/csv/Movies_Dataset.csv"  # Defino la ruta
df = pd.read_csv(path) # Leo el archivo
df.describe

<bound method NDFrame.describe of           id  ...                                           overview
0          0  ...  Led by Woody, Andy's toys live happily in his ...
1          1  ...  When siblings Judy and Peter discover an encha...
2          2  ...  A family wedding reignites the ancient feud be...
3          3  ...  Cheated on, mistreated and stepped on, the wom...
4          4  ...  Just when George Banks has recovered from his ...
...      ...  ...                                                ...
45461  45461  ...        Rising and falling between a man and woman.
45462  45462  ...  An artist struggles to finish his work while a...
45463  45463  ...  When one of her hits goes wrong, a professiona...
45464  45464  ...  In a small town live two brothers, one a minis...
45465  45465  ...  50 years after decriminalisation of homosexual...

[45466 rows x 3 columns]>

In [None]:
#@title Un poco mas de info del dataset
df.info()

In [3]:
#@title Preprocesamiento de datos
# Module can be used to extract features in a format supported by machine learning algorithms from datasets consisting of formats such as text and image.
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.cluster import KMeans # Librería KMeans

documents = df['overview'].values.astype("U") # Cambio el formato a Unicode para que quede estandarizado
vectorizer = TfidfVectorizer(stop_words='english')
features = vectorizer.fit_transform(documents)

k = 20 #Eligió un número aproximado para probar
model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
model.fit(features)

df['cluster'] = model.labels_ # Crea una columna con las etiquetas que el modelo arrojó

df.head()

Unnamed: 0,id,title,overview,cluster
0,0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",2
1,1,Jumanji,When siblings Judy and Peter discover an encha...,4
2,2,Grumpier Old Men,A family wedding reignites the ancient feud be...,17
3,3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",13
4,4,Father of the Bride Part II,Just when George Banks has recovered from his ...,2


In [5]:
clusters = df.groupby('cluster')   
clusters.head()

Unnamed: 0,id,title,overview,cluster
0,0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",2
1,1,Jumanji,When siblings Judy and Peter discover an encha...,4
2,2,Grumpier Old Men,A family wedding reignites the ancient feud be...,17
3,3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",13
4,4,Father of the Bride Part II,Just when George Banks has recovered from his ...,2
...,...,...,...,...
641,641,The Superwife,,9
800,800,Kaspar Hauser,No overview found.,8
972,972,L'associ̩,No overview found.,8
1613,1613,Alien Escape,No overview found.,8


In [6]:
for cluster in clusters.groups:
    f = open('cluster'+str(cluster)+ '.csv', 'w') # Crea un archivo
    data = clusters.get_group(cluster)[['title','overview']] # Obtiene los datos del título y del resumen
    f.write(data.to_csv(index_label='id')) # Setea el index con el id
    f.close()

print("Cluster centroids: \n")
order_centroids = model.cluster_centers_.argsort()[:, ::-1] # Acá arroja el centroide
terms = vectorizer.get_feature_names() # Usa la vectorización para obtener los feature names

for i in range(k): # Acá los recorre y los imprime. Del 1 al 20 (k)
    print("Cluster %d:" % i)
    for j in order_centroids[i, :10]: #print out 10 feature terms of each cluster
        print (' %s' % terms[j])
    print('------------')

Cluster centroids: 

Cluster 0:
 new
 york
 city
 life
 young
 love
 home
 world
 years
 finds
------------
Cluster 1:
 police
 killer
 detective
 serial
 murder
 officer
 case
 cop
 investigation
 crime
------------
Cluster 2:
 life
 love
 young
 time
 father
 wife
 friends
 world
 day
 years
------------
Cluster 3:
 school
 high
 students
 teacher
 student
 friends
 girls
 girl
 new
 life
------------
Cluster 4:
 world
 earth
 evil
 gang
 help
 agent
 secret
 murder
 army
 mission
------------
Cluster 5:
 team
 los
 angeles
 world
 coach
 football
 new
 way
 time
 league
------------
Cluster 6:
 husband
 woman
 wife
 young
 life
 married
 marriage
 son
 mother
 finds
------------
Cluster 7:
 old
 year
 boy
 life
 father
 mother
 girl
 years
 son
 family
------------
Cluster 8:
 overview
 available
 movie
 plot
 freaks
 freakishly
 freakish
 freaking
 freaked
 freak
------------
Cluster 9:
 nan
 ݣ1890
 frazier
 fraw
 fray
 frayed
 fraying
 frayn
 frazer
 frazzled
------------
Cluster 

