# NLP - HW7
### Miguel Bonilla

In [1]:
import re
from collections import Counter

import contractions
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from requests import get
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import BisectingKMeans
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

- [1. Clustering with K-Means](#1.-Clustering-with-K-Means)
- [2. Characterize Each Cluster](#2.-Characterize-Each-Cluster)

Cluster the reviews that you collected in homework 5, by doing the following:  
1. In Python, select any one of the clustering methods covered in this course. Run it over the
collection of reviews, and show at least two different ways of clustering the reviews, e.g.,
changing k in k-Means clustering or changing where you “cut” in Agnes or Diana.  
2. Try to write a short phrase to characterize (give a natural interpretation of) what each
cluster is generally centered on semantically. Is this hard to do in some cases? If so, make
note of that fact.  
3. Explain which of the two clustering results from question 1 is preferable (if one of them is),
and why.  
Submit all of your inputs and outputs and your code for this assignment, along with a brief written
explanation of your findings.

### 1. Clustering with K-Means
#### a. Loading and Normalizing Data

In [2]:
# Browser-like User-Agent sent with every request; IMDB rejects requests without one.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50',
}

In [3]:
## Static table of review URLs (built in HW5), read directly from GitHub.
## Later cells access its `url`, `movie` and `review` columns.
url_list = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/boneeyah/DS7337/main/mb_hw5_urls.csv"
)

In [4]:
# Walks the table of URLs, downloads each page and extracts the main review text.
# Returns a dataframe with the movie title, the review id and the raw review text
# (no tokenization happens here; that is done later during normalization).
def grab_review(links_table):
    """Download the main review text for every URL in ``links_table``.

    Parameters
    ----------
    links_table : pandas.DataFrame
        Table exposing ``url``, ``movie`` and ``review`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie`` and ``review`` taken from ``links_table`` plus
        ``text``, the text of the first element on each page whose class is
        ``text show-more__control``.

    Raises
    ------
    AttributeError
        If a page contains no matching element (``find`` returns ``None``).

    Notes
    -----
    Uses the notebook-level ``headers`` dict for the HTTP request headers.
    """
    text = []
    # Iterate over the URL values directly; label-indexing with ``url[i]`` only
    # works when the table has a default 0..n-1 index.
    for url in links_table.url:
        # ``headers`` must be passed by keyword: the second positional parameter
        # of ``requests.get`` is ``params`` (the query string), so passing it
        # positionally never sent the User-Agent as an HTTP header.
        review = get(url, headers=headers)
        review_soup = BeautifulSoup(review.content, 'html.parser')
        text.append(review_soup.find(class_='text show-more__control').text)
    return pd.DataFrame({'movie': links_table.movie,
                         'review': links_table.review,
                         'text': text})

In [5]:
review_text = grab_review(url_list)

In [169]:
# Punctuation and symbol tokens to drop.
special = [
    '\x96', ':', ',', '-', '(', ')', '[', ']', '–', '/', '#', '``', ';',
    '.', '&', '"', "''", '?', '!', '....', '--', '...', '*', '..', "'",
]
# NLTK English stop words plus corpus-specific words (generic terms and words
# from the movie titles).
stop_words = [
    *nltk.corpus.stopwords.words('english'),
    'movie', 'film', 'horror', 'thing', 'quiet', 'place', 'alien', 'covenant', 'shining', 'films',
]
# Final removal list: stop words, then contraction fragments, then the
# punctuation defined above.
special = [
    *stop_words,
    "'s", "'t", "'d", "'ll", "'m", "'re", "'ve", "n't",
    *special,
]

In [170]:
def normalize_list(review_list):
    """Lower-case, tokenize and filter every review in ``review_list``.

    Each review is lower-cased and split with ``word_tokenize``. Tokens that
    appear in the notebook-level ``special`` list are discarded and the
    survivors are re-joined with single spaces.

    Returns a NumPy array holding one normalized string per input review.
    """
    tokenized = [word_tokenize(doc.lower()) for doc in review_list]
    cleaned = [
        ' '.join(tok for tok in tokenized[pos] if tok not in special)
        for pos in range(len(review_list))
    ]
    return np.array(cleaned)

In [171]:
# Normalized corpus: one cleaned string per scraped review.
norm_corpus = normalize_list(review_text['text'])

#### b. K-Means with K = 6

In [172]:
# Bag-of-words counts over unigrams and bigrams. min_df=25 keeps terms found in
# at least 25 reviews; max_df=.8 drops terms found in more than 80% of them.
cv = CountVectorizer(
    ngram_range=(1, 2),
    min_df=25,
    max_df=.8,
)
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix.shape

(100, 40)

In [173]:
# First clustering: k-means with K = 6 on the term-count matrix.
# random_state fixes the seed so the 500 initialisations are reproducible.
NUM_Clusters = 6
km = KMeans(n_clusters=NUM_Clusters, max_iter=1000, n_init=500, random_state=326).fit(cv_matrix)
km

In [174]:
# Number of reviews assigned to each K = 6 cluster label.
Counter(km.labels_)

Counter({2: 7, 3: 51, 0: 23, 1: 8, 4: 3, 5: 8})

In [175]:
# Attach each review's K = 6 cluster label as a new column.
# NOTE(review): this modifies review_text in place, so the later
# `review_text.copy(deep=True)` used for K = 3 starts from this labelled frame
# and overwrites the column there.
review_text['kmeans_cluster'] = km.labels_

In [176]:
# Movie, review id and K = 6 label for each review, highest cluster label first.
movie_clusters = (
    review_text[['movie', 'review', 'kmeans_cluster']]
    .sort_values(by='kmeans_cluster', ascending=False)
)

In [177]:
# Summarise each K = 6 cluster: its highest-weighted centroid terms and how many
# reviews of each movie it contains.
feature_names = cv.get_feature_names_out()
topn_features = 10
# argsort is ascending, so reverse every row to rank terms by descending centroid weight.
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]
# Take the cluster count from the fitted model rather than the global
# NUM_Clusters, which a later cell rebinds to 3; this keeps the cell correct
# when it is re-run after the K = 3 section.
for cluster_num in range(km.n_clusters):
    key_features = [feature_names[index]
                    for index in ordered_centroids[cluster_num, :topn_features]]
    in_cluster = movie_clusters['kmeans_cluster'] == cluster_num
    movies = movie_clusters[in_cluster]['movie'].value_counts().to_string()
    print(f'CLUSTER#{cluster_num + 1}')
    print('Key Features:', key_features)
    print('Movies:\n', movies, sep='')
    print('---------------------')

CLUSTER#1
Key Features: ['one', 'time', 'seen', 'see', 'the', 'best', 'ever', 'good', 'well', 'world']
Movies:
The_Shining      11
The_Thing         8
A_Quiet_Place     4
---------------------
CLUSTER#2
Key Features: ['like', 'even', 'story', 'movies', 'make', 'one', 'would', 'making', 'never', 'characters']
Movies:
Alien_Covenant    3
The_Shining       2
A_Quiet_Place     2
The_Thing         1
---------------------
CLUSTER#3
Key Features: ['one', 'much', 'best', 'story', 'also', 'well', 'way', 'like', 'made', 'even']
Movies:
The_Thing         3
The_Shining       2
Alien_Covenant    1
A_Quiet_Place     1
---------------------
CLUSTER#4
Key Features: ['get', 'even', 'one', 'characters', 'like', 'time', 'good', 'well', 'see', 'much']
Movies:
Alien_Covenant    19
A_Quiet_Place     12
The_Thing         11
The_Shining        9
---------------------
CLUSTER#5
Key Features: ['also', 'the', 'like', 'would', 'story', 'one', 'make', 'could', 'even', 'many']
Movies:
The_Thing      2
The_Shining  

#### c. K-Means with K = 3

In [188]:
# Second clustering: same settings and seed as before, but with K = 3.
NUM_Clusters = 3
km2 = KMeans(
    n_clusters=NUM_Clusters,
    n_init=500,
    max_iter=1000,
    random_state=326,
)
km2.fit(cv_matrix)  # fit() returns the estimator itself, so km2 is the fitted model
km2

In [189]:
# Number of reviews assigned to each K = 3 cluster label.
Counter(km2.labels_)

Counter({0: 10, 2: 57, 1: 33})

In [190]:
# Independent copy of the reviews carrying the K = 3 labels, so the K = 6 labels
# stored on review_text are left untouched.
df = review_text.copy(deep=True).assign(kmeans_cluster=km2.labels_)

In [191]:
# Movie, review id and K = 3 label for each review, highest cluster label first.
movie_clusters2 = (
    df[['movie', 'review', 'kmeans_cluster']]
    .sort_values(by='kmeans_cluster', ascending=False)
)

In [192]:
# Summarise each K = 3 cluster: its highest-weighted centroid terms and how many
# reviews of each movie it contains. Reuses feature_names from the K = 6 section.
topn_features = 10
# argsort is ascending, so reverse every row to rank terms by descending centroid weight.
ordered_centroids2 = km2.cluster_centers_.argsort()[:, ::-1]
# Take the cluster count from the fitted model instead of the mutable global
# NUM_Clusters, so the loop always matches km2 regardless of cell run order.
for cluster_num in range(km2.n_clusters):
    key_features = [feature_names[index]
                    for index in ordered_centroids2[cluster_num, :topn_features]]
    in_cluster = movie_clusters2['kmeans_cluster'] == cluster_num
    movies = movie_clusters2[in_cluster]['movie'].value_counts().to_string()
    print(f'CLUSTER#{cluster_num + 1}')
    print('Key Features:', key_features)
    print('Movies:\n', movies, sep='')
    print('---------------------')

CLUSTER#1
Key Features: ['one', 'also', 'story', 'time', 'like', 'much', 'could', 'the', 'even', 'well']
Movies:
The_Thing         5
The_Shining       3
Alien_Covenant    2
---------------------
CLUSTER#2
Key Features: ['one', 'like', 'would', 'time', 'well', 'the', 'good', 'best', 'seen', 'even']
Movies:
The_Shining       12
A_Quiet_Place     10
The_Thing          8
Alien_Covenant     3
---------------------
CLUSTER#3
Key Features: ['get', 'like', 'even', 'one', 'characters', 'good', 'would', 'time', 'see', 'well']
Movies:
Alien_Covenant    20
A_Quiet_Place     15
The_Thing         12
The_Shining       10
---------------------


### 2. Characterize Each Cluster

In [32]:
# Alternative method for comparison: bisecting k-means with 3 clusters on the
# same count matrix, followed by the number of reviews per cluster label.
bisect_means = BisectingKMeans(n_clusters=3, random_state=328).fit(cv_matrix)

Counter(bisect_means.labels_)

Counter({0: 11, 2: 75, 1: 14})

In [33]:
# Alternative method for comparison: agglomerative clustering into 2 clusters.
# The sparse count matrix is converted to a dense array before fitting.
agglo = AgglomerativeClustering(n_clusters=2, compute_full_tree=True).fit(cv_matrix.toarray())
Counter(agglo.labels_)

Counter({0: 15, 1: 85})