#Open Library Analysis - Big Data Computing Project
####Graph's Connected Components vs. k-means Clusters

##Libraries

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

##Data Acquisition

short dataset: 50000 entries

In [0]:
%sh wget -P /tmp https://raw.githubusercontent.com/attennig/BDC_datasets/main/books_short.csv
# Downloads books_short.csv from GitHub into the local /tmp directory (-P sets the target directory).

In [0]:
# Move the downloaded CSV from the local /tmp directory into the project's DBFS datasets folder.
dbutils.fs.mv("file:/tmp/books_short.csv", "dbfs:/bdc-2020-21/datasets/books_short.csv")

long dataset: 22.589.356 entries 

remember to download it only if it is necessary

In [0]:
%sh wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1qIhBSrpkDc-RCdbw7e1NVtNhOj_fNi5G' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1qIhBSrpkDc-RCdbw7e1NVtNhOj_fNi5G" -O /tmp/books_long.csv.bz2 && rm -rf /tmp/cookies.txt
# How the command above works:
#   1. The inner wget requests the Google Drive download page for the file id, stores the session
#      cookies in /tmp/cookies.txt and writes the page to stdout (-O-).
#   2. sed extracts the value of the `confirm=` token from that page.
#   3. The outer wget reuses the cookies together with the token and saves the file as
#      /tmp/books_long.csv.bz2.
#   4. If the download succeeds (&&), the temporary cookie file is deleted.
# Note: the inner request is made with --no-check-certificate, i.e. without TLS certificate validation.

In [0]:
# List the local /tmp directory to check that the download is there.
dbutils.fs.ls("file:/tmp")

In [0]:
# Move the compressed long dataset from the local /tmp directory into the project's DBFS datasets folder.
dbutils.fs.mv("file:/tmp/books_long.csv.bz2", "dbfs:/bdc-2020-21/datasets/books_long.csv.bz2")

In [0]:
# List the DBFS datasets folder to see which dataset files are available.
dbutils.fs.ls("dbfs:/bdc-2020-21/datasets")

In [0]:
# Load the semicolon-separated dataset into a Spark DataFrame.
# The first line of the file is the header and column types are inferred from the data.
books_df = (
    spark.read
    .format("csv")
    .option("sep", ";")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("dbfs:/bdc-2020-21/datasets/books_short.csv")
)

In [0]:
# Number of rows in the raw dataset; reused later as the denominator of the null ratios.
tot_entries = books_df.count()
print(f"The shape of the dataset is {tot_entries:d} rows by {len(books_df.columns):d} columns")

##Data Cleaning

In [0]:
# Find the columns that are mostly empty: they will be removed from the dataset.
# A column is a drop candidate when more than MAX_NULL_FRACTION of its values are null.
MAX_NULL_FRACTION = 0.7

# Count the nulls of every column in a single aggregation (one Spark job overall)
# instead of running a separate filter + count job per column.
# Positional aliases are used so that the result does not depend on how columns are named.
null_counts = books_df.select(
    [
        count(when(col(c).isNull(), 1)).alias("nulls_{:d}".format(i))
        for i, c in enumerate(books_df.columns)
    ]
).first()

columns_to_drop = [
    c
    for c, n_nulls in zip(books_df.columns, null_counts)
    # an empty dataset (tot_entries == 0) has no ratio to compute and nothing to drop
    if tot_entries > 0 and n_nulls / tot_entries > MAX_NULL_FRACTION
]

In [0]:
# Display the names of the columns selected for removal.
columns_to_drop

In [0]:
# Remove the columns collected in columns_to_drop.
books_df = books_df.drop(*columns_to_drop)

In [0]:
# Both identifying columns must still be present after the column removal above.
assert {"title", "key"} <= set(books_df.columns)
books_df = (
    books_df
    # discard rows where the key or the title is missing
    .na.drop(how="any", subset=["key", "title"])
    # keep a single row per key, then a single row per title
    .dropDuplicates(["key"])
    .dropDuplicates(["title"])
)

In [0]:
# The two columns used later must be present before their missing values are filled.
assert all(name in books_df.columns for name in ("subjects", "authors"))
# Replace missing subjects / authors with the placeholder 'unknown'.
books_df = books_df.fillna({'subjects': 'unknown', 'authors': 'unknown'})

In [0]:
from pyspark.sql.functions import monotonically_increasing_id

# Add a unique numeric "id" column; all the existing columns are kept.
#
# monotonically_increasing_id() is non-deterministic: the value depends on the partition a row
# ends up in and on its position inside that partition. Because DataFrames are evaluated lazily,
# every action on books_df (or on a DataFrame derived from it) would recompute the ids, and the
# same book could receive a different id each time. Caching the DataFrame and materialising it
# immediately assigns the ids once, so that everything derived from books_df sees the same values.
# Caveat: Spark's cache is best-effort; partitions evicted from it are recomputed.
books_df = books_df.withColumn("id", monotonically_increasing_id()).cache()

# Action that fills the cache; the displayed value is the number of rows left after cleaning.
books_df.count()

In [0]:
# Print summary statistics of the cleaned dataset's columns.
books_df.describe().show()

In [0]:
# Run this cell only if nltk is not found.
# NOTE(review): `!` commands are shell commands run from the notebook; verify that nltk also
# ends up on the workers, since the stemming UDF in clean_text is executed there.
!python -m pip install nltk

# Upgrade pip itself.
!python -m pip install --upgrade pip

# Download every NLTK data package.
# NOTE(review): presumably far more than this notebook needs (only SnowballStemmer is used
# in the visible cells) - confirm whether the download can be narrowed or removed.
!python -m nltk.downloader all

In [0]:
# From Document_Clustering.ipynb
def clean_text(df, column_name):
    """
    Apply a standard NLP preprocessing pipeline to one text column of a Spark DataFrame.

    Steps:
      - Text cleaning: lower-casing, trimming, removal of every character that is neither an
        ASCII letter nor whitespace, collapsing of whitespace runs into a single space
      - Tokenization
      - Stopwords removal (StopWordsRemover with its default settings)
      - Stemming (Snowball stemmer, English); empty tokens are skipped

    parameters:
      df: Spark DataFrame containing an `id` column and the column `column_name`
      column_name: name of the string column to preprocess
    returns: a cached DataFrame with the columns `id`, `column_name` (the cleaned text),
      `tokens`, `terms` (tokens without stopwords) and `terms_stemmed` (stemmed terms).
      The other columns of `df` are NOT carried over; use `id` to join the result back.
    """
    from pyspark.sql.functions import udf, col, lower, trim, regexp_replace
    from pyspark.sql.types import ArrayType, StringType
    from pyspark.ml.feature import Tokenizer, StopWordsRemover
    from nltk.stem.snowball import SnowballStemmer # BE SURE NLTK IS INSTALLED ON THE CLUSTER USING THE "LIBRARIES" TAB IN THE MENU

    # Text preprocessing pipeline
    # 1. Text cleaning
    # 1.a Case normalization and 1.b trimming
    text = trim(lower(col(column_name)))
    # 1.c Filter out punctuation symbols (anything that is not a letter or whitespace)
    text = regexp_replace(text, "[^a-zA-Z\\s]", "")
    # 1.d Collapse every run of whitespace into one space. Tabs and newlines are included,
    #     otherwise the whitespace-splitting Tokenizer below would emit empty tokens.
    text = trim(regexp_replace(text, "\\s+", " "))
    cleaned_df = df.select("id", text.alias(column_name))

    # 2. Tokenization (split text into tokens)
    tokenizer = Tokenizer(inputCol=column_name, outputCol="tokens")
    tokens_df = tokenizer.transform(cleaned_df)

    # 3. Stopwords removal
    stopwords_remover = StopWordsRemover(inputCol="tokens", outputCol="terms")
    terms_df = stopwords_remover.transform(tokens_df)

    # 4. Stemming (Snowball stemmer)
    stemmer = SnowballStemmer(language="english")

    def stem_tokens(tokens):
        # A null array stays null instead of raising inside the UDF.
        if tokens is None:
            return None
        # Text that is empty after cleaning is tokenized to a single empty token:
        # skip it so that it is not treated as a word downstream.
        return [stemmer.stem(token) for token in tokens if token]

    stemmer_udf = udf(stem_tokens, ArrayType(StringType()))

    # Only the returned DataFrame is cached: caching the intermediate steps as well would
    # keep several copies of the same data in memory.
    terms_stemmed_df = terms_df.withColumn("terms_stemmed", stemmer_udf("terms")).cache()

    return terms_stemmed_df

In [0]:
# Preprocess the book titles.
clean_title_df = clean_text(df=books_df, column_name="title")

In [0]:
# Preprocess the book subjects.
clean_subjects_df = clean_text(df=books_df, column_name="subjects")

##Feature Engineering

In this section we will use NLP techniques to turn the text-based features into numerical vectors.

In [0]:
# final df
# |id|w2vec(clean_title)|w2vec(clean_subject)|authors|

In [0]:
RANDOM_SEED = 42 # seed passed to Word2Vec below; also meant for the K-means clustering step
EMBEDDING_SIZE = 150 # size of embedding Word2Vec vectors

In [0]:
#Word2Vec from Document_Clustering.ipynb 
def extract_w2v_features(df, column_name, out_col_name):
  """
  Fit a Word2Vec model on a column of token arrays and embed every row with it.

  The embedding size is EMBEDDING_SIZE, the seed is RANDOM_SEED and minCount is 5.

  parameters:
    df: Spark DataFrame holding the token arrays
    column_name: name of the input column
    out_col_name: name of the output column that receives the vectors
  returns: the tuple (fitted model, cached DataFrame equal to `df` plus `out_col_name`)
  """
  from pyspark.ml.feature import Word2Vec

  estimator = Word2Vec(
    inputCol=column_name,
    outputCol=out_col_name,
    vectorSize=EMBEDDING_SIZE,
    minCount=5,
    seed=RANDOM_SEED,
  )
  fitted_model = estimator.fit(df)
  embedded_df = fitted_model.transform(df).cache()

  return fitted_model, embedded_df

In [0]:
# The title model gets its own name so that fitting another Word2Vec model cannot overwrite it.
w2v_title_model, w2v_title_features = extract_w2v_features(clean_title_df, "terms_stemmed", "title_vec")

In [0]:
# The subjects model gets a descriptive name of its own.
w2v_subjects_model, w2v_subjects_features = extract_w2v_features(clean_subjects_df, "terms_stemmed", "subjects_vec")
# Generic alias kept so that code relying on the old name `model` keeps working.
model = w2v_subjects_model

In [0]:
# The ids shown by the two DataFrames are different: this is a problem!
# NOTE(review): presumably the ids are re-generated whenever books_df is re-evaluated
# (monotonically_increasing_id is non-deterministic) - confirm.
books_df.show()
w2v_title_features.show()

In [0]:
# Final dataframe: start from the id and the authors of every book
engineered_books_df = books_df.select("id", "authors")
# TODO: add "title_vec" and "subjects_vec"

In [0]:
#engineered_books_df = engineered_books_df.withColumn(w2v_title_features["title_vec"])

In [0]:
#engineered_books_df = engineered_books_df.withColumn("subjects_vec", w2v_subjects_features["features"])

##Graph

In [0]:
# see the PageRank notebook
# create a df with an id column for the nodes
# create a df with src and dst columns for the edges
# def sim(u,v)
# edges must be added in both directions for the entries that satisfy sim(u,v) > eps
# build the graph
# get the connected components (CC) of the graph


##Clustering

In [0]:
# from Document_Clustering.ipynb and the other clustering notebook
# k = nCC (number of connected components of the graph), k-means

##Evaluation