# ¿Qué hemos hecho?

- Lanzamos cluster en aws
- Nos conectamos por consola y ssh al cluster desde el pc personal
- Instalamos jupyterhub y clonamos el [repo](https://github.com/camilaMejia/trabajoFinal) del proyecto con este notebook listo. (En el GitHub hay un archivo llamado launch.txt donde están todas las instrucciones que se lanzan por comando)
- Instalamos y cargamos todas las librerías necesarias.
- Nos traemos el .dat y el .csv desde S3 al almacenamiento local
- Creamos el índice invertido usando metapy
- Hacemos querying usando BM25
- Se hace un LDA con todas las noticias (solo content + title)
- Para cada noticia vemos cuál es el tópico dominante

# Con respecto a la entrega

- Almacenamiento y Cluster de procesamiento en SparkML/Meta/NLTK en Amazon AWS : Check (Peso 40%)
- Indexación, búsqueda y recuperación con META : Check (Peso 40%)
- Modelado de tópicos: Check (Peso 10%)
- Análisis de sentimientos: Pendiente (Peso 10%)

En general estamos al 90% de ejecución

## Instalar librerías y complementos

In [None]:
! pip install pandas
! pip install pyspark
! pip install metapy
! pip install boto3
!pip install nltk
!pip install numpy
!pip install re
!pip install codecs
!pip install matplotlib

# Cargar librerías

In [None]:
import pandas as pd
import pyspark
from pyspark.sql import SQLContext
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel
import metapy
import requests, zipfile, io, os, boto3

import nltk
import pandas as pd
import numpy as np
import re
import codecs

from nltk.corpus import stopwords



# Fetch the NLTK resources this notebook relies on: the Punkt models used by
# word_tokenize, the stop-word lists, and the WordNet corpus that
# WordNetLemmatizer reads (lemmatizing without it raises LookupError).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# English stop words as a set, for constant-time membership checks.
stop_words_nltk = set(stopwords.words('english'))

from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA, BisectingKMeans
from pyspark.sql.functions import monotonically_increasing_id
import re
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

from pyspark.sql.types import StringType

# Reuse an already-running session when this cell is executed again;
# constructing a second SparkContext in the same kernel raises a ValueError.
spark = (
    SparkSession.builder
    .master('local')
    .appName("app-topic-detection")
    .getOrCreate()
)
sc = spark.sparkContext

# Cargar datos necesarios 

In [None]:
# Download the corpus from the 'finaltext' S3 bucket and derive the CSV
# extracts that the Spark cells read.
s3 = boto3.client('s3', region_name='us-east-1')

# The target folder has to exist before open(..., 'wb') can create the file.
os.makedirs('data/news', exist_ok=True)
with open('data/news/news.dat', 'wb') as f:
    s3.download_fileobj('finaltext', 'news.dat', f)

obj = s3.get_object(Bucket='finaltext', Key=u'news.csv')
df = pd.read_csv(obj['Body'])

# Join title and content with a space so the last word of the title is not
# glued to the first word of the body, and treat missing values as empty text
# so a single NaN field does not blank out the whole row.
df['all'] = df['title'].fillna('') + ' ' + df['content'].fillna('')

# The DataFrame index is written as the first CSV column, followed by 'all'.
df2 = df[['all']]
df2.to_csv('aux.csv')              # full corpus
df2.head(1000).to_csv('mini.csv')  # first 1000 rows only

## Inverted index using metapy


In [None]:
# Uncomment to delete the 'news-idx' directory before the index is made again:
#!rm -rf news-idx
# Make the inverted index from the settings in miniconfig.toml.
idx = metapy.index.make_inverted_index('miniconfig.toml')


# IR: Queries

In [None]:
# Score the indexed documents against a free-text query with Okapi BM25 and
# display the title and content of the five best matches.
query = metapy.index.Document()
query.content('Trump hates china')
ranker = metapy.index.OkapiBM25()
top_docs = ranker.score(idx, query, num_results=5)

# The first element of each result tuple is used as the row label in df.
index = [hit[0] for hit in top_docs]
df.loc[index, ['title', 'content']]


# LDA on spark

### Preprocess data

Here we load the data into Spark and apply some preprocessing to the text.

In [None]:
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# To process the full corpus, load 'aux.csv' instead of 'mini.csv'.
# The file was written by pandas, which quotes fields containing commas or
# line breaks and escapes embedded quotes by doubling them. multiLine=True and
# escape='"' let Spark read such a field as one value instead of splitting an
# article across several rows.
rawdata = spark.read.load(
    "mini.csv",
    format="csv",
    header=True,
    multiLine=True,
    escape='"',
)

# Column.cast returns a new column expression, so the result has to be
# assigned back for the cast to take effect.
rawdata = rawdata.withColumn("all", rawdata["all"].cast(StringType()))



def cleanup_text(record):
    """Turn one row of the news table into a list of normalised tokens.

    Args:
        record: Indexable row whose second element (``record[1]``) is the raw
            text. The first element is the row id and is not used here.

    Returns:
        list of str: lower-cased alphanumeric tokens longer than one
        character, with stop words removed, lemmatized as verb, adjective and
        noun (in that order) and finally Lancaster-stemmed. An empty list is
        returned when the text is missing or empty.
    """
    text = record[1]
    if not text:
        # Rows without text arrive as None (or ""); there is nothing to tokenize.
        return []

    # NLTK English stop words plus a custom list - add your own here.
    stopwords_custom = ['a', u'about', u'above', u'after']
    sw = set(stopwords.words('english')).union(stopwords_custom)

    # Keep only letters and digits, then lowercase.
    normalised = (
        re.sub(r'[^A-Za-z0-9]+', '', raw_token).lower()
        for raw_token in word_tokenize(text)
    )
    # Drop empty/single-character tokens and stop words.
    tokens = [w for w in normalised if len(w) > 1 and w not in sw]

    wnl = WordNetLemmatizer()
    stemmer = LancasterStemmer()
    result = []
    for token in tokens:
        # Lemmatize as verb, then adjective, then noun, before stemming.
        for pos in ('v', 'a', 'n'):
            token = wnl.lemmatize(token, pos=pos)
        result.append(stemmer.stem(token))
    return result

# Wrap cleanup_text as a Spark UDF that returns array<string>. All columns are
# packed into a single struct so the function receives the whole row and can
# pick its fields by position.
udf_cleantext = udf(cleanup_text, ArrayType(StringType()))
clean_text = rawdata.withColumn(
    "words",
    udf_cleantext(struct(*[rawdata[name] for name in rawdata.columns])),
)

### Embeddings + LDA

Here we build the feature vector of each document and then fit the LDA model with k topics.

In [None]:
# Term-frequency vectorization with CountVectorizer: the "words" column is
# turned into count vectors in "rawFeatures", with vocabSize set to 1000.
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize = 1000)
cvmodel = cv.fit(clean_text)
featurizedData = cvmodel.transform(clean_text)

# Broadcast the fitted vocabulary; map_termID_to_Word looks term ids up in it.
vocab = cvmodel.vocabulary
vocab_broadcast = sc.broadcast(vocab)

# Fit an IDF model on the raw counts and write the rescaled vectors to "features".
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Fit LDA with k=5 topics (EM optimizer, fixed seed) on the "features" column.
lda = LDA(k=5, seed=123, optimizer="em", featuresCol="features")

ldamodel = lda.fit(rescaledData)

# Optional inspection of the fitted model:
#ldamodel.isDistributed()
#ldamodel.vocabSize()

# Topic descriptions; the termIndices column is mapped back to words afterwards.
ldatopics = ldamodel.describeTopics()
#ldatopics.show(25)

def map_termID_to_Word(termIndices):
    """Return the vocabulary words for a sequence of term ids.

    Each id is used as a position in the broadcast vocabulary
    (``vocab_broadcast.value``); the output keeps the input order.
    """
    return [vocab_broadcast.value[term_id] for term_id in termIndices]


# Spark UDF wrapper, used to add a readable "topic_desc" column to the topics.
udf_map_termID_to_Word = udf(map_termID_to_Word, ArrayType(StringType()))
ldatopics_mapped = ldatopics.withColumn(
    "topic_desc",
    udf_map_termID_to_Word(ldatopics["termIndices"]),
)

### Show topics

In [None]:
# Print up to 50 topics with their word lists, without truncating the cells.
ldatopics_mapped.select("topic", "topic_desc").show(n=50, truncate=False)

### Add detected topic to each line

In [None]:
# Run the fitted LDA model over the documents and preview the result,
# including the "topicDistribution" column.
ldaResults = ldamodel.transform(rescaledData)

(
    ldaResults
    .select(['all', 'words', 'features', 'topicDistribution'])
    .show()
)

### Add principal topic to each line

In [None]:
from pyspark.sql.types import IntegerType


def foo(topicDistribution):
    """Return the position of the largest value in ``topicDistribution``.

    When several entries share the maximum, the first one wins. An empty
    input raises ``IndexError``.
    """
    best_index, best_value = 0, topicDistribution[0]
    for position, value in enumerate(topicDistribution):
        # Strict comparison keeps the earliest position on ties.
        if value > best_value:
            best_index, best_value = position, value
    return best_index

# Register foo as an integer-returning UDF and store, for every row, the
# position of the largest entry of "topicDistribution" in "topic_prin".
udf_seltop = udf(foo, IntegerType())
aaa = ldaResults.withColumn(
    "topic_prin",
    udf_seltop(ldaResults["topicDistribution"]),
)

aaa.select(['all', 'topic_prin']).show()

# Sentiment Analysis