<a href="https://colab.research.google.com/github/abuwildanm/Text-Mining/blob/master/Texmin_1_Information_Retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tugas Text Mining

Nama : Abu Wildan Mucholladin

NIM : 165150200111002

In [0]:
# Import library standard
import numpy as np
import pandas as pd
import re

# Import library for indonesian stemming
!pip install Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory



## Parsing

In [0]:
# Document
d1 = 'Kupu-kupu terbang di atas pohon'
d2 = 'Dia terbang sambil mencari pohon untuk bertelur'
d3 = 'Pohon tempat kupu-kupu bertelur adalah pohon mangga'
d4 = 'Kupu-kupu bertelur untuk berkembang biak'
document = np.array([d1, d2, d3, d4])

# Query
q1 = 'Kupu pohon'
q2 = 'Kupu-kupu pohon'
# query = np.array([q1])
query = np.array([q1, q2])

# Document & Query Combination
document_query = np.append(document, query)

## Preprocessing

In [0]:
def tokenizing(text):
    # Case folding
    text = text.lower()
    
    # Tokenization
    token = re.findall('[A-Za-z]+', text)
    token = np.array(token)
    
    return token

def filtering(token):
    # Stoplist
    stopWordRemoverFactory = StopWordRemoverFactory()
    stopwords = stopWordRemoverFactory.get_stop_words()

    # Remove stopwords
    stopwordRemover = ~np.isin(token, stopwords)
    term = token[stopwordRemover]
    
    return term
  
def stemming(term):
    # Create stemmer
    stemmerFactory = StemmerFactory()
    stemmer = stemmerFactory.create_stemmer()

    # Stem process
    stem = np.vectorize(lambda t: stemmer.stem(t))
    term = stem(term)
    
    return term

def preprocessing(text):
    # Lexical Analysis
    token = tokenizing(text)
    # Filtering
    term = filtering(token)
    # Stemming
    term = stemming(term)
    
    return term
  
def getFeatures(document):
    feature = [preprocessing(text) for text in document]
    feature = [term for listTerm in feature for term in listTerm]
    feature = np.unique(feature)
    return feature

In [0]:
feature = getFeatures(document_query)
feature

array(['atas', 'biak', 'cari', 'kembang', 'kupu', 'mangga', 'pohon',
       'telur', 'tempat', 'terbang'], dtype='<U7')

## Term Weighting

### Term Frequency (tf) Weighting

In [0]:
def tfWeighting(document_query, normalize=True):
    # Extract feature
    feature = getFeatures(document_query)
    
    # Term Frequency Table
    zero_data = np.zeros((document_query.shape[0], feature.shape[0]))
    tf = pd.DataFrame(zero_data, columns=feature)
    for i, text in enumerate(document_query):
        word = preprocessing(text)
        # Weighting process
        term, frequency = np.unique(word, return_counts=True)
        tf.loc[i, term] = frequency
    
    # Log Normalization
    if normalize:
       tf[tf != 0] = 1 + np.log10(tf)
    
    return tf

In [0]:
tf = tfWeighting(document_query)
tf

  app.launch_new_instance()


Unnamed: 0,atas,biak,cari,kembang,kupu,mangga,pohon,telur,tempat,terbang
0,1.0,0.0,0.0,0.0,1.30103,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.30103,1.0,1.30103,1.0,1.0,0.0
3,0.0,1.0,0.0,1.0,1.30103,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,1.30103,0.0,1.0,0.0,0.0,0.0


### Inverse Document Frequency (idf) Weighting

In [0]:
def idfWeighting(term_frequency_document):
    doc_freq = pd.DataFrame({'Document Frequency' : (term_frequency_document != 0).sum(axis=0)})
    doc_freq['Inverse Document Frequency'] = np.log10(term_frequency_document.shape[0] / doc_freq)
    return doc_freq

In [0]:
tf_document = tf.iloc[:-query.shape[0]]
doc_freq = idfWeighting(tf_document)
doc_freq

Unnamed: 0,Document Frequency,Inverse Document Frequency
atas,1,0.60206
biak,1,0.60206
cari,1,0.60206
kembang,1,0.60206
kupu,3,0.124939
mangga,1,0.60206
pohon,3,0.124939
telur,3,0.124939
tempat,1,0.60206
terbang,2,0.30103


### Tf-Idf Weighting

In [0]:
def tfIdfWeighting(document_query, query, normalize_tf=True, normalize_weight=True):
    tf = tfWeighting(document_query, normalize_tf)
    tf_document = tf.iloc[:-query.shape[0]]
    doc_freq = idfWeighting(tf_document)
    tf_idf = tf.mul(doc_freq['Inverse Document Frequency'], axis=1)
    
    if normalize_weight:
      tf_idf_square = tf_idf ** 2
      tf_idf_sum = tf_idf_square.sum(axis=0)
      tf_idf_sqrt = np.sqrt(tf_idf_sum)
      tf_idf = tf_idf / tf_idf_sqrt
    
    return tf_idf

In [0]:
tf_idf = tfIdfWeighting(document_query, query)
tf_idf

  app.launch_new_instance()


Unnamed: 0,atas,biak,cari,kembang,kupu,mangga,pohon,telur,tempat,terbang
0,1.0,0.0,0.0,0.0,0.46672,0.0,0.419123,0.0,0.0,0.707107
1,0.0,0.0,1.0,0.0,0.0,0.0,0.419123,0.57735,0.0,0.707107
2,0.0,0.0,0.0,0.0,0.46672,1.0,0.545292,0.57735,1.0,0.0
3,0.0,1.0,0.0,1.0,0.46672,0.0,0.0,0.57735,0.0,0.0
4,0.0,0.0,0.0,0.0,0.358731,0.0,0.419123,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.46672,0.0,0.419123,0.0,0.0,0.0


## Cosine Similarity

In [0]:
def cosineSimilarity(weight_document, weight_query, normalize_weight=True):
    cosim = []
    if normalize_weight:
      cosim = weight_document.dot(weight_query.T)
    else:
      weight_dot = weight_document.dot(weight_query.T)
      weight_document_square = weight_document ** 2
      weight_query_square = weight_query ** 2
      weight_document_sum = weight_document_square.sum(axis=1)
      weight_query_sum = weight_query_square.sum(axis=1)
      
      weight_sum = pd.DataFrame()
      for i, weight in enumerate(weight_query_sum):
        weight_sum['Query {}'.format(i)] = weight_document_sum * weight
        
      weight_sqrt = np.sqrt(weight_sum)
      cosim = weight_dot / weight_sqrt.values
    
    column_name = [('Query {}'.format(i)) for i in range(weight_query.shape[0])]
    cosim.columns = column_name
    return cosim

In [0]:
tf_idf_document, tf_idf_query = tf_idf.iloc[:-query.shape[0]], tf_idf.iloc[-query.shape[0]:]
cosim = cosineSimilarity(tf_idf_document, tf_idf_query)
cosim

Unnamed: 0,Query 0,Query 1
0,0.343092,0.393492
1,0.175664,0.175664
2,0.395972,0.446372
3,0.167427,0.217828
