# Basic NLP with Python and NLTK - 2nd notebook
Bruni Francesco (@brunifrancesco)

This notebook @ https://github.com/brunifrancesco/nltk_base.git

## Previous notebook

- Basic Python (functions, simple data structures, handling files)
- Nltk foundations: tokenizing/cleaning/stemming

## This notebook

- PMI computation
- Writing result to CSV file

## Pointwise Mutual Information

<br />
$${\displaystyle PMI(X=x,Y=y)=\log {\frac {p(X=x,Y=y)}{p(X=x)p(Y=y)}}}$$
<br />
<br />
where (*X and Y are independent*)
<br />
<br />
<br />
$${\displaystyle p(X=x,Y=y) = p(X=x)*p(Y=y)}$$


## Proposed pipeline

- Read from csv
- Preprocess data (tokenize, lower, remove stopwords, punctuation)
- Find frequency distribution for unigrams
- Find frequency distribution for bigrams
- Compute PMI via implemented function
- Let NLTK sort bigrams by PMI metric
- Write result to CSV file

In [1]:
import nltk
from nltk.corpus import stopwords
import string
import random
from itertools import chain
import math
import csv
import time


def read_data():
    """
    Read data 'libe by line'"""
    with open('data.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            yield row


In [133]:
def preprocess(data):
    """
    Preprocess data, filtering out stopwords, punctuation and lowering 
    all splitted tokens
    
    :param data: the string data to be processed
    """
    italian_stopwords = stopwords.words('italian')
    splitted_chunks = data.split()
    lowered_chunks = (item.lower() for item in splitted_chunks)
    chunks_without_punctuation = (chunk for chunk in lowered_chunks if chunk not in string.punctuation)
    chunks_without_stopwords = (chunk for chunk in chunks_without_punctuation if chunk not in italian_stopwords)
    return list(chunks_without_stopwords)

In [123]:
FREQUENCY_TRESHOLD = 2

def find_bigrams(splitted_chunks):
    """
    Find bigrams and filter them by frequency threshold
    
    :param splitted_chunks: a list of chunks
    """
    bigrams = nltk.collocations.BigramCollocationFinder.from_words(splitted_chunks)
    bigrams.apply_freq_filter(FREQUENCY_TRESHOLD)
    return {bigram: freq for bigram, freq in bigrams.ngram_fd.items()}

def find_unigrams(splitted_chunks):
    """
    Find unigrams and filter them by frequency threshold
    
    :param splitted_chunks: a list of chunks
    """
    unigrams = nltk.FreqDist(splitted_chunks)
    return {unigram: freq for unigram, freq in unigrams.items() if freq > FREQUENCY_TRESHOLD - 1}

In [100]:
def pmi(word1, word2, unigram_freq, bigram_freq):
    """
    Find PMI measure
    
    :param word1: the first word
    :param word2: the second word
    :param unigram_freq: the unigram frequency container
    :param bigram_freq: the bigram frequency container
    
    """
    prob_word1 = unigram_freq[word1] / sum(unigram_freq.values())
    prob_word2 = unigram_freq[word2] / sum(unigram_freq.values())
    prob_word1_word2 = bigram_freq[(word1, word2)] / sum(bigram_freq.values())
    a = prob_word1_word2/prob_word1*prob_word2
    return round(math.log(a,2),2)

In [124]:
def find_best_bigrams(chunks):
    """
    Find best bigrams via nltk bigram metric association
    
    :param splitted_chunks: a list of chunks
    """
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = nltk.collocations.BigramCollocationFinder.from_words(chunks)
    finder.apply_freq_filter(FREQUENCY_TRESHOLD)
    return finder.nbest(bigram_measures.pmi, 10)

In [152]:
def write_data(result):
    """
    Write result to CSV file
    
    :param result: the list to be written to csv file
    """
    with open("result.csv", "a") as output:
        writer = csv.writer(output, delimiter='*')
        for row in result:
            writer.writerow(row)

In [153]:
import itertools
import pprint

# get the first five rows
top_five_rows = itertools.islice(read_data(), 1, 30)

# iterate over the rows and apply the pipeline
for row in top_five_rows:
    
    splitted_chunks = preprocess(row[3])
    bigrams = find_bigrams(splitted_chunks)
    unigrams = find_unigrams(splitted_chunks)
    data = [
        (" ".join(key), pmi(key[0], key[1],unigrams, bigrams )) for key in bigrams.keys()
    ]
    nltk_data = find_best_bigrams(splitted_chunks)
    
    # if somethin 'valuable' has been found, let's print it out
    if data:
        print()
        print()
        print("Processing row: %s..." %row[3][:20])
        print("*****************************************************************************************************")
        pprint.pprint("Found (via pmi computation): %s" %data)
        print("**********************************************************************")
        pprint.pprint("Found (via nltk pmi metric association): %s" %nltk_data)
        print("*****************************************************************************************************")
        print()
        print()
        print("-----------------------------------------------------------------------------------------------------")
        write_data(data)



Processing row: (FI),ricorda che nel...
*****************************************************************************************************
"Found (via pmi computation): [('reato tortura', -0.42)]"
**********************************************************************
"Found (via nltk pmi metric association): [('reato', 'tortura')]"
*****************************************************************************************************


-----------------------------------------------------------------------------------------------------


Processing row: rileva che l'estensi...
*****************************************************************************************************
"Found (via pmi computation): [('reato tortura', 0.0)]"
**********************************************************************
"Found (via nltk pmi metric association): [('reato', 'tortura')]"
*****************************************************************************************************


--------------