# N-Grams: TBMM

Import the required libraries.

In [71]:
import os
import re
from collections import Counter

import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.util import ngrams

Define the helper functions used in the rest of the notebook.

In [210]:
def clean_text(text):
    """Tokenize raw text and drop tokens that are not useful for collocations.

    Steps, in order:
      1. Remove standalone Roman-numeral enumerators such as ``II.`` or ``XIV.``.
      2. Split on runs of non-word characters (punctuation and whitespace).
      3. Discard purely numeric tokens and tokens shorter than two characters.

    Args:
        text: The raw text to clean.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # The leading \b keeps the pattern from eating the tail of an ordinary
    # word that merely ends in I/V/X right before a full stop
    # (e.g. "TAXI." or the Turkish "BAŞKANI.").
    without_numerals = re.sub(r"\b[IVX]+\.", "", text)
    # \W+ also matches whitespace, so the split pieces never contain spaces
    # and can be filtered directly.
    return [
        piece
        for piece in re.split(r"\W+", without_numerals)
        if len(piece) > 1 and not piece.isdigit()
    ]

def load_donem_data(donem_number):
    """Read every ``.txt`` file under a donem directory and return its tokens.

    The directory ``corpus/donem<donem_number>`` is resolved relative to the
    current working directory and walked recursively.

    Args:
        donem_number: Value interpolated into the directory name.

    Returns:
        The list of tokens produced by ``clean_text`` for the combined
        contents of all text files found.
    """
    donem_dir_path = os.path.join(os.getcwd(), f'corpus/donem{donem_number}')
    file_texts = []
    for path, dirs, files in os.walk(donem_dir_path):
        # Sorting in place makes os.walk visit sub-directories in a fixed
        # order, so the resulting token stream is reproducible.
        dirs.sort()
        for file in sorted(files):
            # Skip non-text entries such as .DS_Store (created by macOS).
            if not file.endswith('txt'):
                continue
            file_path = os.path.join(path, file)
            # NOTE(review): assumes the corpus files are UTF-8 encoded; being
            # explicit avoids depending on the platform default - confirm.
            with open(file_path, 'r', encoding='utf-8') as f:
                file_texts.append(f.read())
    # Join with a newline so the last word of one file is never fused with
    # the first word of the next one.
    return clean_text('\n'.join(file_texts))


def tag_collocations(collocations):
    """Attach part-of-speech tags to each (n-gram, frequency) pair.

    Args:
        collocations: Iterable of ``(ngram, frequency)`` pairs, where
            ``ngram`` is a tuple of words (e.g. the items of a ``Counter``
            built over bigrams).

    Returns:
        A list of ``(frequency, ngram, tags)`` tuples in input order, where
        ``tags`` holds one tag per word of ``ngram``.
    """
    freq_collocation_tag = []
    # Iterate the argument itself rather than a name from an outer scope.
    for ngram, frequency in collocations:
        # Each word is tagged on its own (a one-element token list), so the
        # tag does not depend on the neighbouring word.
        # NOTE(review): pos_tag is called with its default settings; verify
        # that the default tagger is appropriate for this (Turkish) corpus.
        tags = tuple(pos_tag([word])[0][1] for word in ngram)
        freq_collocation_tag.append((frequency, ngram, tags))

    return freq_collocation_tag

def pos_filter(tagged_collocations, filter_tags_list):
    """Drop the collocations whose tag tuple is listed in ``filter_tags_list``.

    Args:
        tagged_collocations: Iterable of ``(frequency, collocation, tags)``
            tuples; only the third element is inspected.
        filter_tags_list: Collection of tag tuples to exclude,
            e.g. ``{('NN', 'NN'), ('AN', 'NN')}``.

    Returns:
        A list with the entries that were kept, in their original order.
        The list is also printed before being returned.
    """
    kept = [
        entry
        for entry in tagged_collocations
        if entry[2] not in filter_tags_list
    ]
    print(kept)
    return kept

def collocations_by_frequency(donem_number, top_n=10):
    """Return the most frequent bigrams of a donem, tagged and POS-filtered.

    Args:
        donem_number: Identifier of the donem whose corpus is loaded.
        top_n: How many of the most frequent bigrams to keep before tagging.
            ``None`` keeps all of them.

    Returns:
        A list of ``(frequency, bigram, tags)`` tuples ordered from most to
        least frequent. The POS filter is applied after the ``top_n`` cut,
        so fewer than ``top_n`` entries may be returned.
    """
    tokens = load_donem_data(donem_number)
    bigram_counts = Counter(ngrams(tokens, 2))  # extracting bigrams
    # most_common sorts by count, descending, and truncates in one step.
    most_frequent = bigram_counts.most_common(top_n)
    tagged_collocations = tag_collocations(most_frequent)
    # Tag pairs to exclude from the result.
    pos_filter_list = {('NN', 'NA'), ('AN', 'NN')}
    return pos_filter(tagged_collocations, pos_filter_list)


### Donem<span style="color:red;"> 20</span>

In [212]:
# Compute the tagged, filtered bigrams for donem 20; pos_filter prints the
# resulting list, which is the output shown below this cell.
collocations = collocations_by_frequency(20)

[(15253, ('Türkiye', 'Büyük'), ('NN', 'NN')), (15135, ('Büyük', 'Millet'), ('NN', 'NN')), (14458, ('Sayın', 'Başkan'), ('NN', 'NN')), (11883, ('Millet', 'Meclisi'), ('NN', 'NN')), (10906, ('soru', 'önergesi'), ('NN', 'NN')), (9785, ('sıralarından', 'alkışlar'), ('NN', 'NN')), (8527, ('BAŞKAN', 'Sayın'), ('NN', 'NN')), (7968, ('Meclisi', 'Başkanlığına'), ('NN', 'NN')), (7759, ('yazılı', 'soru'), ('NN', 'NN')), (6701, ('geliş', 'tarihi'), ('NN', 'NN'))]


### Donem<span style="color:red;"> 21</span>

### Donem<span style="color:red;"> 22</span>

### Donem<span style="color:red;"> 23</span>

### Donem<span style="color:red;"> 24</span>

### Donem<span style="color:red;"> 25</span>

### Donem<span style="color:red;"> 26</span>

### Donem<span style="color:red;"> 27</span>