# TBMM Dataset

### Importing Libraries

In [None]:
import os
import re
from collections import Counter

import plotly.express as px
from nltk.corpus import stopwords
from nltk.util import ngrams

### Cleaning the Data

In [None]:
#cleaning data
def cleanData_with_stopwords(text):
    """Tokenize ``text`` into lowercase words, keeping stop words.

    The text is stripped of Roman-numeral markers (one or more of I, V, X
    followed by a dot), split on runs of non-word characters, and then
    filtered: purely numeric pieces and pieces shorter than two characters
    are dropped. The surviving pieces are lowercased.

    Args:
        text: Raw text to clean.

    Returns:
        A list of lowercase tokens in their original order.
    """
    without_numerals = re.sub("[IVX]+\\.", "", text)  # roman numbers
    pieces = re.split(r'\W+', without_numerals)  # punctuation
    # Pieces contain only word characters, so a length check also removes
    # the empty strings that re.split leaves at the edges.
    return [
        piece.lower()
        for piece in pieces
        if not piece.isdigit() and len(piece) > 1
    ]

In [None]:
#cleaning data
def cleanData_without_stopwords(text):
    """Tokenize ``text`` into lowercase words with Turkish stop words removed.

    The text is stripped of Roman-numeral markers (one or more of I, V, X
    followed by a dot), split on runs of non-word characters, and then
    filtered: purely numeric pieces, pieces shorter than two characters and
    NLTK Turkish stop words are dropped. The surviving pieces are lowercased.

    Args:
        text: Raw text to clean.

    Returns:
        A list of lowercase tokens in their original order.
    """
    without_numerals = re.sub("[IVX]+\\.", "", text)  # roman numbers
    pieces = re.split(r'\W+', without_numerals)  # punctuation
    stop_words = set(stopwords.words('turkish'))
    tokensList = []
    for piece in pieces:
        if piece.isdigit() or len(piece) <= 1:  # numbers and 1-char tokens
            continue
        lowered = piece.lower()
        # Test the lowercased form: the tokens returned are lowercase, so a
        # capitalised stop word (e.g. "Ve" at the start of a sentence) must
        # be filtered as well instead of slipping through as "ve".
        if lowered not in stop_words:
            tokensList.append(lowered)
    return tokensList

### Loading the Data

In [None]:
#loading the data for each donem according to its number
def loadData(donemNumber, stopwords, baseDir="/Users/nurhandeakyuz/Downloads/NLP"):
    """Read and tokenize every text file found under one donem directory.

    The expected layout is ``<baseDir>/<donemNumber>/<yil>/<textFile>``.
    Directory listings are sorted so the token order is reproducible.
    Tokens of consecutive files are concatenated into one list, so n-grams
    built from the result may span a file boundary.

    Args:
        donemNumber: Name of the donem directory, e.g. ``'donem20'``.
        stopwords: If truthy, stop words are kept
            (``cleanData_with_stopwords``); otherwise they are removed
            (``cleanData_without_stopwords``).
        baseDir: Root directory that contains the donem directories.

    Returns:
        A list with the cleaned tokens of all files; empty when the donem
        directory contains no files.

    Raises:
        OSError: If the donem directory does not exist or a file cannot be
            read.
    """
    clean = cleanData_with_stopwords if stopwords else cleanData_without_stopwords
    donemDir = os.path.join(baseDir, donemNumber)
    tokens = []
    for yil in sorted(os.listdir(donemDir)):
        yilDir = os.path.join(donemDir, yil)
        if not os.path.isdir(yilDir):
            continue  # stray files next to the year directories
        for textFile in sorted(os.listdir(yilDir)):
            filePath = os.path.join(yilDir, textFile)
            if not os.path.isfile(filePath):
                continue
            # NOTE(review): relies on the platform default encoding, as
            # before - confirm the corpus encoding and pass it explicitly.
            with open(filePath, "r") as fileHandler:
                tokens.extend(clean(fileHandler.read()))
    # Return only after every file has been processed; returning inside the
    # loop would stop after the very first file.
    return tokens

### Extracting n-grams with Stopwords

In [None]:
#extracting nGrams
def extractNGrams(donemNumber, nGrams, stopwords=True):
    """Return the n-grams of one donem as a list of tuples.

    Args:
        donemNumber: Name of the donem directory, passed on to ``loadData``.
        nGrams: Size of the n-grams (1 = unigram, 2 = bigram, ...).
        stopwords: Passed on to ``loadData``; truthy keeps stop words.

    Returns:
        A list of ``nGrams``-sized tuples built from the loaded tokens.
    """
    return list(ngrams(loadData(donemNumber, stopwords), nGrams))

### For Each Donem

In [None]:
# Accumulators for the number of n-grams that occur exactly once;
# the cells below append one entry per donem.
unigram_uniques, bigram_uniques, trigram_uniques = [], [], []

### Donem 20

In [None]:
# 1-, 2- and 3-gram lists for donem20 (default settings: stop words kept).
unigram_20, bigram_20, trigram_20 = (
    extractNGrams('donem20', size) for size in (1, 2, 3)
)

In [None]:
# NOTE: stop-word reference is bad! Only "ve" is being removed.
unigram_20_WS = extractNGrams('donem20', 1, stopwords=False)
# Counter tallies every unigram in one pass; calling list.count() for each
# element rescans the whole list every time (quadratic).
frequencyForUnigram20_WS = dict(Counter(unigram_20_WS))
# Ten most frequent entries, sorted ascending by count.
topFrequencyUnigram20_WS = sorted(frequencyForUnigram20_WS.items(), key=lambda kv: kv[1])[-10:]

In [None]:
# Cell output: the ten most frequent stop-word-filtered unigrams of donem20,
# as (unigram, count) pairs sorted ascending by count.
topFrequencyUnigram20_WS

In [None]:
### N-grams that occur exactly once in DONEM 20
# Counter tallies each list in one pass; calling list.count() for each
# element rescans the whole list every time (quadratic).
frequencyForUnigram20 = dict(Counter(unigram_20))
frequencyForBigram20 = dict(Counter(bigram_20))
frequencyForTrigram20 = dict(Counter(trigram_20))
unique_unigram20 = sum(value == 1 for value in frequencyForUnigram20.values())
unigram_uniques.append(unique_unigram20)
unique_bigram20 = sum(value == 1 for value in frequencyForBigram20.values())
bigram_uniques.append(unique_bigram20)
unique_trigram20 = sum(value == 1 for value in frequencyForTrigram20.values())
trigram_uniques.append(unique_trigram20)

In [None]:
# Plotting the top 10 frequent words in Unigram _ donem20
topFrequencyUnigram20_S = sorted(frequencyForUnigram20.items(), key=lambda kv: kv[1])[-10:]
# Also expose the list under the un-suffixed name the loop originally
# referred to (it was never defined, which raised a NameError).
topFrequencyUnigram20 = topFrequencyUnigram20_S
# Each key is a 1-tuple, so flattening yields one word per entry; the
# [::-1] puts the most frequent word first.
wordsUnigram20 = [word for gram, _ in topFrequencyUnigram20 for word in gram][::-1]
freqUnigram20 = [count for _, count in topFrequencyUnigram20][::-1]
fig = px.bar(x=wordsUnigram20, y=freqUnigram20, labels={'x':'words', 'y':'frequency of words'},
             title='Top 10 Frequent Words for Unigrams', color_discrete_sequence=['indianred'])
fig.show()

In [None]:
## Top 10 most frequent bigrams in donem20 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForBigram20 = dict(Counter(bigram_20))
topFrequencyBigram20 = sorted(frequencyForBigram20.items(), key=lambda kv: kv[1])[-10:]

### Donem 21

In [None]:
# 1-, 2- and 3-gram lists for donem21 (default settings: stop words kept).
unigram_21, bigram_21, trigram_21 = (
    extractNGrams('donem21', size) for size in (1, 2, 3)
)

In [None]:
### N-grams that occur exactly once in DONEM 21
# Counter tallies each list in one pass; calling list.count() for each
# element rescans the whole list every time (quadratic).
frequencyForUnigram21 = dict(Counter(unigram_21))
frequencyForBigram21 = dict(Counter(bigram_21))
frequencyForTrigram21 = dict(Counter(trigram_21))
unique_unigram21 = sum(value == 1 for value in frequencyForUnigram21.values())
unigram_uniques.append(unique_unigram21)
unique_bigram21 = sum(value == 1 for value in frequencyForBigram21.values())
bigram_uniques.append(unique_bigram21)
unique_trigram21 = sum(value == 1 for value in frequencyForTrigram21.values())
trigram_uniques.append(unique_trigram21)

In [None]:
## Top 10 most frequent unigrams in donem21 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForUnigram21 = dict(Counter(unigram_21))
topFrequencyUnigram21 = sorted(frequencyForUnigram21.items(), key=lambda kv: kv[1])[-10:]

In [None]:
## Top 10 most frequent bigrams in donem21 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForBigram21 = dict(Counter(bigram_21))
topFrequencyBigram21 = sorted(frequencyForBigram21.items(), key=lambda kv: kv[1])[-10:]

In [None]:
## Top 10 most frequent trigrams in donem21 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForTrigram21 = dict(Counter(trigram_21))
topFrequencyTrigram21 = sorted(frequencyForTrigram21.items(), key=lambda kv: kv[1])[-10:]

### Donem 22

In [None]:
# 1-, 2- and 3-gram lists for donem22 (default settings: stop words kept).
unigram_22, bigram_22, trigram_22 = (
    extractNGrams('donem22', size) for size in (1, 2, 3)
)

In [None]:
### N-grams that occur exactly once in DONEM 22
# Counter tallies each list in one pass; calling list.count() for each
# element rescans the whole list every time (quadratic).
frequencyForUnigram22 = dict(Counter(unigram_22))
frequencyForBigram22 = dict(Counter(bigram_22))
frequencyForTrigram22 = dict(Counter(trigram_22))
unique_unigram22 = sum(value == 1 for value in frequencyForUnigram22.values())
unigram_uniques.append(unique_unigram22)
unique_bigram22 = sum(value == 1 for value in frequencyForBigram22.values())
bigram_uniques.append(unique_bigram22)
unique_trigram22 = sum(value == 1 for value in frequencyForTrigram22.values())
trigram_uniques.append(unique_trigram22)

In [None]:
## Top 10 most frequent unigrams in donem22 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForUnigram22 = dict(Counter(unigram_22))
topFrequencyUnigram22 = sorted(frequencyForUnigram22.items(), key=lambda kv: kv[1])[-10:]

In [None]:
## Top 10 most frequent bigrams in donem22 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForBigram22 = dict(Counter(bigram_22))
topFrequencyBigram22 = sorted(frequencyForBigram22.items(), key=lambda kv: kv[1])[-10:]

In [None]:
## Top 10 most frequent trigrams in donem22 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForTrigram22 = dict(Counter(trigram_22))
topFrequencyTrigram22 = sorted(frequencyForTrigram22.items(), key=lambda kv: kv[1])[-10:]

### Donem 23

In [None]:
# 1-, 2- and 3-gram lists for donem23 (default settings: stop words kept).
unigram_23, bigram_23, trigram_23 = (
    extractNGrams('donem23', size) for size in (1, 2, 3)
)

In [None]:
### N-grams that occur exactly once in DONEM 23
# Counter tallies each list in one pass; calling list.count() for each
# element rescans the whole list every time (quadratic).
frequencyForUnigram23 = dict(Counter(unigram_23))
frequencyForBigram23 = dict(Counter(bigram_23))
frequencyForTrigram23 = dict(Counter(trigram_23))
unique_unigram23 = sum(value == 1 for value in frequencyForUnigram23.values())
unigram_uniques.append(unique_unigram23)
unique_bigram23 = sum(value == 1 for value in frequencyForBigram23.values())
bigram_uniques.append(unique_bigram23)
unique_trigram23 = sum(value == 1 for value in frequencyForTrigram23.values())
trigram_uniques.append(unique_trigram23)

In [None]:
## Top 10 most frequent unigrams in donem23 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForUnigram23 = dict(Counter(unigram_23))
topFrequencyUnigram23 = sorted(frequencyForUnigram23.items(), key=lambda kv: kv[1])[-10:]

In [None]:
## Top 10 most frequent bigrams in donem23 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForBigram23 = dict(Counter(bigram_23))
topFrequencyBigram23 = sorted(frequencyForBigram23.items(), key=lambda kv: kv[1])[-10:]

In [None]:
## Top 10 most frequent trigrams in donem23 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForTrigram23 = dict(Counter(trigram_23))
topFrequencyTrigram23 = sorted(frequencyForTrigram23.items(), key=lambda kv: kv[1])[-10:]

### Donem 24

In [None]:
# 1-, 2- and 3-gram lists for donem24 (default settings: stop words kept).
unigram_24, bigram_24, trigram_24 = (
    extractNGrams('donem24', size) for size in (1, 2, 3)
)

In [None]:
### N-grams that occur exactly once in DONEM 24
# Counter tallies each list in one pass; calling list.count() for each
# element rescans the whole list every time (quadratic).
frequencyForUnigram24 = dict(Counter(unigram_24))
frequencyForBigram24 = dict(Counter(bigram_24))
frequencyForTrigram24 = dict(Counter(trigram_24))
unique_unigram24 = sum(value == 1 for value in frequencyForUnigram24.values())
unigram_uniques.append(unique_unigram24)
unique_bigram24 = sum(value == 1 for value in frequencyForBigram24.values())
bigram_uniques.append(unique_bigram24)
unique_trigram24 = sum(value == 1 for value in frequencyForTrigram24.values())
trigram_uniques.append(unique_trigram24)

In [None]:
## Top 10 most frequent unigrams in donem24 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForUnigram24 = dict(Counter(unigram_24))
topFrequencyUnigram24 = sorted(frequencyForUnigram24.items(), key=lambda kv: kv[1])[-10:]

In [None]:
## Top 10 most frequent bigrams in donem24 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForBigram24 = dict(Counter(bigram_24))
topFrequencyBigram24 = sorted(frequencyForBigram24.items(), key=lambda kv: kv[1])[-10:]

In [None]:
## Top 10 most frequent trigrams in donem24 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForTrigram24 = dict(Counter(trigram_24))
topFrequencyTrigram24 = sorted(frequencyForTrigram24.items(), key=lambda kv: kv[1])[-10:]

### Donem 25

In [None]:
# 1-, 2- and 3-gram lists for donem25 (default settings: stop words kept).
unigram_25, bigram_25, trigram_25 = (
    extractNGrams('donem25', size) for size in (1, 2, 3)
)

In [None]:
### N-grams that occur exactly once in DONEM 25
# Counter tallies each list in one pass; calling list.count() for each
# element rescans the whole list every time (quadratic).
frequencyForUnigram25 = dict(Counter(unigram_25))
frequencyForBigram25 = dict(Counter(bigram_25))
frequencyForTrigram25 = dict(Counter(trigram_25))
unique_unigram25 = sum(value == 1 for value in frequencyForUnigram25.values())
unigram_uniques.append(unique_unigram25)
unique_bigram25 = sum(value == 1 for value in frequencyForBigram25.values())
bigram_uniques.append(unique_bigram25)
unique_trigram25 = sum(value == 1 for value in frequencyForTrigram25.values())
trigram_uniques.append(unique_trigram25)

In [None]:
## Top 10 most frequent unigrams in donem25 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForUnigram25 = dict(Counter(unigram_25))
topFrequencyUnigram25 = sorted(frequencyForUnigram25.items(), key=lambda kv: kv[1])[-10:]

In [None]:
## Top 10 most frequent bigrams in donem25 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForBigram25 = dict(Counter(bigram_25))
topFrequencyBigram25 = sorted(frequencyForBigram25.items(), key=lambda kv: kv[1])[-10:]

In [None]:
## Top 10 most frequent trigrams in donem25 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForTrigram25 = dict(Counter(trigram_25))
topFrequencyTrigram25 = sorted(frequencyForTrigram25.items(), key=lambda kv: kv[1])[-10:]

### Donem 26

In [None]:
# 1-, 2- and 3-gram lists for donem26 (default settings: stop words kept).
unigram_26, bigram_26, trigram_26 = (
    extractNGrams('donem26', size) for size in (1, 2, 3)
)

In [None]:
### N-grams that occur exactly once in DONEM 26
# Count within the donem26 lists themselves (the counts were previously
# looked up in the donem20 lists). Counter tallies each list in one pass
# instead of calling list.count() for every element.
frequencyForUnigram26 = dict(Counter(unigram_26))
frequencyForBigram26 = dict(Counter(bigram_26))
frequencyForTrigram26 = dict(Counter(trigram_26))
unique_unigram26 = sum(value == 1 for value in frequencyForUnigram26.values())
unigram_uniques.append(unique_unigram26)
unique_bigram26 = sum(value == 1 for value in frequencyForBigram26.values())
bigram_uniques.append(unique_bigram26)
unique_trigram26 = sum(value == 1 for value in frequencyForTrigram26.values())
trigram_uniques.append(unique_trigram26)

In [None]:
## Top 10 most frequent unigrams in donem26 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForUnigram26 = dict(Counter(unigram_26))
topFrequencyUnigram26 = sorted(frequencyForUnigram26.items(), key=lambda kv: kv[1])[-10:]

In [None]:
## Top 10 most frequent bigrams in donem26 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForBigram26 = dict(Counter(bigram_26))
topFrequencyBigram26 = sorted(frequencyForBigram26.items(), key=lambda kv: kv[1])[-10:]

In [None]:
## Top 10 most frequent trigrams in donem26 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForTrigram26 = dict(Counter(trigram_26))
topFrequencyTrigram26 = sorted(frequencyForTrigram26.items(), key=lambda kv: kv[1])[-10:]

### Donem 27

In [None]:

# 1-, 2- and 3-gram lists for donem27 (default settings: stop words kept).
unigram_27, bigram_27, trigram_27 = (
    extractNGrams('donem27', size) for size in (1, 2, 3)
)

In [None]:
### N-grams that occur exactly once in DONEM 27
# Counter tallies each list in one pass; calling list.count() for each
# element rescans the whole list every time (quadratic).
frequencyForUnigram27 = dict(Counter(unigram_27))
frequencyForBigram27 = dict(Counter(bigram_27))
frequencyForTrigram27 = dict(Counter(trigram_27))
unique_unigram27 = sum(value == 1 for value in frequencyForUnigram27.values())
unigram_uniques.append(unique_unigram27)
unique_bigram27 = sum(value == 1 for value in frequencyForBigram27.values())
bigram_uniques.append(unique_bigram27)
unique_trigram27 = sum(value == 1 for value in frequencyForTrigram27.values())
trigram_uniques.append(unique_trigram27)

In [None]:
## Top 10 most frequent unigrams in donem27 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForUnigram27 = dict(Counter(unigram_27))
topFrequencyUnigram27 = sorted(frequencyForUnigram27.items(), key=lambda kv: kv[1])[-10:]

In [None]:
## Top 10 most frequent bigrams in donem27 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForBigram27 = dict(Counter(bigram_27))
topFrequencyBigram27 = sorted(frequencyForBigram27.items(), key=lambda kv: kv[1])[-10:]

In [None]:
## Top 10 most frequent trigrams in donem27 (sorted ascending by count)
# Counter tallies in one pass instead of list.count() per element.
frequencyForTrigram27 = dict(Counter(trigram_27))
topFrequencyTrigram27 = sorted(frequencyForTrigram27.items(), key=lambda kv: kv[1])[-10:]

## Comparing the Number of Unique N-grams in Each Donem

In [None]:
import plotly.graph_objects as go

# Grouped bar chart: for each donem, how many unigrams, bigrams and trigrams
# occur exactly once.
# x-axis categories, in the order the per-donem counts were appended
# (donem 20 through donem 27). The traces previously used an undefined
# name for the x values.
donemLabels = [f"Donem {number}" for number in range(20, 28)]

fig = go.Figure()
for counts, traceName, colour in (
    (unigram_uniques, 'unigram_uniques', 'rgb(155, 83, 109)'),
    (bigram_uniques, 'bigram_uniques', 'rgb(229, 204, 255)'),
    (trigram_uniques, 'trigram_uniques', 'rgb(0, 204, 204)'),
):
    fig.add_trace(go.Bar(x=donemLabels,
                         y=counts,
                         name=traceName,
                         marker_color=colour))

fig.update_layout(
    title='Uniques of each Ngram',
    xaxis_tickfont_size=14,
    yaxis=dict(
        # title.font replaces the deprecated axis-level titlefont attribute
        title=dict(text='Total number of uniques', font=dict(size=16)),
        tickfont_size=14,
    ),
    legend=dict(
        x=1,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='group',
    bargap=0.15, # gap between bars of adjacent location coordinates.
    bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()

### For Whole Corpus

In [None]:
# Corpus-wide n-gram lists: the per-donem lists joined in order, donem 20
# through donem 27.
Unigram_corpus = [
    gram
    for part in (unigram_20, unigram_21, unigram_22, unigram_23,
                 unigram_24, unigram_25, unigram_26, unigram_27)
    for gram in part
]
Bigram_corpus = [
    gram
    for part in (bigram_20, bigram_21, bigram_22, bigram_23,
                 bigram_24, bigram_25, bigram_26, bigram_27)
    for gram in part
]
Trigram_corpus = [
    gram
    for part in (trigram_20, trigram_21, trigram_22, trigram_23,
                 trigram_24, trigram_25, trigram_26, trigram_27)
    for gram in part
]

### Plotting the top 10 frequent words in the whole corpus

In [None]:
# Plot the ten most frequent unigrams of the whole corpus.
# Counter tallies the corpus in one pass; calling list.count() for each
# element rescans the whole list every time (quadratic).
frequencyForUnigram_corpus = dict(Counter(Unigram_corpus))
topFrequencyUnigram_corpus = sorted(frequencyForUnigram_corpus.items(), key=lambda kv: kv[1])[-10:]
# Each key is a 1-tuple, so flattening yields one word per entry; the
# [::-1] puts the most frequent word first.
wordsUnigramCorpus = [word for gram, _ in topFrequencyUnigram_corpus for word in gram][::-1]
freqUnigramCorpus = [count for _, count in topFrequencyUnigram_corpus][::-1]
fig = px.bar(x=wordsUnigramCorpus, y=freqUnigramCorpus, labels={'x':'words', 'y':'frequency of words'},
             title='Top 10 Frequent Words for Unigrams', color_discrete_sequence=['indianred'])
fig.show()