In [1]:
import os
import pickle
import random
import logging
import argparse
import pandas as pd

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

from collections import defaultdict

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from timeit import default_timer as timer
from datetime import timedelta

from models import WordEncoder, Attention, TagEmbedding, WordDecoder, MSVED, KumaMSD
from dataset import MorphologyDatasetTask3, Vocabulary

from kumaraswamy import Kumaraswamy
from hard_kumaraswamy import StretchedAndRectifiedDistribution as HardKumaraswamy

from main import *

In [2]:
# Dataset selection: language, task and split of the file to analyse.
language, f_task, f_type = 'turkish', 'task1', 'train'

# Location of the raw data file, relative to the notebook's working directory.
filepath = '../data/files/{}-{}-{}'.format(
    language,
    f_task,
    f_type,
)

In [3]:
# Load the raw data file and tally how often every source string, target
# string and MSD tag occurs in it.
#
# The data contains non-ASCII characters, so decode with an explicit encoding
# instead of relying on the platform default.
# NOTE(review): assumes the data files are UTF-8 encoded - confirm.
with open(filepath, 'r', encoding='utf-8') as f:
    source = f.read()

# ''.split('\n') yields [''], which would be reported as one line and then
# fail on the field lookups below; treat an empty file as zero lines.
lines = source.strip().split('\n') if source.strip() else []

print('Total lines: {}'.format(len(lines)))

unique_x_s = defaultdict(int)  # source string -> number of lines it appears on
unique_x_t = defaultdict(int)  # target string -> number of lines it appears on
unique_x   = defaultdict(int)  # source and target strings counted together
unique_msd = defaultdict(int)  # individual MSD tag -> number of occurrences

for line in lines:
    # Each line is expected to hold at least three tab-separated fields:
    # source string, comma-separated MSD tags, target string.
    words = line.strip().split('\t')
    msds  = words[1].strip().split(',')

    unique_x_s[words[0]] += 1
    unique_x_t[words[2]] += 1
    unique_x[words[0]]   += 1
    unique_x[words[2]]   += 1

    for msd in msds:
        unique_msd[msd]  += 1

Total lines: 12336


In [4]:
# Report how many distinct keys each frequency table holds.
summary = (
    ('Unique source str: {}', unique_x_s),
    ('Unique target str: {}', unique_x_t),
    ('Unique words     : {}', unique_x),
    ('Unique MSDs      : {}', unique_msd),
)

for template, table in summary:
    print(template.format(len(table)))

print('\n')

Unique source str: 2353
Unique target str: 12005
Unique words     : 14231
Unique MSDs      : 31




In [5]:
# Keep only the source strings that occur more than 12 times in the file.
reduced_d = {}
for stem, count in unique_x_s.items():
    if count > 12:
        reduced_d[stem] = count

print(reduced_d)

{'alamet': 13, 'arzu': 13, 'bakma': 16, 'bohça': 13, 'dernek': 13, 'düşman': 14, 'elbise': 13, 'kas': 14, 'keçi': 15, 'kibutz': 13, 'lastik': 13, 'müellif': 13, 'nedime': 14, 'sağ': 13, 'yeterlilik': 13}


In [6]:
# Copy every line of the data file whose source string is a key of
# `reduced_d` (the source strings occurring more than 12 times) into a
# separate file.
#
# Decode with an explicit encoding rather than the platform default, since the
# data contains non-ASCII characters.
# NOTE(review): assumes the data files are UTF-8 encoded - confirm.
with open(filepath, 'r', encoding='utf-8') as f:
    source = f.read()

lines = source.strip().split('\n')
keys  = reduced_d.keys()

# A line is kept when its first tab-separated field is one of the frequent
# source strings.
output = [line for line in lines if line.strip().split('\t')[0] in keys]

# Plain write mode is sufficient (the file is never read back here), and the
# explicit encoding keeps the write from failing on locales whose default
# codec cannot represent every character in the data.
with open('../data/{}_stem_greater_12'.format(language), 'w', encoding='utf-8') as f:
    f.write('\n'.join(output))