<a href="https://colab.research.google.com/github/akihiros/ngrams/blob/master/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 準備

In [0]:
# Mount Google Drive; this cell assumes the notebook is running on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
!pip install mecab-python3

In [0]:
# Adjust these paths to match your environment.
# Corpus files read by the character-bigram cell (English, French, German, Spanish).
FIFAnews_E = '/content/drive/My Drive/data/NLP/text/FIFAnews-English.txt'
FIFAnews_F = '/content/drive/My Drive/data/NLP/text/FIFAnews-French.txt'
FIFAnews_G = '/content/drive/My Drive/data/NLP/text/FIFAnews-German.txt'
FIFAnews_S = '/content/drive/My Drive/data/NLP/text/FIFAnews-Spanish.txt'
# File read by the part-of-speech counting cell.
result_text = '/content/drive/My Drive/data/NLP/result.txt'
# File read by the MeCab morphological-analysis cell.
test_text = '/content/drive/My Drive/data/NLP/test.txt'

# 3-5

- 文字列バイグラムの頻度を求める

In [0]:
import codecs
import collections


def n_gram(target, n):
    """Return every contiguous window of length ``n`` taken from ``target``.

    Args:
        target: Any sliceable sequence (a string, a list, ...).
        n(int): Window size; 1 gives unigrams, 2 bigrams, 3 trigrams.

    Returns:
        list: The windows in left-to-right order. Each window has the type
        produced by slicing ``target``. The list is empty when ``target``
        is shorter than ``n``.
    """
    window_count = len(target) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(target[start:start + n])
    return grams


def counter_n(target):
    """Print the bigrams of ``target`` together with their frequencies.

    Three things are printed, one per line: the list of bigrams, the
    ``collections.Counter`` built from that list, and an empty string
    (which yields a trailing blank line). Nothing is returned.

    Args:
        target(str): Text to split into bigrams.
    """
    # Build the bigram list once and reuse it for both counting and printing.
    bigrams = n_gram(target, 2)
    c = collections.Counter(bigrams)
    print(bigrams, c, '', sep='\n')


# Print the bigram statistics of each corpus file in turn
# (English, French, German, Spanish); every file is decoded as Shift-JIS.
for corpus_path in (FIFAnews_E, FIFAnews_F, FIFAnews_G, FIFAnews_S):
    with codecs.open(corpus_path, 'r', 'shift-jis') as f:
        counter_n(f.read())


# 3-6

- 品詞の頻度をカウントする
- 連続する品詞の確率を計算する

In [0]:
# # 手がかり句による手法
# import codecs

# # utf-8でエラー吐いたら引数に'shift-jis'を入れること
# with codecs.open(result_text, 'r') as f:
#     countbox = list()
#     print('リストの中身は{}個です'.format(len(countbox)))
#     line = f.readline()
#     while line:
#         if '名詞' in line:
#             countbox.append('名詞')
#         elif '動詞' in line:
#             countbox.append('動詞')
#         elif '副詞' in line:
#             countbox.append('副詞')
#         elif '助詞' in line:
#             countbox.append('助詞')
#         elif '助動詞' in line:
#             countbox.append('助動詞')
#         elif '接頭詞' in line:
#             countbox.append('接頭詞')
#         elif '接続詞' in line:
#             countbox.append('接続詞')
#         elif '記号' in line:
#             countbox.append('記号')
#         elif 'EOS' in line:
#             countbox.append('EOS')
#         line = f.readline()
#     print('リストの中身は{}個です'.format(len(countbox)))

In [0]:
# Alternative approach: cut the tag out of a fixed position in each line.
# The tag is the text before the first comma of the second tab-separated
# column (presumably the part of speech in MeCab-style output -- confirm
# against the file that produced result_text).
import codecs

# No encoding is passed, so the platform default is used. If decoding fails,
# pass 'shift-jis' as the encoding argument.
with codecs.open(result_text, 'r') as f:
    lines = f.readlines()

countbox = list()
print('リストの中身は{}個です'.format(len(countbox)))

seen_tags = set()
for row in lines:
    columns = row.split('\t')
    # Lines without a tab carry no tag column; they are skipped unless
    # they contain the EOS marker.
    tag = columns[1].split(',')[0] if len(columns) > 1 else None
    if tag is not None:
        seen_tags.add(tag)

    if 'EOS' in row:
        # Any line containing EOS is recorded as a sentence boundary.
        countbox.append('EOS')
    elif tag is not None:
        countbox.append(tag)

# Distinct tags found in the file (order is arbitrary, as with any set).
x_list = list(seen_tags)

print('リストの中身は{}個です'.format(len(countbox)))

In [0]:
from collections import defaultdict
import collections


def n_gram(target, n):
    """Slice ``target`` into all of its contiguous length-``n`` windows.

    Args:
        target: Any sliceable sequence; this notebook passes a list of tags.
        n(int): Window size; 1 gives unigrams, 2 bigrams, 3 trigrams.

    Returns:
        list: Windows in order of their start index, each of the type that
        slicing ``target`` produces. Empty when ``target`` is shorter
        than ``n``.
    """
    starts = range(len(target) - n + 1)
    return list(map(lambda start: target[start:start + n], starts))


def get_unique_list(seq):
    """Drop repeated elements from ``seq``, keeping first-seen order.

    Membership is tested with ``in`` against a list, so the elements may be
    unhashable (for example the list-valued n-grams built from a list).

    Args:
        seq(list): Elements to de-duplicate, e.g. a list of n-grams.

    Returns:
        list: The distinct elements in order of first appearance.
    """
    uniques = []
    for item in seq:
        if item in uniques:
            continue
        uniques.append(item)
    return uniques


def count_n_gram(target, n):
    """Count how often each distinct n-gram occurs in ``target``.

    Args:
        target(list): N-grams, duplicates included.
        n(int): Gram size. Not used by the computation; kept so existing
            calls keep working.

    Returns:
        tuple: ``(unique_target, counter)`` where ``unique_target`` holds
        the distinct n-grams in first-seen order and ``counter[i]`` is the
        number of occurrences of ``unique_target[i]`` in ``target``.
    """
    unique_target = get_unique_list(target)
    counter = [0] * len(unique_target)
    for gram in target:
        for slot, candidate in enumerate(unique_target):
            if gram == candidate:
                counter[slot] += 1
                # Entries of unique_target are distinct, so no later entry
                # can match this gram; stop scanning.
                break

    return unique_target, counter
    

# Bigrams over the tag sequence, and the per-tag frequencies.
b = n_gram(countbox, 2)
c = collections.Counter(countbox)
# Distinct bigrams and, aligned by index, how often each occurs.
uniqu_target, counter = count_n_gram(b, 2)

# First ten tags, the tag frequencies, all bigrams, then two blank lines.
print(countbox[:10])
print(c)
print(b)
print('\n')
# Distinct bigrams on one line, their counts on the next.
print(uniqu_target)
print(counter)

In [0]:
import numpy as np

# counter[i] is the number of occurrences of uniqu_target[i].
x = np.array(counter)

# Indices of the distinct bigrams from most to least frequent; sorted once.
order = np.argsort(x)[::-1]
# Total number of bigrams, used as the denominator of the relative frequency.
total = x.sum()

for idx in order:
    # Label with the distinct bigram this count belongs to, and report its
    # share of all bigrams as a percentage.
    print('{} {} : {}%'.format(uniqu_target[idx], x[idx], x[idx] / total * 100))

# 4-3

- test.txtを形態素解析し、単語をカウントする
- 一般名詞の中で頻度が高い順に１０番目までを、その頻度とともに求める

In [0]:
import MeCab
import codecs

mecab = MeCab.Tagger('-Ochasen')
# Parse an empty string before parseToNode. NOTE(review): presumably the
# usual workaround for empty node surfaces in the Python binding -- confirm.
mecab.parse('')

# Read the whole test file (decoded as Shift-JIS) and start node traversal.
with codecs.open(test_text, 'r', 'shift-jis') as f:
    s = f.read()
    node = mecab.parseToNode(s)

origin = []        # surface form of every node
parts = []         # first feature field of every node
general_noun = []  # surfaces whose first two feature fields are 名詞 / 一般
while node:
    surface = node.surface
    features = node.feature.split(',')
    origin.append(surface)
    parts.append(features[0])
    # The second field is only inspected once the first one matched.
    if features[0] == '名詞' and features[1] == '一般':
        general_noun.append(surface)
    node = node.next

print(origin)
print(parts)

In [0]:
import collections

# Frequency of each value in parts.
c = collections.Counter(parts)
# Frequency of each surface form collected in general_noun.
g = collections.Counter(general_noun)
print(c)
print(g)