In [57]:
import numpy as np
import pandas as pd
import pymorphy2
import os

In [58]:
import re
import codecs
import string
import math
from nltk.tokenize import TreebankWordTokenizer

In [59]:
morph = pymorphy2.MorphAnalyzer()
tokenizer = TreebankWordTokenizer()

RUS_LETTERS = u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
ALL_LETTERS_SET = set(list(RUS_LETTERS))  # Может удалять английские символы? #


def text_to_words(text_text):
    """
    Нормализация текста, преобразование текста твита в склеенные пробелом леммы.
    """
    # 1. Все буквы в нижний регистр
    text_text = text_text.lower()

#    # 2. Удаление всех небукв
#    letters_only = ''
#    for _c in text_text:
#        if _c in ALL_LETTERS_SET:
#            letters_only += _c
#        else:
#            letters_only += ' '

    letters_only = text_text

    # 3. Заменяем множественные пробелы
    while '  ' in letters_only:
        letters_only = letters_only.replace('  ', ' ')

    # 4. Токенизация
    word_list = tokenizer.tokenize(letters_only)
    
    return word_list

In [60]:
def tokenize_and_extract(text, position):
    return text_to_words(text)[position]

In [61]:
def analyze(token):
    p = morph.parse(token)[0]
    pos = p.tag.POS
    nf = p.normal_form
    if 'PNCT' in p.tag:
        pos = 'PNCT'
    return pos, nf

In [62]:
def return_side_index(side):
    '''Возвращает -1, если это левый контекст, и 0, если правый.
    Это число потом используется как индекс, чтобы взять
    последнее слово левого или первое слово правого контекста'''
    if side == 'LT':
        return -1
    elif side == 'RT':
        return 0

In [63]:
def fill_data_with_parsed_tokens(data, side, internalData):
    sideIndex = return_side_index(side)
    for i in data.index:
        text = internalData.loc[i,side+'_con'].lower()
        try:
            token = tokenize_and_extract(text, sideIndex)
            pos, nf = analyze(token)
        except (IndexError):
            pos, nf = None, None
        data.loc[i,[side+'_normal_form', side+'_pos']] = nf, pos

In [64]:
def data_None_to_empty(data):
    data.fillna(value='empty', inplace=True)   

In [65]:
def return_internal_data_frame(df):
    internalData = pd.DataFrame(index=range(len(df)), columns=['LT_con', 'Center', 'RT_con'])
    internalData.LT_con = df.Left_context
    internalData.Center = df.Center
    internalData.RT_con = df.Concated
    return internalData

In [66]:
def process(df):
    df['Concated'] = df.Punct.str.cat(df.Right_context)
    internalData = return_internal_data_frame(df)
    data = pd.DataFrame(index=range(len(df)),
                        columns=['LT_normal_form', 'LT_pos',
                                 'RT_normal_form', 'RT_pos'])
    fill_data_with_parsed_tokens(data, 'LT', internalData)
    fill_data_with_parsed_tokens(data, 'RT', internalData)
    data_None_to_empty(data)
    return data

In [67]:
dataNE = None
for fname in os.listdir('.\\ruscorpora\\ne\\csv'):
    df = pd.read_csv(os.path.join('.\\ruscorpora\\ne\\csv', fname),
                    sep='\t')
    if dataNE is None:
        dataNE = process(df.loc[:99,:])
    else:
        dataNE = dataNE.append(process(df.loc[:99,:]), ignore_index=True)
len(dataNE)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


1000

In [69]:
dataE = None
for fname in os.listdir('.\\ruscorpora\\ne\\csv'):
    df = pd.read_csv(os.path.join('.\\ruscorpora\\ne\\csv', fname),
                    sep='\t')
    if dataE is None:
        dataE = process(df.loc[:99,:])
    else:
        dataE = dataE.append(process(df.loc[:99,:]), ignore_index=True)
len(dataE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


1000

In [70]:
dataNE['class'] = 'Not_eval'
dataE['class'] = 'Eval'
frames = [dataNE, dataE]
result = pd.concat(frames)

In [71]:
result.to_csv('result.csv', sep='\t', index=False)