In [2]:
import numpy as np
import pandas as pd
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import pymorphy2
from nltk.stem import WordNetLemmatizer

In [3]:
# HTML entity fragments that occur in the raw ``meta`` text and the plain
# characters they are replaced with.  The fragments are matched without a
# trailing ';', so a ';' that follows an entity is left in the text.
_META_ENTITY_REPLACEMENTS = (
    ('&nbsp', ' '),
    ('&quot', '"'),
    ('&laquo', '"'),
    ('&raquo', '"'),
)

try:
    _text_type = unicode  # Python 2: documents are built as unicode objects
except NameError:
    _text_type = str  # Python 3: str is already the unicode text type


def _join_meta(values):
    """Merge the ``meta`` strings of one group into a single document.

    :param values: pandas Series holding the ``meta`` cells of one group.
    :return: the non-null cells, entity-cleaned and joined with single spaces
        (an empty string when every cell is null).
    """
    cleaned = []
    # Null cells are skipped: converting NaN to text would inject the
    # literal token "nan" into the document.
    for raw in values.dropna().tolist():
        text = _text_type(raw)
        for entity, replacement in _META_ENTITY_REPLACEMENTS:
            text = text.replace(entity, replacement)
        cleaned.append(text)
    return u' '.join(cleaned)


def _collect_meta(frame, key):
    """Group ``frame`` by ``key`` and return a one-column ``meta`` DataFrame
    (indexed by the group key) with one joined document per group."""
    return frame.groupby(key)['meta'].apply(_join_meta).to_frame('meta')


def prepare_dfs(gen_file_path, in_file_path, feature):
    """Load the label and meta files and build per-group text documents.

    :param gen_file_path: path to a tab-separated file with the columns
        ``gender``, ``age`` and ``uid``.
    :param in_file_path: path to a tab-separated file that contains at least
        the columns ``uid`` and ``meta``.
    :param feature: column used to group the training rows; all ``meta``
        texts sharing one value of this column become one document.
    :return: tuple ``(gen_train_df, gen_test_df, meta_train_df, meta_test_df)``:
        ``gen_train_df`` / ``gen_test_df`` are the label rows split by whether
        both ``gender`` and ``age`` equal ``'-'`` (test) or not (train);
        ``meta_train_df`` has one ``meta`` document per ``feature`` value;
        ``meta_test_df`` has one ``meta`` document per test ``uid``.
    """
    print("Load to DataFrame gen_file_path")
    gen_df = pd.read_csv(gen_file_path,
                         encoding='utf-8',
                         sep='\t',
                         skipinitialspace=True,
                         usecols=['gender', 'age', 'uid']
                         )
    # A row belongs to the test set only when BOTH labels are '-'.
    # NOTE(review): a row with just one label missing stays in the training
    # set, so grouping by that label would create a '-' group -- confirm the
    # data never contains such rows.
    is_unlabelled = (gen_df['gender'] == '-') & (gen_df['age'] == '-')
    gen_train_df = gen_df[~is_unlabelled]
    gen_test_df = gen_df[is_unlabelled]
    gen_test_df_uids = gen_test_df['uid'].unique()

    print("Load to DataFrame meta data")
    meta_df = pd.read_csv(in_file_path,
                          encoding='utf-8',
                          sep='\t',
                          skipinitialspace=True,
                          )

    print("Preparing Train DataFrame")
    # Inner join: only meta rows of labelled users are kept.
    labelled_meta = pd.merge(meta_df, gen_train_df, on='uid', sort=False)
    meta_train_df = _collect_meta(labelled_meta, feature)

    print("Preparing Test DataFrame")
    unlabelled_meta = meta_df[meta_df['uid'].isin(gen_test_df_uids)]
    meta_test_df = _collect_meta(unlabelled_meta, 'uid')

    return gen_train_df, gen_test_df, meta_train_df, meta_test_df

In [4]:
class LemmaTokenizer(object):
    """Callable tokenizer that returns lemmas, for use as ``TfidfVectorizer(tokenizer=...)``.

    The text is split with scikit-learn's default word tokenizer.  Tokens of
    one or two characters are dropped.  Every remaining token is analysed
    with pymorphy2 and only the first parse it returns is used:

    * tag contains ``LATN``: the token is lemmatized with WordNet;
    * tag contains ``NUMB``, ``UNKN`` or ``ROMN``: the token is dropped;
    * anything else: pymorphy2's ``normal_form`` is used.
    """

    # pymorphy2 tag markers whose tokens are discarded.
    _SKIPPED_TAGS = ('NUMB', 'UNKN', 'ROMN')

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.morph = pymorphy2.MorphAnalyzer()
        # Built once here; constructing a TfidfVectorizer just to obtain its
        # tokenizer on every call repeats the same work for each document.
        self._tokenize = TfidfVectorizer().build_tokenizer()

    def __call__(self, text):
        """Return the list of lemmas extracted from ``text``.

        :param text: one document, as passed in by the vectorizer.
        :return: list of lemma strings in order of appearance.
        """
        lemmas = []
        for token in self._tokenize(text):
            if len(token) <= 2:
                continue
            first_parse = self.morph.parse(token)[0]
            tag = first_parse.tag
            if 'LATN' in tag:
                lemmas.append(self.wnl.lemmatize(token))
            elif any(marker in tag for marker in self._SKIPPED_TAGS):
                continue
            else:
                lemmas.append(first_parse.normal_form)
        return lemmas

In [5]:
# --- Configuration ---------------------------------------------------------
feature = 'age'
# NOTE(review): machine-specific absolute location -- change DATA_DIR when
# running on another machine.
DATA_DIR = '/Users/usual/PycharmProjects/npl_project01/data'
gen_file_path = DATA_DIR + '/gender_age_dataset.txt'
in_file_path = DATA_DIR + '/csv/uid_meta_fixed.csv'
# Number of matrix rows converted to dense form for the preview printed at
# the end of the cell.
PREVIEW_ROWS = 5

# --- Load and aggregate the data -------------------------------------------
print('_' * 80)
print("Cooking data for feature '%s'" % feature)
t0 = time()

gen_train_df, gen_test_df, meta_train_df, meta_test_df = prepare_dfs(gen_file_path, in_file_path, feature)

command_time = time() - t0
print("cooking data time: %0.3fs" % command_time)

# --- Vectorizing: train ----------------------------------------------------
print('_' * 80)
print("Vectorizing")
print("Train")
t0 = time()

# The index of meta_train_df holds the values of `feature`, one per document.
y_train = meta_train_df.index.values
corpus_train = meta_train_df['meta'].tolist()
stop_words = stopwords.words('english') + stopwords.words('russian') + ['ru', 'com']
vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words=stop_words)  # ngram_range=(1, 2)
x_train = vectorizer.fit_transform(corpus_train)
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); switch when upgrading.
feature_names = np.array(vectorizer.get_feature_names())

command_time = time() - t0
print("vectorizing train time: %0.3fs" % command_time)

# TODO: optional chi2 feature selection (SelectKBest on x_train / y_train,
# then transform x_test) was sketched here but is not enabled.

# --- Vectorizing: test -----------------------------------------------------
print("Test")
t0 = time()
# The index of meta_test_df holds the uids of the test users (not labels).
y_test = meta_test_df.index.values
corpus_test = meta_test_df['meta'].tolist()
x_test = vectorizer.transform(corpus_test)

command_time = time() - t0
print("vectorizing test time: %0.3fs" % command_time)

# --- Preview ---------------------------------------------------------------
# Only the first PREVIEW_ROWS rows are densified: calling toarray() on the
# whole sparse TF-IDF matrix allocates rows x vocabulary floats just to
# print a summary.
print(x_train.shape)
print(x_train[:PREVIEW_ROWS].toarray())

print(x_test.shape)
print(x_test[:PREVIEW_ROWS].toarray())

________________________________________________________________________________
Cooking data for feature 'age'
Load to DataFrame gen_file_path


Load to DataFrame meta data


Preparing Train DataFrame


Preparing Test DataFrame


cooking data time: 15.870s
________________________________________________________________________________
Vectorizing
Train
