In [1]:
import glob
import os
import pickle
import re
import shutil
import sys
import tempfile
from os.path import basename, splitext

import numpy as np
import pandas as pd
import xmltodict
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import shuffle

sys.path.append('../')

from labov import datasets, classifier, evaluation

  return f(*args, **kwds)


In [2]:
def read_data(files, json_path='../../Data/gxg.json'):
    """Parse GxG corpus files into a DataFrame with one row per document.

    Each file is expected to be a sequence of
    ``<doc id="..." genre="..." gender="...">text</doc>`` entries.

    Parameters
    ----------
    files : iterable of str
        Paths of the corpus files to read (decoded as UTF-8).
    json_path : str or None, optional
        Where to dump the parsed frame as JSON lines. The default keeps the
        historical side effect of writing ``../../Data/gxg.json``; pass
        ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        Columns ``meta`` (the raw ``<doc ...>`` header), ``text`` (document
        body with the header removed and surrounding whitespace stripped),
        ``gender`` (``'M'``/``'F'``, NaN when the header says ``?``),
        ``genre`` and ``docid`` (both strings, NaN when not found).

    Raises
    ------
    AssertionError
        If a file does not contain exactly one recognised header per
        ``</doc>`` terminator.
    """
    header_re = re.compile(r'<doc id="[0-9]+" genre="[a-zA-Z]+" gender="[MF?]">')

    records = []
    for genre_file in files:
        with open(genre_file, encoding='utf-8') as f:
            chunks = f.read().split('</doc>')

        n_docs = 0
        for chunk in chunks:
            m = header_re.search(chunk)
            if m is not None:
                n_docs += 1
                meta = m.group(0)
                # Strip the header here rather than by assigning to rows
                # yielded by DataFrame.iterrows(), which may be copies.
                records.append((meta, chunk.replace(meta, '').strip()))
        # Splitting on '</doc>' leaves one trailing piece after the last
        # document, so a well-formed file has len(chunks) - 1 documents.
        assert n_docs == len(chunks) - 1, \
            "{}: {} headers for {} '</doc>' terminators".format(
                splitext(basename(genre_file))[0], n_docs, len(chunks) - 1)

    # Explicit columns keep the frame well-formed even when nothing was read.
    df = pd.DataFrame(records, columns=['meta', 'text'])
    # Read gender from its own attribute so capital letters elsewhere in the
    # header can never be mistaken for the label.
    df['gender'] = df['meta'].str.extract(r'gender="(M|F)"', expand=False)
    df['genre'] = df['meta'].str.extract(
        r'(twitter|youtube|diary|journalism|children)', expand=False)
    df['docid'] = df['meta'].str.extract(r'id="([0-9]+)"', expand=False)

    if json_path is not None:
        df.to_json(json_path, orient='records', lines=True)
    return df

def train_model(df):
    """Fit the n-gram classifier on the texts and gender labels of ``df``.

    ``df.text`` and ``df.gender`` are shuffled together with a fixed seed
    (``random_state=42``) and passed to ``classifier.ngram.fit``; whatever
    that call returns is handed back to the caller.
    """
    texts, labels = shuffle(df.text, df.gender, random_state=42)
    return classifier.ngram.fit(texts, labels)

In [3]:
def run_and_write(train_file, test_file, category, setting):
    """Train on the training file(s), report CV results, and write test predictions.

    Parameters
    ----------
    train_file : path or list/tuple of paths
        Corpus file(s) to train on. A single path is still accepted; a list
        or tuple trains on all of the listed files together.
    test_file : path or list/tuple of paths
        Corpus file(s) to predict.
    category : str
        Category code used in the output file names (e.g. ``"CH"``).
    setting : str
        Setting label used in the output file names (e.g. ``"IN"``).

    Side effects
    ------------
    * Pickles the fitted model to ``"<category> <setting>.pickle"``.
    * Prints a sample of both frames and the result of ``evaluation.run`` on
      10-fold cross-validated predictions over the training data.
    * Writes one ``docid<TAB>prediction`` line per test document to
      ``CapetownMilanoTirana_<setting>_<category>_1``.
    """
    # Anything that is not a list/tuple is treated as one path, which keeps
    # the original single-file call signature working unchanged.
    train_files = list(train_file) if isinstance(train_file, (list, tuple)) else [train_file]
    test_files = list(test_file) if isinstance(test_file, (list, tuple)) else [test_file]

    train_data = read_data(train_files)
    # sample(5) raises on frames with fewer than five rows; cap the size.
    print(train_data.sample(min(5, len(train_data))))
    test_data = read_data(test_files)
    print(test_data.sample(min(5, len(test_data))))

    model = train_model(train_data)

    with open("{} {}.pickle".format(category, setting), "wb") as f:
        pickle.dump(model, f)

    y_pred = cross_val_predict(model, train_data.text, train_data.gender, cv=10, n_jobs=-1)
    print("done!")
    print(evaluation.run(train_data.gender, y_pred))

    preds = model.predict(test_data.text)
    result_string = '\n'.join(
        '{}\t{}'.format(docid, pred) for docid, pred in zip(test_data.docid, preds))
    fname = "CapetownMilanoTirana_{}_{}_1".format(setting, category)
    with open(fname, "w") as f:
        f.write(result_string)

In [4]:
%%time
# IN setting
training_data = glob.glob('../../Data/Training/*')
test_data = glob.glob('../../Data/Test/*')
categories = ["CH", "JO", "TW", "YT", "DI"]

for i in range(5):
    train_file = training_data[i]
    test_file = test_data[i]
    category = categories[i]
    print("training on {}, testing on {}, category {}".format(train_file, test_file, category))
    
    run_and_write(train_file, test_file, category, "IN")

training on ../../Data/Training/GxG_Children.txt, testing on ../../Data/Test/GxG_Children.txt, category CH
                                           meta  \
128  <doc id="129" genre="children" gender="F">   
118  <doc id="119" genre="children" gender="F">   
44    <doc id="45" genre="children" gender="M">   
194  <doc id="195" genre="children" gender="M">   
109  <doc id="110" genre="children" gender="F">   

                                                  text gender     genre docid  
128  Cosa rappresenta per te la lettura di un bel l...      F  children   129  
118  Ciao, vorrei dire buongiorno ma sarebbe ipocri...      F  children   119  
44   Il fatto che mi ha colpito di più è quando ho ...      M  children    45  
194  Ognuno entra in quotidiano contatto con altre ...      M  children   195  
109  Il film parla di un giovane irlandese di nome ...      F  children   110  
                                           meta  \
134  <doc id="135" genre="children" gender="?">   
152 

In [5]:
%%time

# OUT setting
training_data = glob.glob('../../Data/Training/*')
test_data = glob.glob('../../Data/Test/*')
categories = ["CH", "JO", "TW", "YT", "DI"]

for i in range(5):
    train_d = training_data[:]
    del train_d[i]
    print(train_d)
    test_file = test_data[i]
    category = categories[i]
    print("training on {}, testing on {}, category {}".format(train_d, test_file, category))
    
    run_and_write(train_file, test_file, category, "OUT")

['../../Data/Training/GxG_Twitter.txt', '../../Data/Training/GxG_Diary.txt', '../../Data/Training/GxG_Journalism.txt', '../../Data/Training/GxG_YouTube.txt']
training on ['../../Data/Training/GxG_Twitter.txt', '../../Data/Training/GxG_Diary.txt', '../../Data/Training/GxG_Journalism.txt', '../../Data/Training/GxG_YouTube.txt'], testing on ../../Data/Test/GxG_Children.txt, category CH
                                            meta  \
3685  <doc id="4953" genre="youtube" gender="M">   
3046   <doc id="820" genre="youtube" gender="M">   
4263  <doc id="6827" genre="youtube" gender="M">   
1573   <doc id="891" genre="youtube" gender="F">   
1109  <doc id="3338" genre="youtube" gender="M">   

                                                   text gender    genre docid  
3685      zoccola perche' non l'hai detto in parlamento      M  youtube  4953  
3046  Vabbè non mi sembra qs granché, già visto e se...      M  youtube   820  
4263                almeno bersani è un po' autocritico      