In [35]:
from spacy.en import English

In [2]:
import xmltodict
import logging
import glob
import os
import pandas as pd
import sys
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


# Main dataset loading utility
class PanDataLoader:
    
    def __init__(self, logger=None):
        if logger is None:
            logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
            self.log = logging.getLogger(__name__)
        else:
            self.log = logger
                               
    def load_17(self, directory):
            
        """Load and return the pan17 gender and variation twitter dataset.
        ==============                                      ==============
        Samples total                                                10800
        Targets            nominal [{male, female},
                                    {ar, pt, es, en},
                                    {'brazil', 'australia', 'venezuela',
                                     'portugal', 'great britain', 'chile',
                                     'levantine', 'egypt', 'colombia',
                                     'peru', 'ireland', 'argentina',
                                     'maghrebi', 'mexico', 'new zealand',
                                     'spain', 'canada', 'gulf'}]
        ==============                                      ==============
        Parameters
        ----------
        inputdir
        The directory containing the training data, i.e. /data/training.

        Returns
        -------
        data : Pandas dataframe
            The interesting attributes are:
            'text', the data to learn, ['gender','lang', variety],
            the regression targets,
        Examples
        --------
        >>> from datasets import load_pan17
        >>> df_training = load_pan17(inputdir)
        >>> print(df_training.corpus.shape)
        (10800, 5)
        """

        X_docs = glob.glob(os.path.join(directory, '*.xml'), recursive=True)
        Y_doc = os.path.join(directory, 'truth.txt')
        # check that the dataset is loaded correctly

        X_tmp = []
        for t in X_docs:
            with open(t) as f:
                doc = xmltodict.parse(f.read())
            author = os.path.splitext(os.path.basename(t))[0]
            lang = doc['author']['@lang']
            text = doc['author']['documents']['document']
            X_tmp.append((author, lang, text))

        text = pd.DataFrame(X_tmp, columns=["author", "lang", "text"])

        Y_tmp = pd.read_csv(Y_doc,
                             sep='\:\:\:',
                             names=['author', 'gender', 'variety'],
                             engine='python')

        corpus = pd.merge(text, Y_tmp, on='author')
        return corpus
    
    def load_16(self, directory):
        return self.load_14(directory)
    
    def load_15(self, directory):
        X_docs = glob.glob(os.path.join(directory, '*.xml'), recursive=True)
        Y_doc = os.path.join(directory, 'truth.txt')
        X_tmp = []
        for t in X_docs:
            with open(t) as f:
                doc = xmltodict.parse(f.read())
            author = os.path.splitext(os.path.basename(t))[0]
            lang = doc['author']['@lang']
            text = doc['author']['document']
            # print(author, lang, text[:100])
            X_tmp.append((author, lang, text))

        text = pd.DataFrame(X_tmp, columns=["author", "lang", "text"])

        Y_tmp = pd.read_csv(Y_doc,
                             sep='\:\:\:',
                             names=['author', 'gender', 'age', '1','2','3','4', '5'],
                             engine='python') 


        corpus = pd.merge(text, Y_tmp, on='author')
        return corpus
    
    
    def load_14(self, directory):
        errors = 0
        X_docs = glob.glob(os.path.join(directory, '*.xml'), recursive=True)
        Y_doc = os.path.join(directory, 'truth.txt')
        X_tmp = []
        for t in X_docs:
            with open(t) as f:
                try:
                    doc = xmltodict.parse(f.read())
                except Exception as e:
                    self.log.warning(e)
                    self.log.warning("Skipping: {}".format(t))
                    continue
            author = os.path.splitext(os.path.basename(t))[0]
            lang = doc['author']['@lang']
            text = []
            for td in doc['author']['documents']['document']:
                try:
                    t = BeautifulSoup(td['#text'], "lxml").getText()
                    text.append(t)
                except Exception as e:
                    errors += 1
                    # log.warning(e)
                    # self.log.warning("skipping {}".format(td))
                    continue
            X_tmp.append((author, lang, text))

        text = pd.DataFrame(X_tmp, columns=["author", "lang", "text"])

        Y_tmp = pd.read_csv(Y_doc,
                             sep='\:\:\:',
                             names=['author', 'gender', 'age'],
                             engine='python') 

        self.log.warning("Skipped {}".format(errors))

        corpus = pd.merge(text, Y_tmp, on='author')
        return corpus
    
    def _load_all(self, loader_func, directories):
        """Concatenate across languages"""
        corpora = []
        for dr in directories:
            corpus = loader_func(dr)
            corpora.append(corpus)
        return pd.concat(corpora)
    
    def load_all_17(self, directories):
        return self._load_all(self.load_17, directories)
    
    def load_all_16(self, directories):
        return self._load_all(self.load_16, directories)
    
    def load_all_15(self, directories):
        return self._load_all(self.load_15, directories)
    
    def load_all_14(self, directories):
        return self._load_all(self.load_14, directories)
    
    def clean_and_normalize(self, corpus):
        """Standardize to lowercase for gender and langauge, m/f for gender
           Remove personality scores"""
        # FIXME TODO -- how do you do this in place?
        # FIXME TODO -- normalize age ranges?
        corpus['gender'] = corpus['gender'].apply(lambda s: s[0].lower())
        corpus['lang'] = corpus['lang'].apply(lambda s: s.lower())

        for c in ['1', '2', '3', '4', '5']:
            if c in corpus:
                del corpus[c]
        return corpus


In [5]:
pan17_dir = "/root/pan17-author-profiling-training-dataset-2017-03-10/"

pdl = PanDataLoader()
corpus = pdl.load_17("/root/pan17-author-profiling-training-dataset-2017-03-10/en")

In [7]:
corpus['text'] = corpus['text'].apply(lambda x: "\n".join(x))

In [16]:
corpus

Unnamed: 0,author,lang,text,gender,variety
0,a44dc279880378a895a6cbaabf927a5,en,#BornThisWay #cringe #notsorry #proud #LoveTru...,male,ireland
1,cfe998a7157a4eebf32a4f5d4f66cd0a,en,Developing movement: a march of scientists on ...,male,united states
2,3f4181a9b86a6c3c7e552e8015724e36,en,"Cate Blanchett, Peter Capaldi, Kit Harington, ...",male,united states
3,e58540ad4a155d1fe48b87fa8cae2d9f,en,Problems You're Having With Your #SalesFunnel ...,male,canada
4,dfa2664285e6cd59926c5017ff814877,en,@thewrongdonna Yay! My kid is almost 7m and I ...,female,ireland
5,d3383f12e6f5af0707e71da5eb99bc2,en,Can't wait for Belfast on Saturday 💁🏼😅 excited...,female,ireland
6,d7bee78f1974ed2d72dbfc4b66d9f445,en,@RTE_GUIDE @icarustheatre @BGETheatre #win To ...,female,ireland
7,57bfc594b7942f4e4e6083baacdd197b,en,@Greeneil83 decent game this!\n@albioncoaching...,male,new zealand
8,f988ad1bfcbca96341737a17c22cdba1,en,All the usual suspects! \n#EatRealFood https:/...,male,ireland
9,ec07cbdbb291b13fc47376c38c1b62ac,en,@bogdanoviclab is a terrific mid career scient...,male,australia


In [12]:
from spacy.en import English
nlp = English()
processed_texts = [nlp(text) for text in corpus['text']]

2017-04-19 14:44:24,445 - pip.vcs - DEBUG - Registered VCS backend: git
2017-04-19 14:44:24,461 - pip.vcs - DEBUG - Registered VCS backend: hg
2017-04-19 14:44:24,490 - pip.vcs - DEBUG - Registered VCS backend: svn
2017-04-19 14:44:24,492 - pip.vcs - DEBUG - Registered VCS backend: bzr


In [13]:
gender_labels = [0 if g == 'female' else 1 for g in corpus['gender']]
gender_labels[:10]

[1, 1, 1, 1, 0, 0, 0, 1, 1, 1]

In [14]:
vs = set(list(corpus['variety']))
var_to_idx = {v: i for i,v in enumerate(vs)}
idx_to_var = {i: v for i,v in enumerate(vs)}
var_labels = [var_to_idx[v] for v in corpus['variety']]

In [15]:
var_labels[:11]

[0, 5, 5, 1, 0, 0, 0, 4, 0, 2, 4]

In [22]:
import numpy as np
Xs = np.array([x.vector for x in processed_texts])
ys = gender_labels
len(ys)

3600

In [30]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from statistics import mean

svm = LinearSVC()

gender_score = cross_val_score(svm, Xs, gender_labels, cv=5)
print(gender_score)
print(mean(gender_score))


[ 0.75833333  0.775       0.825       0.80138889  0.79583333]
0.791111111111


In [28]:
var_score = cross_val_score(svm, Xs, var_labels, cv=5)
print(var_score)
print(mean(var_score))

In [61]:
def pad(lst, target_length, pad_char=0):
    if len(lst) == target_length:
        return lst
    if len(lst) < target_length:
        diff = target_length - len(lst)
        padding = [pad_char] * diff
        return lst + padding
    else:
        return lst[:target_length]

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

word_vectorizer = TfidfVectorizer(ngram_range=(1,5))

In [56]:
def get_str_tags(spacy_doc):
    return " ".join([word.tag_ for word in spacy_doc])

stags = [get_str_tags(text) for text in processed_texts]

In [57]:
tagXs = word_vectorizer.fit_transform(stags)

In [60]:
tag_tfidf_score = cross_val_score(svm, tagXs, gender_labels, cv=5)
print(tag_tfidf_score)
print(mean(tag_tfidf_score))

[ 0.70138889  0.70694444  0.74861111  0.70138889  0.70972222]
0.713611111111


In [62]:
unigram_vectorizer = TfidfVectorizer()
unigramXs = unigram_vectorizer.fit_transform(corpus['text'])

In [63]:
unigramXs.shape

(3600, 378307)

In [65]:
uni_score = cross_val_score(svm, unigramXs, gender_labels, cv=5)
print(uni_score)
print(mean(uni_score))

[ 0.79166667  0.78472222  0.8         0.80833333  0.80416667]
0.797777777778


In [68]:
# sanity check with shuffled labels
from random import shuffle
sh_gender_labels = gender_labels[:]
shuffle(sh_gender_labels)
print(sh_gender_labels[:10], gender_labels[:10])
uni_score = cross_val_score(svm, unigramXs, sh_gender_labels, cv=5)
print(uni_score)
print(mean(uni_score))

[1, 0, 1, 0, 1, 1, 0, 1, 1, 0] [1, 1, 1, 1, 0, 0, 0, 1, 1, 1]
[ 0.52222222  0.50138889  0.5         0.50277778  0.48472222]
0.502222222222


In [139]:
char_vectorizer = TfidfVectorizer(ngram_range=(1,7), analyzer='char')
charXs = char_vectorizer.fit_transform(corpus['text'])
char_score = cross_val_score(svm, charXs, gender_labels, cv=5)
print(char_score)
print(mean(char_score))

[ 0.76666667  0.76805556  0.80138889  0.79722222  0.79305556]
0.785277777778


In [142]:
from sklearn.pipeline import FeatureUnion

char_word_vectorizer = FeatureUnion([
    ('word', unigram_vectorizer),
    ('char', char_vectorizer)
])

cwXs = char_word_vectorizer.fit_transform(corpus['text'])

In [143]:
cw_score = cross_val_score(svm, cwXs, gender_labels, cv=5)
print(cw_score)
print(mean(cw_score))

[ 0.80138889  0.7875      0.80972222  0.82222222  0.81944444]
0.808055555556


In [70]:
import textacy

2017-04-19 15:33:27,630 - root - INFO - Generating grammar tables from /usr/lib/python3.5/lib2to3/Grammar.txt
2017-04-19 15:33:27,662 - root - INFO - Generating grammar tables from /usr/lib/python3.5/lib2to3/PatternGrammar.txt


In [71]:
from textacy.text_stats import TextStats

In [72]:
stats = TextStats(processed_texts[0])

2017-04-19 15:34:50,960 - textacy.data - INFO - Loading "en" language hyphenator


In [94]:
def get_stats(spacy_doc):
    stats = TextStats(spacy_doc)
    '''
            >>> ts.basic_counts
        {'n_chars': 685,
         'n_long_words': 43,
         'n_monosyllable_words': 90,
         'n_polysyllable_words': 24,
         'n_sents': 6,
         'n_syllables': 214,
         'n_unique_words': 80,
         'n_words': 136}
        >>> ts.readability_stats
        {'automated_readability_index': 13.626495098039214,
         'coleman_liau_index': 12.509300816176474,
         'flesch_kincaid_grade_level': 11.817647058823532,
         'flesch_readability_ease': 50.707745098039254,
         'gulpease_index': 51.86764705882353,
         'gunning_fog_index': 16.12549019607843,
         'lix': 54.28431372549019,
         'smog_index': 14.554592549557764,
         'wiener_sachtextformel': 8.266410784313727}'''
    return stats.n_unique_words

statsXs = [get_stats(text) for text in processed_texts]








In [128]:
svma = LinearSVC()
stats_score = cross_val_score(svm, statsXs, gender_labels, cv=5)
print(stats_score)
print(mean(stats_score))

[ 0.49722222  0.49861111  0.49861111  0.5         0.5       ]
0.498888888889


In [113]:
mstats = [x for i,x in enumerate(statsXs) if gender_labels[i] == 1]
fstats = [x for i,x in enumerate(statsXs) if gender_labels[i] == 0]

In [133]:
for i in range(len(fstats[0])):
    avg_female = mean([x[i] for x in fstats])
    avg_male = mean([x[i] for x in mstats])
    diff = avg_female - avg_male
    perc = diff/(mean([avg_female, avg_male]))
    print("{}\t {}\t {}\t {}".format(diff, abs(perc), avg_male, avg_female))

-146.64499999999953	 0.018993109267635803	 7794.280555555555	 7647.635555555556
-3.9849999999999994	 0.03555011485764838	 114.08777777777777	 110.10277777777777
-4.436111111111131	 0.004502265651716365	 987.5244444444445	 983.0883333333334
-21.204999999999927	 0.014820319784829826	 1441.4083333333333	 1420.2033333333334
-13.353333333333296	 0.03711911442898234	 366.4194444444444	 353.0661111111111
-40.35944444444476	 0.01843120266343794	 2209.9144444444446	 2169.555
-5.150000000000006	 0.026919268330221333	 193.88777777777779	 188.73777777777778
-27.392777777777837	 0.040724058513733424	 686.34	 658.9472222222222
0.7098574304387295	 0.053839555769243264	 12.829753444514338	 13.539610874953068
0.14774930274732956	 0.025477536372085902	 5.725324320988866	 5.873073623736196
-0.10305867392933443	 0.0074493728034612265	 13.886073082909284	 13.78301440897995
1.1122255134232617	 0.02561896397668125	 42.85803601353627	 43.97026152695953
0.12949129878162857	 0.01146855613036804	 11.226239659452

In [145]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(unigramXs, gender_labels, test_size=0.5)

unigram_svm = LinearSVC()
unigram_svm.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [146]:
preds = unigram_svm.predict(X_test)

In [149]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, preds)

0.78666666666666663

In [150]:
new_corpus = pdl.load_17("/root/pan17-author-profiling-training-dataset-2017-03-10/en")

In [151]:
new_corpus = new_corpus[new_corpus['lang'] == 'en']

In [162]:
all_tweet_texts = []
all_tweet_gender = []
tweet_user_lookup = {}

for i in range(len(new_corpus)):
    user = new_corpus.iloc[i]
    gender = 0 if user.gender == "female" else 1
    labels = [gender] * len(user.text)
    all_tweet_texts += user.text
    all_tweet_gender += labels


In [189]:
n_gender_labels = [0 if g == 'female' else 1 for g in new_corpus['gender'] ]

In [166]:
unigramByTweetXs = unigram_vectorizer.fit_transform(all_tweet_texts)

In [167]:
uniBTscore = cross_val_score(svm, unigramByTweetXs, all_tweet_gender, cv=5)

In [171]:
from sklearn.model_selection import cross_val_predict
byTweetPreds = cross_val_predict(svm, unigramByTweetXs, all_tweet_gender, cv=5)

In [180]:
from collections import Counter
Counter(byTweetPreds[0:100]).most_common()[0][0]

0

In [182]:
def chunks(l, n):
    # http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [185]:
chunked_preds = list(chunks(byTweetPreds, 100))

In [187]:
maj_predictions = [Counter(c).most_common()[0][0] for c in chunked_preds]

In [190]:
accuracy_score(maj_predictions, n_gender_labels)

0.78027777777777774

In [None]:
from sklearn.svm import SVC
prob_svm = SVC(kernel='linear', probability=True)
proba = cross_val_predict(prob_svm, unigramByTweetXs, all_tweet_gender, cv=5, method='predict_proba', njobs=-1)