In [34]:
import numpy as np
import pandas as pd
from collections import Counter
import pickle
import string
import matplotlib.pyplot as plt
%matplotlib inline
import re

In [None]:
# Stand-alone gender letters: an 'f' / 'm' that has, on each side, either a
# character other than a lowercase ASCII letter or a string boundary.
# The leading delimiter is captured as group 1 and is part of the match, so
# match.start() points at that delimiter when one is present and at the letter
# itself when the letter opens the string (group 1 is then empty).
f_regex = re.compile('([^a-z]|^|$)f([^a-z]|^|$)')
m_regex = re.compile('([^a-z]|^|$)m([^a-z]|^|$)')
# Disabled alternative number pattern (signs, decimals, thousands separators,
# exponents), kept for reference:
# num_regex = re.compile('[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?')
# Active number pattern: any run of ASCII digits.
num_regex = re.compile('[0-9]+')

In [None]:
def extract_gender(title):
    """Guess the poster's gender from a post title.

    Looks first for a stand-alone 'f' or 'm' (via the module-level ``f_regex``
    and ``m_regex``), then falls back to the words 'female' / 'male'.

    Parameters
    ----------
    title : str
        Post title. The patterns only treat lowercase a-z as letters, so the
        title is expected to be lower-cased by the caller.

    Returns
    -------
    tuple
        ``(gender, position)`` where ``gender`` is one of ``'female'``,
        ``'male'``, ``'unicorn'`` (conflicting or repeated markers) or
        ``'unknown'``, and ``position`` is the index in ``title`` of the marker
        that decided the result (0 for ``'unicorn'`` and ``'unknown'``).
    """
    # Each match starts at the delimiter captured in group 1. That group is
    # empty when the letter is the first character of the title, so skip
    # exactly its length instead of assuming one character.
    f_matches = [m.start() + len(m.group(1)) for m in f_regex.finditer(title)]
    m_matches = [m.start() + len(m.group(1)) for m in m_regex.finditer(title)]

    # Both markers present, or the same marker more than once: ambiguous.
    if (f_matches and m_matches) or len(m_matches) > 1 or len(f_matches) > 1:
        return ('unicorn', 0)
    if f_matches:
        return ('female', f_matches[0])
    if m_matches:
        return ('male', m_matches[0])

    # Word fallback. 'female' must be tested first because it contains 'male'.
    female_pos = title.find('female')
    if female_pos != -1:
        return ('female', female_pos)
    male_pos = title.find('male')
    if male_pos != -1:
        return ('male', male_pos)
    return ('unknown', 0)

In [182]:
def extract_age(title, gender_res, age_min=18, age_max=80):
    """Pick the most likely age mentioned in a post title.

    Every digit run found by the module-level ``num_regex`` is a candidate.
    Candidates outside ``[age_min, age_max]`` are discarded, and of the
    remaining ones the number closest to the gender marker is returned.

    Parameters
    ----------
    title : str
        Post title to scan.
    gender_res : tuple
        ``(gender, position)`` as returned by ``extract_gender``; only the
        position is used.
    age_min, age_max : int, optional
        Inclusive bounds for a number to count as an age (defaults 18 and 80).

    Returns
    -------
    int or float
        The selected age, or ``np.nan`` when the title holds no number within
        the bounds.
    """
    _, gender_pos = gender_res

    # (value, start index) for every digit run in the title.
    candidates = [(int(m.group()), m.start()) for m in num_regex.finditer(title)]
    plausible = [(num, idx) for num, idx in candidates if age_min <= num <= age_max]
    if not plausible:
        return np.nan

    def distance(candidate):
        # Distance is measured from the number's start index; numbers that
        # start at or before the gender marker get one extra unit.
        idx = candidate[1]
        if idx > gender_pos:
            return idx - gender_pos
        return gender_pos - idx + 1

    # Choose among the in-range numbers only, so that a nearby year, weight or
    # score can no longer be returned as the age. min() keeps the first
    # candidate on ties.
    return min(plausible, key=distance)[0]

In [234]:
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# The context manager closes the file handle once loading is done.
with open('data/posts_df.dat', 'rb') as posts_file:
    posts = pickle.load(posts_file)
# Index the rows by their creation timestamp.
posts.index = posts.created
# Keep a snapshot of the frame as loaded and indexed.
posts_b = posts.copy()

In [235]:
# Normalise titles in one pass: lower-case them, then strip apostrophes.
posts['title'] = posts['title'].str.lower().str.replace('\'', '')

In [236]:
# Classify every title once; each result is a (gender, position) tuple.
gender_res = posts['title'].apply(extract_gender)
posts['gender'] = gender_res.apply(lambda res: res[0])
# Drop titles with conflicting or repeated gender markers. The explicit copy
# makes the filtered frame independent of the original, so adding columns to
# it later does not trigger SettingWithCopyWarning.
posts = posts[posts.gender != 'unicorn'].copy()

In [237]:
# Only the title is needed, so map over that column instead of building a
# full row object per post with DataFrame.apply(axis=1).
posts['age'] = posts['title'].apply(
    lambda title: extract_age(title, extract_gender(title))
)
# posts.age = posts.apply(lambda row: extract_age(row['title'], gender_res.loc[row.created]), axis=1)

### check manually that age is below 100 at this point

In [None]:
# # debug
# data = new_posts.sample(100).title.values
# for title in data:
#     gender_res = extract_gender(title)
#     extract_age(title, gender_res)

In [239]:
# Eyeball a random selection of 100 rows to spot-check the extracted columns.
posts.sample(n=100)

Unnamed: 0_level_0,author,author_has_flair,created,edited,gilded,height,id,is_self,over_18,permalink,score,selftext,thumbnail_height,thumbnail_width,title,width,gender,age
created,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2015-12-15 05:23:54,Yoursernamedo_esntfi,False,2015-12-15 05:23:54,False,False,,3wwfoi,False,False,/r/Rateme/comments/3wwfoi/19m_been_isolated_si...,1,,,,[19m] been isolated since a recent move starti...,,male,19
2014-01-19 14:08:43,MySpareAcct,False,2014-01-19 14:08:43,False,False,550,1vlcyc,False,False,/r/Rateme/comments/1vlcyc/23m_im_curious_whatt...,0,,594,261,[23/m] im curious. whatta you think?,550,male,23
2015-08-23 18:06:10,dope_pete,False,2015-08-23 18:06:10,False,False,,3i3pcw,False,False,/r/Rateme/comments/3i3pcw/m20_rate_me_people/,0,,,,[m20] rate me people,,male,20
2015-05-24 21:57:04,Farshad040,False,2015-05-24 21:57:04,False,False,,374trz,True,False,/r/Rateme/comments/374trz/rate_m19_curious/,0,hey heres me rate me and be honest http://imgu...,,,rate (m19) curious!,,male,19
2014-05-22 15:26:59,iAmSane,False,2014-05-22 15:26:59,True,False,,267s0a,True,False,/r/Rateme/comments/267s0a/m_18_improvements_wo...,2,So i've lost some weight and would like Reddit...,,,m 18 improvements would be nice!,,male,18
2017-09-11 13:55:14,Sunny2456,False,2017-09-11 13:55:14,False,False,550,6zfj59,False,False,/r/Rateme/comments/6zfj59/m21_i_always_wanted_...,3,,315,600,[m21] i always wanted to know - glasses or no ...,550,male,21
2013-06-08 19:59:53,number1q,False,2013-06-08 19:59:53,False,False,,1fxxin,True,False,/r/Rateme/comments/1fxxin/bored_rate_me_m_22/,0,http://oi40.tinypic.com/ta4yt3.jpg\nhttp://oi4...,,,"bored, rate me [m, 22]",,male,22
2016-12-20 10:57:52,My___Legacy,False,2016-12-20 10:57:52,False,False,550,5jcow4,False,False,/r/Rateme/comments/5jcow4/m18_so_am_i_ugly_or_...,3,,315,600,[m18] so am i ugly or what. never considered m...,550,male,18
2017-11-21 19:26:42,Pi_Arc,False,2017-11-21 19:26:42,False,False,550,7ekfgk,False,False,/r/Rateme/comments/7ekfgk/my_boyfriend_19m_wan...,0,,315,600,my boyfriend [19m] wants your opinion.,550,male,19
2015-03-01 09:49:35,helpfull,False,2015-03-01 09:49:35,False,False,,2xjsrz,True,False,/r/Rateme/comments/2xjsrz/24m_kinda_shoddy_con...,1,Like the title says I'm kinda hoping for an ho...,,,(24m) kinda shoddy confidence. please give me ...,,male,24


In [241]:
# Persist the annotated frame. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('data/posts_with_age_gender.dat', 'wb') as out_file:
    pickle.dump(posts, out_file)

In [178]:
# for title in data:
#     print(title)
#     gender_res = extract_gender(title)
#     print(gender_res)
#     print(extract_age(title, gender_res))
#     print()

In [179]:
# posts.resample('D').mean().is_self.plot(figsize=(20,10))

In [180]:
# posts.resample('D').mean().score.plot(figsize=(20,10))

In [6]:
# ser = pd.Series(1, index=dates)
# ser.resample('D').sum()

In [181]:
# print(posts[~posts.over_18].score.mean())
# print(posts[posts.over_18].score.mean())