In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import pickle

# Loading data

In [None]:
PATH = 'data/'
e_fname = f"{PATH}employees.tsv"
m_fname = f"{PATH}messages.tsv"
r_fname = f"{PATH}recipients.tsv"
full_fname = f"{PATH}cleaned_emails.csv"

In [None]:
# Load employee data
e_df = pd.read_csv(e_fname, sep='\t', header=None)
e_df.columns = ['employee_id', 'name', 'dept', 'long_dept', 'title', 'gender', 'seniority']
e_df.set_index('employee_id', inplace=True)

In [None]:
# Load message data
m_df = pd.read_csv(m_fname, sep='\t', header=None)
m_df.columns = ['message_id', 'fname', 'unix_time', 'subject', 'sender_employee_id']
m_df.set_index('message_id', inplace=True)

In [None]:
# Load recipient data
r_df = pd.read_csv(r_fname, sep='\t', header=None)
r_df.columns = ['message_id', 'recipient_num', 'recipient_employee_id']
r_df.set_index('message_id', inplace=True)

In [None]:
# Load full dataset
df = pd.read_csv(full_fname)
df.set_index('Unnamed: 0', inplace=True)
df.index.name = 'id'

# Merging data

In [None]:
# Filter down to only matching names
name_set = set(e_df['name'].values)
name_match_msk = df['from_name'].apply(lambda n: str(n) in name_set)
df = df[name_match_msk]

In [None]:
# Filter down to only matching recipients as well
def matching_recipient(recipient):
    """Return True if the recipient is in the name_set."""
    return any(n in str(recipient) for n in name_set)

def fix_recipient_name(recipient):
    """Return True if the recipient is in the name_set."""
    for n in name_set:
        if n in str(recipient):
            return n
    raise ValueError('Missed name')

recipient_match_msk = df['to_name'].apply(matching_recipient)
df = df[recipient_match_msk]

new_recipient_name = df['to_name'].apply(fix_recipient_name)
df['to_name'] = new_recipient_name

In [None]:
# Add in power/gender info for sender and recipient
gender_dict = {n: g for _, (n, g) in e_df[['name', 'gender']].iterrows()}
title_dict = {n: t for _, (n, t) in e_df[['name', 'title']].iterrows()}
seniority_dict = {n: s for _, (n, s) in e_df[['name', 'seniority']].iterrows()}

from_gender = [gender_dict[name] for name in df['from_name']]
to_gender = [gender_dict[name] for name in df['to_name']]
df['from_gender'] = from_gender
df['to_gender'] = to_gender

from_title = [title_dict[name] for name in df['from_name']]
to_title = [title_dict[name] for name in df['to_name']]
df['from_title'] = from_title
df['to_title'] = to_title

from_seniority = [seniority_dict[name] for name in df['from_name']]
to_seniority = [seniority_dict[name] for name in df['to_name']]
df['from_seniority'] = from_seniority
df['to_seniority'] = to_seniority

In [None]:
# Add a gender-to-gender label
gender_label_dict = {'Male-Male': 0,
                     'Male-Female': 1,
                     'Female-Female': 2,
                     'Female-Male': 3}

gender_label = [gender_label_dict[f"{g1}-{g2}"]
                for _, (g1, g2) in df[['from_gender', 'to_gender']].iterrows()]

df['gender_label'] = gender_label

# Predictions

In [None]:
# Basic counts
g_counts = Counter(e_df['gender'])
g_counts

In [None]:
# df counts
from_g_counts = Counter(df['from_gender'])
from_g_counts

In [None]:
to_g_counts = Counter(df['to_gender'])
to_g_counts

In [None]:
# VERY INTERESTING
convo_counts = Counter(df['gender_label'])
convo_counts

In [None]:
def extract_all_text(label):
    """Get all the text for a given label."""
    tmp_df = df[df['gender_label'] == label]
    return '\n'.join(txt for txt in df['raw_text']).split()

In [None]:
all_0_text = extract_all_text(0)
all_1_text = extract_all_text(1)
all_2_text = extract_all_text(2)
all_3_text = extract_all_text(3)

In [None]:
# Looking at gender-job correlation
df[['from_gender', 'from_title', 'to_gender', 'to_title']]

In [None]:
combos = [(g, t) for _, (g, t) in df[['from_gender', 'from_title']].iterrows()] + \
         [(g, t) for _, (g, t) in df[['to_gender', 'to_title']].iterrows()]

In [None]:
j_map = {c: i for i, c in enumerate(set(c[1] for c in combos))}
g_map = {'Male': 0, 'Female': 1}

In [None]:
gj_mat = np.zeros((30, 2), dtype=int)

In [None]:
gender_job = Counter(combos)

In [None]:
for gj, count in gender_job.items():
    index = (j_map[gj[1]], g_map[gj[0]])
    gj_mat[index] = count

In [None]:
print(' '*24, 'Male', ' Female')
for job, i in j_map.items():
    print(job.ljust(24), str(gj_mat[i][0]).rjust(4), str(gj_mat[i][1]).rjust(4))

# Classifying

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix

In [None]:
names = [tuple(map(lambda s: s.lower(), n.split())) for n in e_df['name'].values]

In [None]:
def clean_text(text):
    """Replace names in the text."""
    text = ' '.join(text.lower().split())
    for n in names:
        n1 = n[0]
        if len(n) == 2:
            n2 = n[1]
        else:
            if n[2] == 'jr.' or n[2] == 'iii':
                n2 = n[1]
            else:
                n2 = n[2]
        text = text.replace(n1, '<FIRST_NAME>')
        text = text.replace(n2, '<LAST_NAME>')
    return text

In [None]:
text = df['raw_text'].values
text = [clean_text(t) for t in text]

In [None]:
y = df['gender_label'].values

text_train, text_test, y_train, y_test = train_test_split(text, y, train_size=0.8, test_size=0.2)

In [None]:
vec = TfidfVectorizer(stop_words='english', max_features=2000)

X_train = vec.fit_transform(text_train)
X_test = vec.transform(text_test)

In [None]:
clf = LogisticRegression()

In [None]:
clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [None]:
print(classification_report(y_train, y_train_pred))

In [None]:
print(classification_report(y_test, y_test_pred))

In [None]:
# Inspecting coefficients
coef_dict = {}
for label in range(4):
    coef_dict[label] = {w: clf.coef_[label][i]
                        for w, i in vec.vocabulary_.items()}

coef_df = pd.DataFrame(coef_dict)

In [None]:
coef_df.sort_values(3, ascending=False)

### Now with just job titles

In [None]:
titles_text = [f"{t1} {t2}" for _, (t1, t2) in df[['from_title', 'to_title']].iterrows()]

In [None]:
titles_text_train, titles_text_test, y2_train, y2_test = train_test_split(titles_text, y, train_size=0.8, test_size=0.2)

In [None]:
vec2 = TfidfVectorizer(stop_words='english')

X2_train = vec2.fit_transform(titles_text_train)
X2_test = vec2.transform(titles_text_test)

In [None]:
clf2 = LogisticRegression()
clf2.fit(X2_train, y2_train)

y2_train_pred = clf2.predict(X2_train)
y2_test_pred = clf2.predict(X2_test)

In [None]:
print(classification_report(y2_train, y2_train_pred))

In [None]:
print(classification_report(y2_test, y2_test_pred))

# GYAFC

Predicting formality

In [106]:
PATH = 'data/GYAFC_Corpus/Family_Relationships/'
# PATH = 'data/GYAFC_Corpus/Entertainment_Music/'
f_train_formal = f"{PATH}train/formal"
f_train_informal = f"{PATH}train/informal"
f_test_formal = f"{PATH}test/formal"
f_test_informal = f"{PATH}test/informal"

In [107]:
train_formal = pd.read_csv(f_train_formal, sep='\t', header=None)
train_informal = pd.read_csv(f_train_informal, sep='\t', header=None)
test_formal = pd.read_csv(f_test_formal, sep='\t', header=None)
test_informal = pd.read_csv(f_test_informal, sep='\t', header=None)

train_formal.columns = ['text']
train_informal.columns = ['text']
test_formal.columns = ['text']
test_informal.columns = ['text']

train_formal['label'] = 1
train_informal['label'] = 0
test_formal['label'] = 1
test_informal['label'] = 0

In [108]:
train = pd.concat([train_formal, train_informal]).sample(frac=1)
test = pd.concat([test_formal, test_informal]).sample(frac=1)

In [109]:
# Build the classifier
text_train = train['text'].values
text_test = test['text'].values

y_train = train['label'].values
y_test = test['label'].values

# Vectorize sentences
vec = TfidfVectorizer(stop_words='english', max_features=2000)
X_train = vec.fit_transform(text_train)
X_test = vec.transform(text_test)

# Build the classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [110]:
print("~~TRAIN~~")
print(classification_report(y_train, y_train_pred))
print("\n~~TEST~~")
print(classification_report(y_test, y_test_pred))

~~TRAIN~~
             precision    recall  f1-score   support

          0       0.74      0.69      0.71     51967
          1       0.71      0.76      0.73     51967

avg / total       0.72      0.72      0.72    103934


~~TEST~~
             precision    recall  f1-score   support

          0       0.83      0.70      0.76      1332
          1       0.67      0.81      0.73      1019

avg / total       0.76      0.75      0.75      2351



In [111]:
clf.coef_.shape

(1, 2000)

In [112]:
# Inspecting coefficients
coef_dict = {}
coef_dict[1] = {w: clf.coef_[0][i]
                    for w, i in vec.vocabulary_.items()}

coef_df = pd.DataFrame(coef_dict)

In [115]:
coef_df.sort_values(by=1).head()

Unnamed: 0,1
dont,-9.621153
ur,-8.688117
thats,-6.921799
im,-6.437525
lol,-5.62868


In [116]:
coef_df.sort_values(by=1, ascending=False).head()

Unnamed: 0,1
inform,3.968602
aware,3.741387
significant,3.679125
attempt,3.635706
correct,3.43674


# Predicting on Vinod's data

Labeling scheme:

Sender:
    - Female = 0
    - Male = 1
    
Recipients:
    - >66% female: 0
    - 33%< and <66% female: 1
    - <33% female: 2

Combo:
    - F-F: 0
    - F-Mixed: 1
    - F-M: 2
    - M-F: 3
    - M-Mixed: 4
    - M-M: 5

In [42]:
import pickle

In [117]:
f_vinod = "data/vinod/full_data_small.pkl"
with open(f_vinod, 'rb') as f:
    df = pickle.load(f)

In [154]:
# Compute ratio of recipient's genders
def get_ratio(row):
    n_male = 0
    n_female = 0
    total = 0
    for g in row['to_gender'] + row['cc_gender']:
        if g =='M':
            n_male += 1
        elif g == 'F':
            n_female += 1
        total += 1
    
    # Return the result, correct for div zero error
    if n_male == 0 and n_female > 0:
        # All female
        return 1
    elif n_female == 0:
        # All indeterminate
        return 0
    else:
        # Some balance, or all male
        return n_female / n_male

def bin_ratios(ratio):
    if ratio > 2/3:
        return 0
    elif 1/3 < ratio < 2/3:
        return 1
    else:
        return 2
        

In [135]:
df.head()

Unnamed: 0_level_0,main_text,message_type,subject,from_id,to_ids,cc,from_gender,to_gender,cc_gender,from_employee_type,to_employee_type,cc_employee_type,to_power_rels,cc_power_rels,to_power_imds,cc_power_imds
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,So . . . you were looking for a one night stan...,,,29879,[14326],[],F,[M],[],NonEnron,[Core],[],[],[],[],[]
2,Hey there Bill!\n\nI thought I'd drop a quick ...,INITIAL,Hello!,86690,[14326],[],F,[M],[],NonEnron,[Core],[],[],[],[],[]
3,"Group,\nEES and I have not been receiving emai...",INITIAL,EES,14326,"[78305, 30341, 2983, 687, 46676, 70629, 106450...",[],M,"[F, M, M, M, M, M, M, M, M, M, M, F, M]",[],Core,"[Core, NonCore, Core, NonCore, Core, NonCore, ...",[],[],[],[],[]
4,"That is so rad Bill. I'm pretty jealous, but a...",RE,woohoo,3487,[14326],[],F,[M],[],NonCore,[Core],[],[],[],[],[]
5,Group.\nWe are short 25 mws under ST-WBOM in N...,INITIAL,Short for 05/04-05/05,14326,"[78305, 2983, 687, 46676, 70629, 106450, 6971,...",[],M,"[F, M, M, M, M, M, M, M, M, M, F, M]",[],Core,"[Core, Core, NonCore, Core, NonCore, Core, Cor...",[],[],[],[],[]


In [155]:
ratios = df.apply(get_ratio, axis=1)

In [156]:
ratio_bins = ratios.apply(bin_ratios)

In [157]:
Counter(ratio_bins)

Counter({0: 111578, 1: 17405, 2: 104439})

In [160]:
df['recipient_gender_ratio'] = ratios
df['recipient_gender_ratio_label'] = ratio_bins
df['gender_label'] = [0 if g == 'F' else 1 for g in df2['from_gender']]

In [161]:
df.head(1)

Unnamed: 0_level_0,main_text,message_type,subject,from_id,to_ids,cc,from_gender,to_gender,cc_gender,from_employee_type,to_employee_type,cc_employee_type,to_power_rels,cc_power_rels,to_power_imds,cc_power_imds,recipient_gender_ratio,recipient_gender_ratio_label,gender_label
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,So . . . you were looking for a one night stan...,,,29879,[14326],[],F,[M],[],NonEnron,[Core],[],[],[],[],[],0.0,2,0


In [162]:
# Now build a combo label of m/f sender and m/mixed/f recipients
def full_gender_label(row):
    if row['gender_label']:  # Male
        return 3 + row['recipient_gender_ratio_label']
    else:  # Female
        return row['recipient_gender_ratio_label']

In [163]:
df['full_labels'] = df.apply(full_gender_label, axis=1)

In [167]:
Counter(df['full_labels'])

Counter({0: 63194, 1: 8184, 2: 29979, 3: 48384, 4: 9221, 5: 74460})

### Classification

In [118]:
# Class balance
counts = Counter(df['from_gender'])
counts

Counter({'F': 101357, 'I': 22950, 'M': 132065})

In [119]:
# Filter out indeterminates
df = df[df['from_gender'] != 'I']

In [120]:
# Predict formality
text = df['main_text'].values
gender_label = df['from_gender'].values

In [121]:
X = vec.transform(text)

In [122]:
y_enron_pred = clf.predict(X)

In [168]:
df['formality'] = y_enron_pred

In [186]:
# Look at the correlation between formality and gender environment
cm = confusion_matrix(df['full_labels'].values,
                      df2['formality'].values)
cm = cm * 1.
cm = cm[:, :2]
cm = cm/cm.sum(axis=1)[:,None]

In [187]:
cm

array([[0.25034022, 0.74965978],
       [0.15982405, 0.84017595],
       [0.24443777, 0.75556223],
       [0.23906663, 0.76093337],
       [0.18544626, 0.81455374],
       [0.23424658, 0.76575342]])