In [4]:
import pandas as pd
import numpy as np
from collections import Counter
import pickle

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix

# GYAFC

Predicting formality

In [5]:
PATH = 'data/GYAFC_Corpus/Family_Relationships/'
# PATH = 'data/GYAFC_Corpus/Entertainment_Music/'
f_train_formal = f"{PATH}train/formal"
f_train_informal = f"{PATH}train/informal"
f_test_formal = f"{PATH}test/formal"
f_test_informal = f"{PATH}test/informal"

In [6]:
train_formal = pd.read_csv(f_train_formal, sep='\t', header=None)
train_informal = pd.read_csv(f_train_informal, sep='\t', header=None)
test_formal = pd.read_csv(f_test_formal, sep='\t', header=None)
test_informal = pd.read_csv(f_test_informal, sep='\t', header=None)

train_formal.columns = ['text']
train_informal.columns = ['text']
test_formal.columns = ['text']
test_informal.columns = ['text']

train_formal['label'] = 1
train_informal['label'] = 0
test_formal['label'] = 1
test_informal['label'] = 0

In [7]:
train = pd.concat([train_formal, train_informal]).sample(frac=1)
test = pd.concat([test_formal, test_informal]).sample(frac=1)

In [10]:
# Build the classifier
text_train = train['text'].values
text_test = test['text'].values

y_train = train['label'].values
y_test = test['label'].values

# Vectorize sentences
vec = TfidfVectorizer(stop_words='english', max_features=2000)
X_train = vec.fit_transform(text_train)
X_test = vec.transform(text_test)

# Build the classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [11]:
print("~~TRAIN~~")
print(classification_report(y_train, y_train_pred))
print("\n~~TEST~~")
print(classification_report(y_test, y_test_pred))

~~TRAIN~~
             precision    recall  f1-score   support

          0       0.74      0.69      0.71     51967
          1       0.71      0.76      0.73     51967

avg / total       0.72      0.72      0.72    103934


~~TEST~~
             precision    recall  f1-score   support

          0       0.83      0.70      0.76      1332
          1       0.67      0.81      0.73      1019

avg / total       0.76      0.75      0.75      2351



In [12]:
clf.coef_.shape

(1, 2000)

In [13]:
# Inspecting coefficients
coef_dict = {}
coef_dict[1] = {w: clf.coef_[0][i]
                    for w, i in vec.vocabulary_.items()}

coef_df = pd.DataFrame(coef_dict)

In [14]:
coef_df.sort_values(by=1).head()

Unnamed: 0,1
dont,-9.621634
ur,-8.690802
thats,-6.923826
im,-6.438043
lol,-5.629524


In [15]:
coef_df.sort_values(by=1, ascending=False).head()

Unnamed: 0,1
inform,3.968678
aware,3.741648
significant,3.67926
attempt,3.635953
correct,3.436881


# Predicting on Vinod's data

Labeling scheme:

Sender:
    - Female = 0
    - Male = 1
    
Recipients:
    - >66% female: 0
    - 33%< and <66% female: 1
    - <33% female: 2

Combo:
    - F-F: 0
    - F-Mixed: 1
    - F-M: 2
    - M-F: 3
    - M-Mixed: 4
    - M-M: 5

In [16]:
import pickle

In [17]:
f_vinod = "data/vinod/full_data_small.pkl"
with open(f_vinod, 'rb') as f:
    df = pickle.load(f)

In [18]:
# Compute ratio of recipient's genders
def get_ratio(row):
    n_male = 0
    n_female = 0
    total = 0
    for g in row['to_gender'] + row['cc_gender']:
        if g =='M':
            n_male += 1
        elif g == 'F':
            n_female += 1
        total += 1
    
    # Return the result, correct for div zero error
    if n_male == 0 and n_female > 0:
        # All female
        return 1
    elif n_female == 0:
        # All indeterminate
        return 0
    else:
        # Some balance, or all male
        return n_female / n_male

def bin_ratios(ratio):
    if ratio > 2/3:
        return 0
    elif 1/3 < ratio < 2/3:
        return 1
    else:
        return 2
        

In [19]:
df.head()

Unnamed: 0_level_0,main_text,message_type,subject,from_id,to_ids,cc,from_gender,to_gender,cc_gender,from_employee_type,to_employee_type,cc_employee_type,to_power_rels,cc_power_rels,to_power_imds,cc_power_imds
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,So . . . you were looking for a one night stan...,,,29879,[14326],[],F,[M],[],NonEnron,[Core],[],[],[],[],[]
2,Hey there Bill!\n\nI thought I'd drop a quick ...,INITIAL,Hello!,86690,[14326],[],F,[M],[],NonEnron,[Core],[],[],[],[],[]
3,"Group,\nEES and I have not been receiving emai...",INITIAL,EES,14326,"[78305, 30341, 2983, 687, 46676, 70629, 106450...",[],M,"[F, M, M, M, M, M, M, M, M, M, M, F, M]",[],Core,"[Core, NonCore, Core, NonCore, Core, NonCore, ...",[],[],[],[],[]
4,"That is so rad Bill. I'm pretty jealous, but a...",RE,woohoo,3487,[14326],[],F,[M],[],NonCore,[Core],[],[],[],[],[]
5,Group.\nWe are short 25 mws under ST-WBOM in N...,INITIAL,Short for 05/04-05/05,14326,"[78305, 2983, 687, 46676, 70629, 106450, 6971,...",[],M,"[F, M, M, M, M, M, M, M, M, M, F, M]",[],Core,"[Core, Core, NonCore, Core, NonCore, Core, Cor...",[],[],[],[],[]


In [20]:
ratios = df.apply(get_ratio, axis=1)

In [21]:
ratio_bins = ratios.apply(bin_ratios)

In [22]:
Counter(ratio_bins)

Counter({0: 119622, 1: 18304, 2: 118446})

In [23]:
df['recipient_gender_ratio'] = ratios
df['recipient_gender_ratio_label'] = ratio_bins
df['gender_label'] = [0 if g == 'F' else 1 for g in df['from_gender']]

In [24]:
df.head(1)

Unnamed: 0_level_0,main_text,message_type,subject,from_id,to_ids,cc,from_gender,to_gender,cc_gender,from_employee_type,to_employee_type,cc_employee_type,to_power_rels,cc_power_rels,to_power_imds,cc_power_imds,recipient_gender_ratio,recipient_gender_ratio_label,gender_label
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,So . . . you were looking for a one night stan...,,,29879,[14326],[],F,[M],[],NonEnron,[Core],[],[],[],[],[],0.0,2,0


In [25]:
# Now build a combo label of m/f sender and m/mixed/f recipients
def full_gender_label(row):
    if row['gender_label']:  # Male
        return 3 + row['recipient_gender_ratio_label']
    else:  # Female
        return row['recipient_gender_ratio_label']

In [26]:
df['full_labels'] = df.apply(full_gender_label, axis=1)

In [27]:
Counter(df['full_labels'])

Counter({0: 63194, 1: 8184, 2: 29979, 3: 56428, 4: 10120, 5: 88467})

### Classification

In [28]:
# Class balance
counts = Counter(df['from_gender'])
counts

Counter({'F': 101357, 'I': 22950, 'M': 132065})

In [29]:
# Filter out indeterminates
df = df[df['from_gender'] != 'I']

In [30]:
# Predict formality
text = df['main_text'].values
gender_label = df['from_gender'].values

In [31]:
X = vec.transform(text)

In [32]:
y_enron_pred = clf.predict(X)

In [33]:
df['formality'] = y_enron_pred

In [34]:
# Look at the correlation between formality and gender environment
cm = confusion_matrix(df['full_labels'].values,
                      df['formality'].values)
cm = cm * 1.
cm = cm[:, :2]
cm = cm/cm.sum(axis=1)[:,None]

In [35]:
cm

array([[0.25034022, 0.74965978],
       [0.15982405, 0.84017595],
       [0.24443777, 0.75556223],
       [0.23906663, 0.76093337],
       [0.18544626, 0.81455374],
       [0.23420629, 0.76579371]])