In [90]:
import os
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [100]:
# Read the tab-separated review table.
merged = pd.read_csv("merged_with_gender.tsv", sep="\t")
# Drop rows where gender inference failed, i.e. the label is neither
# 'male' nor 'female'.
genders = ['male', 'female']
has_known_gender = merged['gender'].isin(genders)
df_filtered = merged[has_known_gender]
df_filtered.shape

(53401, 11)

Stop-word list borrowed from the Stanford CoreNLP project: https://raw.githubusercontent.com/stanfordnlp/CoreNLP/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt

In [101]:
# Load the stop-word list, one entry per line. The context manager closes
# the file handle as soon as the list is built (the bare open() call it
# replaces left the handle open until garbage collection).
with open("stopwords.txt") as stopwords_file:
    stop_words = [w.strip() for w in stopwords_file]
stop_words[:10]

['!!', '?!', '??', '!?', '`', '``', "''", '-lrb-', '-rrb-', '-lsb-']

Let's remove gendered pronouns (plus a few explicitly gendered nouns such as "man", "woman" and "guy"), since these aren't terribly interesting as predictors.

In [118]:
# Explicitly gendered tokens to exclude from the feature set. Despite the
# name, the list also holds a few nouns ("guy", "man", "woman") and two
# run-together tokens ("commentshe", "commentsshe").
gender_pronouns = ["she", "her", "hers", "his", "he", "guy", "she's", "he's", "commentshe", "commentsshe", "man", "woman"]
# Append only the tokens that are not already present, so re-running this
# cell does not keep growing stop_words with duplicates.
stop_words.extend([w for w in gender_pronouns if w not in stop_words])

Merge reviews for individual doctors (we have multiple reviews for each!)

In [105]:
unique_drs = df_filtered["dr-id"].unique()
grouped = df_filtered.groupby('dr-id')

# One (document, label) pair per doctor: every review of that doctor joined
# with single spaces, labelled with the gender from the group's first row.
pooled = [
    (" ".join(reviews['review-text'].values), reviews["gender"].values[0])
    for _, reviews in grouped
]
texts = [doc for doc, _ in pooled]
sexes = [sex for _, sex in pooled]

In [110]:
# Number of pooled documents (one per 'dr-id' group).
len(texts)

16488

How many females vs. males?

In [109]:
# Share of doctors labelled "male"; float() keeps this a true division.
n_male = sexes.count("male")
n_male / float(len(sexes))

0.7172489082969432

So about 72% male

In [121]:
# Binary bag-of-words over unigrams: a term must occur in at least 5
# documents, the vocabulary is capped at 20000 terms, and the stop-word
# list is excluded.
vectorizer = CountVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 1),
    binary=True,
    min_df=5,
    max_features=20000,
)
X = vectorizer.fit_transform(texts)
y = sexes

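Note: should probably pull out and pool reviews for physicians -- here we treat individual reviews for the same physician as multiple / independent instances. (This note appears to predate the per-doctor merge above, which already pools each physician's reviews into a single document -- TODO: confirm and remove.)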

In [132]:
# Grid-search the regularisation parameter C of a logistic regression,
# selecting by macro-averaged F1.
# NOTE(review): the classes are imbalanced (~72% male) and the fit below
# emits precision warnings -- presumably some fold/C combination never
# predicts one class; confirm, and consider class weighting or larger C.
param_grid = {"C": [0.001, 0.01, 0.1]}
clf = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring="f1_macro",
)

In [133]:
# Run the cross-validated grid search on the document-term matrix X
# with the gender labels y.
clf.fit(X,y)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring='f1_macro', verbose=0)

In [134]:
# Best score found by the grid search (scoring="f1_macro").
clf.best_score_

0.52564427165110383

In [135]:
def show_most_informative_features(vectorizer, clf, n=50):
    """Print the most negative and most positive features of a linear model.

    Features are paired with the coefficients in ``clf.coef_[0]`` and sorted
    by weight. Each printed row shows one of the lowest-weighted features on
    the left and one of the highest-weighted features on the right, with the
    most extreme weights first.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` (scikit-learn >= 1.0) or the
        legacy ``get_feature_names()``.
    clf : fitted linear model
        Only ``clf.coef_[0]`` is read.
    n : int, default 50
        Number of rows to print.

    Returns
    -------
    None
        The table is written to stdout.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back for older installations.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    # Ascending by coefficient: the head holds the most negative weights and
    # the reversed tail the most positive ones.
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [136]:
# Show the 100 most extreme weights of the refit best model.
# NOTE(review): negative weights presumably lean "female" and positive ones
# "male" (scikit-learn sorts class labels) -- confirm against
# clf.best_estimator_.classes_.
show_most_informative_features(vectorizer, clf.best_estimator_, n=100)

	-0.7085	mary           		0.7761	arrogant       
	-0.6321	incorrect      		0.7112	jerk           
	-0.6265	lady           		0.5469	neurologist    
	-0.5557	sweet          		0.5142	retire         
	-0.5241	judgmental     		0.5012	prostate       
	-0.4774	reference      		0.4733	license        
	-0.4650	unknowledgeable		0.4572	valley         
	-0.4568	female         		0.4444	upfront        
	-0.4494	daiza          		0.4431	pass           
	-0.4456	tech           		0.4417	died           
	-0.4364	barber         		0.4364	pregnancies    
	-0.4350	expenses       		0.4364	john           
	-0.4321	inpatient      		0.4242	knee           
	-0.4291	susan          		0.4197	nobody         
	-0.4278	sweetest       		0.4193	numb           
	-0.4256	pushes         		0.4183	accurately     
	-0.4197	seemingly      		0.4149	cocky          
	-0.4181	murray         		0.4145	possibly       
	-0.4173	cold           		0.4045	abruptly       
	-0.4137	argued         		0.4033	nerve          
	-0.4117	lump       