In [7]:
import os
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [8]:
# Read the tab-separated merged table, then keep only the rows whose inferred
# gender is one of the two known labels (i.e. drop rows where inference failed).
merged = pd.read_csv("merged_with_gender.tsv", sep="\t")
genders = ['male', 'female']
df_filtered = merged.loc[merged['gender'].isin(genders)]
df_filtered.shape

(53401, 11)

Stop-word list borrowed from Stanford CoreNLP: https://raw.githubusercontent.com/stanfordnlp/CoreNLP/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt

In [56]:
# Load one stop word per line, stripping surrounding whitespace/newlines.
# The `with` block guarantees the file handle is closed once the list is built.
with open("stopwords.txt") as stopwords_file:
    stop_words = [w.strip() for w in stopwords_file]
stop_words[:10]

['!!', '?!', '??', '!?', '`', '``', "''", '-lrb-', '-rrb-', '-lsb-']

Let's remove gendered pronouns and other explicitly gendered words (e.g. "man", "woman"), since these aren't terribly interesting as predictors.

In [74]:
# Explicitly gendered tokens that would otherwise be trivial predictors.
gender_pronouns = ["she", "her", "hers", "his", "he", "guy", "she's", "he's", "commentshe", "man", "woman"]
# Append only the words that are not already present, so re-running this cell
# does not keep growing stop_words with duplicate entries.
stop_words.extend([w for w in gender_pronouns if w not in stop_words])

How many female vs. male examples are there?

In [75]:
# Summarise the gender labels: number of rows, number of distinct labels,
# the most frequent label ("top") and how many rows carry it ("freq").
df_filtered["gender"].describe()

count     53401
unique        2
top        male
freq      38272
Name: gender, dtype: object

So about 72% of the examples are male (38,272 of 53,401).

In [76]:
# Bag-of-words features over unigrams and bigrams, recorded as binary
# presence/absence indicators, with the extended stop-word list applied.
vectorizer = CountVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 2),
    min_df=5,
    max_features=20000,
    binary=True,
)
# Features come from the review text; the target is the gender label.
X = vectorizer.fit_transform(df_filtered["review-text"])
y = df_filtered["gender"]

Note: we should probably pull out and pool the reviews for each physician -- here we treat individual reviews for the same physician as multiple, independent instances.

In [77]:
# Candidate values for the logistic-regression C hyperparameter.
param_grid = {"C": [0.01, 0.1, 1, 10, 100]}
# Grid search over C, selecting the setting with the best macro-averaged F1.
clf = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring="f1_macro",
)

In [78]:
# Run the grid search over param_grid on the vectorised reviews and gender labels.
clf.fit(X,y)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.1, 1, 10, 100]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring='f1_macro', verbose=0)

In [79]:
# Best cross-validated score found by the grid search (scored with "f1_macro").
clf.best_score_

0.55358386284680705

In [80]:
def show_most_informative_features(vectorizer, clf, n=50):
    """Print the n most negative and n most positive features of a linear model.

    Each printed row holds two (coefficient, feature name) pairs: on the left
    the next most negative coefficient, on the right the next most positive.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()`` or the
        legacy ``get_feature_names()``; names must align with ``clf.coef_[0]``.
    clf : fitted linear model; only ``clf.coef_[0]`` is read.
    n : int, number of rows (features per side) to print.

    Returns
    -------
    None. The table is written to stdout.

    NOTE(review): for a binary scikit-learn classifier, positive coefficients
    presumably favour ``clf.classes_[1]`` -- confirm before labelling columns.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back so older installations keep working.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # Ascending sort by coefficient: most negative first, most positive last.
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    # Pair the first n entries with the last n entries taken in reverse order.
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [81]:
show_most_informative_features(vectorizer, clf.best_estimator_, n=100)

	-2.5306	jahan          		2.5473	jerk           
	-2.4798	vagotis        		2.4087	gentleman      
	-2.3883	daiza          		1.8683	accept insurance
	-2.2913	haffner        		1.8632	knee           
	-2.2312	dr samuels     		1.7878	hime           
	-2.2303	dr daiza       		1.7630	david          
	-2.1835	abarikwu       		1.7609	parker         
	-2.1468	conte          		1.7358	commentshis    
	-2.1272	fauzia         		1.7199	see highly     
	-2.1002	susan          		1.6786	called schedule
	-2.0950	maclin         		1.6593	thompson       
	-2.0708	anne           		1.6528	massive        
	-2.0695	zamora         		1.6501	office appointment
	-2.0499	felluca        		1.6422	may good       
	-2.0416	toole          		1.6217	major surgery  
	-2.0313	barber         		1.6146	arrogant       
	-1.9602	dr lanna       		1.5953	talk office    
	-1.9602	lanna          		1.5930	cocky          
	-1.9494	afr            		1.5824	forseter       
	-1.9429	ham            		1.5730	nance          
	-1.9369	lamotte