In [7]:
import os
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [8]:
# Labels we keep; rows where gender inference failed carry some other value
# (or are missing) and are dropped below.
genders = ['male', 'female']
# The merged review table is tab-separated.
merged = pd.read_csv("merged_with_gender.tsv", sep="\t")
df_filtered = merged.loc[merged['gender'].isin(genders)]
df_filtered.shape

(53401, 11)

How many female vs. male examples are there?

In [9]:
# describe() on the string-valued label column reports the row count, the
# number of distinct labels, the most frequent label and how often it occurs.
df_filtered["gender"].describe()

count     53401
unique        2
top        male
freq      38272
Name: gender, dtype: object

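So about 72% of the examples are male (38,272 of 53,401); always predicting "male" would therefore give roughly 72% accuracy as a baseline.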

In [10]:
# Target: the inferred gender label for each review.
y = df_filtered["gender"]

# Binary bag-of-words features: each column records whether a term occurs in
# a review (binary=True), ignoring terms seen in fewer than 5 reviews and
# capping the vocabulary at 20,000 terms.
vectorizer = CountVectorizer(
    binary=True,
    min_df=5,
    max_features=20000,
)
X = vectorizer.fit_transform(df_filtered["review-text"])

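Note: we should probably pool the reviews for each physician. Here we treat individual reviews of the same physician as multiple, independent instances, which means reviews of one physician can end up in both the training and validation folds during cross-validation.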

In [13]:
# Candidate values for C, the inverse regularisation strength of the
# logistic-regression model, spaced one order of magnitude apart.
param_grid = dict(C=[0.01, 0.1, 1, 10, 100])
clf = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid)

In [14]:
# Run the cross-validated search over every C in param_grid. The repr printed
# below shows refit=True, so the best setting is refit and exposed as
# clf.best_estimator_.
clf.fit(X,y)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.1, 1, 10, 100]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [16]:
# Mean cross-validated score of the best C. scoring=None in the repr above,
# so this is the estimator's own score() -- accuracy for a classifier.
clf.best_score_

0.93927080017228137

In [18]:
def show_most_informative_features(vectorizer, clf, n=20):
    """Print the n most negative and n most positive features of a linear model.

    Each printed row has two (coefficient, feature name) pairs: on the left
    the next most negative coefficient, on the right the next most positive.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()`` or,
        on older scikit-learn releases, ``get_feature_names()``.
    clf : fitted linear model whose ``coef_[0]`` holds one weight per feature,
        in the same order as the vectorizer's feature names.
    n : int, default 20
        Number of rows to print.

    Returns
    -------
    None. The table is written to standard output.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Sorting (coef, name) pairs orders by coefficient, ties broken by name.
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    most_negative = coefs_with_fns[:n]
    # Walk backwards from the end so the largest coefficient comes first.
    most_positive = coefs_with_fns[:-(n + 1):-1]
    for (coef_1, fn_1), (coef_2, fn_2) in zip(most_negative, most_positive):
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [20]:
# Left column: most negative weights; right column: most positive weights.
# In the output below "she"/"her" are negative and "he"/"his" positive, so
# positive weights push the prediction toward 'male'.
show_most_informative_features(vectorizer, clf.best_estimator_)

	-2.5893	she            		2.8288	he             
	-2.1405	her            		1.9279	his            
	-0.3423	commentsshe    		1.6625	him            
	-0.2599	herself        		0.4599	surgery        
	-0.2428	son            		0.3748	man            
	-0.1737	practice       		0.3516	mother         
	-0.1549	gyn            		0.3440	daughter       
	-0.1528	is             		0.3386	guy            
	-0.1496	concerns       		0.2504	pain           
	-0.1358	new            		0.2440	wife           
	-0.1265	caring         		0.2261	commentshe     
	-0.1251	worst          		0.2197	himself        
	-0.1249	me             		0.2190	surgeon        
	-0.1241	finally        		0.1810	arrogant       
	-0.1206	poor           		0.1750	good           
	-0.1154	health         		0.1694	receptionist   
	-0.1095	cold           		0.1461	great          
	-0.1064	sick           		0.1391	old            
	-0.1055	delivery       		0.1334	desk           
	-0.1050	child          		0.1325	girl           
