In [1]:
import os
import pandas as pd
import numpy as np
import sklearn
import statsmodels.api as sm
import patsy
from patsy import dmatrices
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [2]:
merged = pd.read_csv("merged_with_gender.tsv", delimiter="\t")
# filter out examples where we failed to infer gender
genders = ['male', 'female']
df_filtered = merged[merged['gender'].isin(genders)]
# some cleaning
df_filtered["knowledge"] = pd.to_numeric(df_filtered["knowledge"].str.strip("&nbsp;"), errors='coerce')
df_filtered.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(53401, 11)

In [3]:
df_filtered.head()

Unnamed: 0.1,Unnamed: 0,dr-id,gendel,gender,helpful,insurance,knowledge,punctual,rating,review-text,staff
0,0,1,,male,5,Insurance:Unknown,5,5,1,A CARING DOCTOR WHO CARES ABOUT HIS PATIENTS. ...,5
1,1,1,,male,5,Insurance:Medicare,5,5,1,We have used Dr. Carden all of his career and ...,5
2,2,1,,male,5,Insurance:Blue Cross / Blue Shield,5,5,1,"Very good doctor, caring. I think he knows e...",5
3,3,1,,male,5,Insurance:United Healthcare,5,4,1,Dr. Carden is an outstanding physician. He spe...,4
4,4,1,,male,5,Insurance:Unknown,5,5,1,Dr.Carden Is a great doctor that listens to hi...,5


In [4]:
grouped = df_filtered.groupby('dr-id')
ok = list(grouped)[0]
np.mean(ok[1]["helpful"].values)

5.0

Stolen from stanford people: https://raw.githubusercontent.com/stanfordnlp/CoreNLP/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt

In [5]:
stop_words = [w.strip() for w in open("stopwords.txt").readlines()]
stop_words[:10]

['!!', '?!', '??', '!?', '`', '``', "''", '-lrb-', '-rrb-', '-lsb-']

Let's remove gender pronouns since these aren't terribly interesting as predictors

In [6]:
gender_pronouns = ["she", "her", "hers", "his", "he", "guy", "she's", "he's", "commentshe", "commentsshe", "man", "woman"]
stop_words.extend(gender_pronouns)

Merge reviews for individual doctors (we have multiple reviews for each!)

In [7]:
unique_drs = df_filtered["dr-id"].unique()
unique_drs.shape
grouped = df_filtered.groupby('dr-id')
texts, sexes, helpful, knowledge, punctual, ids = [], [], [], [], [], []
for i, dr in grouped:
    cur_text = " ".join(dr['review-text'].values)
    texts.append(cur_text)
    cur_sex = dr["gender"].values[0]
    #import pdb; pdb.set_trace()
    helpful.append(dr["helpful"].mean(skipna=True)) # assuming missing at random!
    knowledge.append(dr["knowledge"].mean(skipna=True))
    punctual.append(dr["punctual"].mean(skipna=True))
    ids.append(dr["dr-id"].values[0])
    sexes.append(cur_sex)

dr_frame = pd.DataFrame({"sex":sexes, "y_helpful":helpful, "y_knowledge":knowledge, "y_punctual":punctual, "id":ids})
dr_frame_no_missing = dr_frame.dropna()

In [8]:
dr_frame_no_missing.shape

(16469, 5)

How many females v males?

In [9]:
sexes.count("female")/float(len(sexes))
#sexes.count("female")

0.2827510917030568

So about 72% male

Note; should probably pull out and pool reviews for physicians -- here we treat individual reviews for the same physician as multiple / independent instances

## Regression analysis for gender-score 

**Important caveats** we are making several assumptions here, including:
* We are treating the means as a point estimates and not explicitly weighting by variance
* We are ignoring what are likely confounders, including specialties -- e.g., there may be certain specialties that receive lower scores and perhaps women are overrepresented in these. We have no way of knowing.

In [10]:
y_helpful, X = dmatrices('y_helpful ~ sex', data=dr_frame_no_missing, return_type='dataframe')
mod = sm.OLS(y_helpful, X)
res_helpful = mod.fit()
print(res_helpful.summary())

                            OLS Regression Results                            
Dep. Variable:              y_helpful   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     13.82
Date:                Fri, 28 Oct 2016   Prob (F-statistic):           0.000202
Time:                        12:34:23   Log-Likelihood:                -28327.
No. Observations:               16469   AIC:                         5.666e+04
Df Residuals:                   16467   BIC:                         5.667e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------
Intercept       3.6471      0.020    184.148      

In [11]:
y_knowledge, X = dmatrices('y_knowledge ~ sex', data=dr_frame_no_missing, return_type='dataframe')
mod = sm.OLS(y_knowledge, X)
res_knowledge = mod.fit()
print(res_knowledge.summary())

                            OLS Regression Results                            
Dep. Variable:            y_knowledge   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     17.48
Date:                Fri, 28 Oct 2016   Prob (F-statistic):           2.92e-05
Time:                        12:34:37   Log-Likelihood:                -27045.
No. Observations:               16469   AIC:                         5.409e+04
Df Residuals:                   16467   BIC:                         5.411e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------
Intercept       3.8141      0.018    208.171      

In [13]:
y_punctual, X = dmatrices('y_punctual ~ sex', data=dr_frame_no_missing, return_type='dataframe')
mod = sm.OLS(y_punctual, X)
res_punc = mod.fit()
print(res_punc.summary())

                            OLS Regression Results                            
Dep. Variable:             y_punctual   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     41.23
Date:                Fri, 28 Oct 2016   Prob (F-statistic):           1.39e-10
Time:                        12:35:19   Log-Likelihood:                -26066.
No. Observations:               16469   AIC:                         5.214e+04
Df Residuals:                   16467   BIC:                         5.215e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------
Intercept       3.6203      0.017    209.700      

## Predictive analysis; I think Michael's is much better!

In [None]:
vectorizer = CountVectorizer(max_features=20000, min_df=25, ngram_range=(1,1), binary=True, stop_words=stop_words)
X = vectorizer.fit_transform(texts)
y = sexes
param_grid = {"C":[.01, .1, 1]}
clf = GridSearchCV(LogisticRegression(), param_grid=param_grid, scoring="f1_macro")

In [None]:
clf.fit(X,y)

In [None]:
clf.best_score_

In [None]:
def show_most_informative_features(vectorizer, clf, n=50):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print ("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [None]:
show_most_informative_features(vectorizer, clf.best_estimator_, n=100)