In [1]:
# Investigating key research questions in the "Sexual Harassment in Academia: Results of a Crowdsourced Survey" dataset
# using NLP techniques (topic modeling, classification, etc.) with Spark, MLlib, pandas, NLTK, and Python.
# Final project for SI 618.

In [2]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer 
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.clustering import LDA
from pyspark.ml.pipeline import Pipeline
import numpy as np
import nltk

In [3]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords as nltkstopwords

In [4]:
# Load the raw survey CSV into a Spark DataFrame: comma-separated, first row is
# the header, column types inferred by Spark.
df = spark.read.load("/FileStore/tables/harassment_mainrawdata_4_4_18.csv",
                     format="csv", sep=",", inferSchema="true", header="true")

In [5]:
df.head()

In [6]:
from pyspark.sql.types import ArrayType, StringType

def indices_to_terms(vocabulary):
    """Build a Spark UDF that maps an array of term indices to vocabulary terms.

    Args:
        vocabulary: indexable sequence of terms; element ``i`` is the term for
            index ``i`` (callers pass ``CountVectorizerModel.vocabulary``).

    Returns:
        A UDF that takes an array column of indices and returns an
        ``array<string>`` column holding the corresponding terms.
    """
    # Imported locally: no cell of this notebook imports `udf`, so relying on
    # a global would only work if the runtime happens to pre-define it.
    from pyspark.sql.functions import udf

    # Distinct inner name so the helper no longer shadows the outer function.
    def _lookup_terms(indices):
        return [vocabulary[int(i)] for i in indices]

    return udf(_lookup_terms, ArrayType(StringType()))

In [7]:
print(df.columns)

In [8]:
# Row count before filtering. This is not the cell's last expression, so the
# value is computed but not displayed.
df.count()
# df = df.na.fill('')
import numpy as np
# Drop every row that has a null in any of the listed columns.
df = df.na.drop(subset=['event', 'perpetrator', 'mental', 'response', 'career', 'life', 'punishment', 'time'], how="any")


In [9]:
from pyspark.sql.functions import regexp_replace
from pyspark.sql import functions as F

# Normalise every column: strip punctuation, then lower-case.
# A raw string is required here: in a normal string literal "\p" is an invalid
# Python escape sequence (it triggers a warning and is slated to become an
# error). The regex Spark receives is unchanged.
PUNCTUATION_PATTERN = r"\p{Punct}"

for c in df.columns:
  df = df.withColumn(c, F.lower(regexp_replace(df[c], PUNCTUATION_PATTERN, "")))
df.head(5)

In [10]:
df.count()

In [11]:
def blank_as_null(x):
    """Return a Column for `x` where the empty string "" is replaced by null."""
    value = col(x)
    return when(value != "", value).otherwise(None)


def blank_as_null_with_line(x):
    """Return a Column for `x` where a single-space string " " is replaced by null."""
    value = col(x)
    return when(value != " ", value).otherwise(None)


def blank_as_null_with_2line(x):
    """Return a Column for `x` where a two-space string "  " is replaced by null."""
    value = col(x)
    return when(value != "  ", value).otherwise(None)

In [12]:
from pyspark.sql.functions import col, when

def fullPipeline(kTerm, c, seed=None):
  """Fit an LDA topic model on one text column of the notebook-level ``df``.

  Rows whose value in `c` is "", " ", "  " or null are dropped, the text is
  tokenised, stop words are removed, terms are counted (minDF=2) and an LDA
  model with `kTerm` topics is fitted (maxIter=10). Prints k, the vocabulary
  size, log perplexity and log likelihood, and shows up to 10 topics.

  Args:
    kTerm: number of LDA topics.
    c: name of the text column in ``df`` to model.
    seed: optional LDA random seed for reproducible topics; when None the
      estimator's default seed is used (the original behaviour).

  Returns:
    The ``describeTopics(8)`` DataFrame with an added "topicWords" column
    holding the top terms of each topic as strings.
  """
  # Turn the three blank markers into nulls, then keep only non-null rows of `c`.
  currentDF = df
  for toNull in (blank_as_null, blank_as_null_with_line, blank_as_null_with_2line):
    currentDF = currentDF.withColumn(c, toNull(c))
  currentDF = currentDF.select([c]).na.drop(how="any")

  tokenizer = Tokenizer(inputCol=c, outputCol="words")

  # StopWordsRemover uses the English list by default. The previous call to
  # loadDefaultStopWords("english") only returned a list that was thrown away.
  stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")

  vectorizer = CountVectorizer(inputCol="filtered", outputCol="features", minDF=2)

  print("k = ", kTerm)
  lda = LDA(k=kTerm, maxIter=10)
  if seed is not None:
    lda.setSeed(seed)

  pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, vectorizer, lda])
  pipelineModel = pipeline.fit(currentDF)

  # The last two fitted stages are the CountVectorizerModel and the LDA model.
  countVectorModel = pipelineModel.stages[-2]
  ldaModel = pipelineModel.stages[-1]

  vocabulary = countVectorModel.vocabulary
  print("Vocab length is", len(vocabulary))

  # Assess the model on the same data it was fitted on.
  df_lda = pipelineModel.transform(currentDF)

  lp = ldaModel.logPerplexity(df_lda)
  print("Log perplexity  (lower is better): ", lp)
  ll = ldaModel.logLikelihood(df_lda)
  print("Log likelihood (higher is better): ", ll)

  # Describe topics: top 8 term indices per topic, translated to words.
  topics = ldaModel.describeTopics(8)
  topics = topics.withColumn(
      "topicWords", indices_to_terms(vocabulary)("termIndices"))
  topics.select("topicWords").show(10, truncate=False)

  return topics


In [13]:
fullPipeline(7, "response")

In [14]:
fullPipeline(10, "response")

In [15]:
fullPipeline(10, "event")

In [16]:
fullPipeline(5, "event")

In [17]:
fullPipeline(10, "mental")

In [18]:
fullPipeline(10, "life")

In [19]:
fullPipeline(6, "mental")

In [20]:
fullPipeline(5, "event")

In [21]:
num_topics = 20
e_outTopics = fullPipeline(num_topics, "event")
e_outTopics.select("topicWords").show(num_topics,truncate=False)

In [22]:
r_outTopics = fullPipeline(num_topics, "response")
r_outTopics.select("topicWords").show(num_topics,truncate=False)

In [23]:
p_outTopics = fullPipeline(num_topics, "punishment")
p_outTopics.select("topicWords").show(num_topics,truncate=False)

In [24]:
c_outTopics = fullPipeline(num_topics, "career")
c_outTopics.select("topicWords").show(num_topics,truncate=False)

In [25]:
m_outTopics = fullPipeline(num_topics, "mental")
m_outTopics.select("topicWords").show(num_topics,truncate=False)

In [26]:
l_outTopics = fullPipeline(num_topics, "life")
l_outTopics.select("topicWords").show(num_topics,truncate=False)

In [27]:
num_topics = 10
e_outTopics = fullPipeline(num_topics, "event")
e_outTopics.select("topicWords").show(num_topics,truncate=False)

In [28]:
num_topics = 15
e_outTopics = fullPipeline(num_topics, "event")
e_outTopics.select("topicWords").show(num_topics,truncate=False)

In [29]:
display(e_outTopics.select("topicWords"))

In [30]:
r_outTopics = fullPipeline(num_topics, "response")
display(r_outTopics.select("topicWords"))

In [31]:
p_outTopics = fullPipeline(num_topics, "punishment")
display(p_outTopics.select("topicWords"))

In [32]:
m_outTopics = fullPipeline(num_topics, "mental")
display(m_outTopics.select("topicWords"))

In [33]:
l_outTopics = fullPipeline(num_topics, "life")
display(l_outTopics.select("topicWords"))

In [34]:
c_outTopics = fullPipeline(num_topics, "career")
display(c_outTopics.select("topicWords"))

In [35]:
import pandas

pandasDF = df.toPandas()
# IsProf is 1 when the perpetrator text contains "prof", "chair" or "faculty",
# otherwise 0 (missing matches also become 0).
mentionsFaculty = pandasDF['perpetrator'].str.contains("prof|chair|faculty")
pandasDF['IsProf'] = mentionsFaculty.eq(True).astype(int)


# spark_df = sqlContext.createDataFrame(pOut)



In [36]:
pandasDF.columns

In [37]:
pandasDF.rename(columns={'time':'time2'}, inplace=True)

In [38]:
import numpy as np
# pandasDF.replace(r'\s+', np.nan, regex=True)
# pandasDF.dropna(subset=['event', 'perpetrator', 'mental', 'response', 'career'], how="any")

In [39]:
len(pandasDF)

In [40]:

# Split the (time2, IsProf) pairs by the IsProf flag.
isProfMask = pandasDF['IsProf'] == True
profOut = pandasDF.loc[isProfMask, ['time2', 'IsProf']]
not_profOut = pandasDF.loc[pandasDF['IsProf'] == False, ['time2', 'IsProf']]

In [41]:
pandasDF.head()

In [42]:
all_with_indicators = sqlContext.createDataFrame(pandasDF[['time2', 'IsProf']])

In [43]:
profs_df = sqlContext.createDataFrame(profOut)
not_profs_df = sqlContext.createDataFrame(not_profOut)

In [44]:
# Re-attach the full survey columns to each subset by matching df.time against
# the subset's time2 (right join, so every subset row is kept).
# NOTE(review): the join emits one row per matching pair, so if `time` values
# are not unique the result contains duplicated rows - confirm uniqueness.
professors_spark = df.join(profs_df, df.time == profs_df.time2, 'right')
no_professors_spark = df.join(not_profs_df, df.time == not_profs_df.time2, 'right')

In [45]:
all_with_indicators_spark = df.join(all_with_indicators, df.time == all_with_indicators.time2, 'right')


In [46]:
all_with_indicators_spark.count()

In [47]:
all_with_indicators_spark.show(5)

In [48]:
professors_spark.count()

In [49]:
no_professors_spark.count()

In [50]:
def fullPipelineDFSpecific(theData, kTerm, c, seed=None):
  """Fit an LDA topic model on one text column of the given DataFrame.

  Rows whose value in `c` is "", " ", "  " or null are dropped, the text is
  tokenised, stop words are removed, terms are counted (minDF=2) and an LDA
  model with `kTerm` topics is fitted (maxIter=10). Prints k, the vocabulary
  size, log perplexity and log likelihood, and shows up to 10 topics.

  Args:
    theData: Spark DataFrame containing column `c`.
    kTerm: number of LDA topics.
    c: name of the text column to model.
    seed: optional LDA random seed for reproducible topics; when None the
      estimator's default seed is used (the original behaviour).

  Returns:
    The ``describeTopics(8)`` DataFrame with an added "topicWords" column
    holding the top terms of each topic as strings.
  """
  # Turn the three blank markers into nulls, then keep only non-null rows of `c`.
  for toNull in (blank_as_null, blank_as_null_with_line, blank_as_null_with_2line):
    theData = theData.withColumn(c, toNull(c))
  theData = theData.select([c]).na.drop(how="any")

  tokenizer = Tokenizer(inputCol=c, outputCol="words")

  # StopWordsRemover uses the English list by default. The previous call to
  # loadDefaultStopWords("english") only returned a list that was thrown away.
  stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")

  vectorizer = CountVectorizer(inputCol="filtered", outputCol="features", minDF=2)

  print("k = ", kTerm)
  lda = LDA(k=kTerm, maxIter=10)
  if seed is not None:
    lda.setSeed(seed)

  pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, vectorizer, lda])
  pipelineModel = pipeline.fit(theData)

  # The last two fitted stages are the CountVectorizerModel and the LDA model.
  countVectorModel = pipelineModel.stages[-2]
  ldaModel = pipelineModel.stages[-1]

  vocabulary = countVectorModel.vocabulary
  print("Vocab length is", len(vocabulary))

  # Assess the model on the same data it was fitted on.
  df_lda = pipelineModel.transform(theData)

  lp = ldaModel.logPerplexity(df_lda)
  print("Log perplexity  (lower is better): ", lp)
  ll = ldaModel.logLikelihood(df_lda)
  print("Log likelihood (higher is better): ", ll)

  # Describe topics: top 8 term indices per topic, translated to words.
  topics = ldaModel.describeTopics(8)
  topics = topics.withColumn(
      "topicWords", indices_to_terms(vocabulary)("termIndices"))
  topics.select("topicWords").show(10, truncate=False)

  return topics


In [51]:
mental_prof_topics = fullPipelineDFSpecific(professors_spark, num_topics, "mental")
display(mental_prof_topics.select("topicWords"))

In [52]:
mental_noprof_topics = fullPipelineDFSpecific(no_professors_spark, num_topics, "mental")
display(mental_noprof_topics.select("topicWords"))

In [53]:
mr_prof_topics = fullPipelineDFSpecific(professors_spark, num_topics, "response")
display(mr_prof_topics.select("topicWords"))

In [54]:
mr_noprof_topics = fullPipelineDFSpecific(no_professors_spark, num_topics, "response")
display(mr_noprof_topics.select("topicWords"))

In [55]:
p_prof_topics = fullPipelineDFSpecific(professors_spark, num_topics, "punishment")
display(p_prof_topics.select("topicWords"))

In [56]:
# Punishment topics for non-faculty perpetrators, the counterpart of
# p_prof_topics. This was previously fitted on "response" by mistake, which
# duplicated mr_noprof_topics.
p_noprof_topics = fullPipelineDFSpecific(no_professors_spark, num_topics, "punishment")
display(p_noprof_topics.select("topicWords"))

In [57]:
c_prof_topics = fullPipelineDFSpecific(professors_spark, num_topics, "career")
display(c_prof_topics.select("topicWords"))

In [58]:
c_noprof_topics = fullPipelineDFSpecific(no_professors_spark, num_topics, "career")
display(c_noprof_topics.select("topicWords"))

In [59]:
full_profOut = pandasDF.loc[pandasDF['IsProf'] == True]
full_not_profOut = pandasDF.loc[pandasDF['IsProf'] == False]


In [60]:
from pyspark.ml.feature import NGram
from pyspark.ml import Pipeline

def generateNGrams(theData, col, n, removeStopWords=False):
  """Tokenise a text column and append a column of word n-grams.

  Args:
    theData: Spark DataFrame containing the text column `col`.
    col: name of the text column.
    n: n-gram length.
    removeStopWords: when True the n-grams are built from the stop-word
      filtered tokens ("filtered"); when False (default, the original
      behaviour) they are built from the raw tokens ("words").

  Returns:
    `theData` with "words", "filtered" and "<col>_<n>-grams" columns added.
  """
  tokenizer = Tokenizer(inputCol=col, outputCol="words")
  # StopWordsRemover uses the English list by default; the discarded
  # loadDefaultStopWords("english") call was a no-op and has been removed.
  stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")

  # The original always read "words", so stop-word removal never affected the
  # n-grams. That choice is now an explicit flag.
  ngramInput = "filtered" if removeStopWords else "words"
  ngrams = NGram(n=n, inputCol=ngramInput, outputCol=col + "_" + str(n)+'-grams')

  # Build the pipeline and transform the data.
  pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, ngrams])
  return pipeline.fit(theData).transform(theData)

In [61]:
out = generateNGrams(professors_spark, "mental", 3)

In [62]:
outPandas = out.toPandas()[['mental_3-grams']]

In [63]:
from collections import Counter

# Count how often each trigram occurs across all rows, then print the 20 most
# frequent. The former leading `outPandas.head(5)` was not the cell's last
# expression, so it displayed nothing and has been dropped.
rowsDict = Counter()
for ngramList in outPandas['mental_3-grams']:
  rowsDict.update(ngramList)

# most_common() orders by descending count and keeps first-seen order for ties.
ngramSorted = [ngram for ngram, _ in rowsDict.most_common()]
for top in ngramSorted[0:20]:
  print(str(top) + ": "+ str(rowsDict[top]))


In [64]:
def generateNGramsFull(theData, col, n, removeStopWords=False, top=20):
  """Return the most frequent word n-grams of a text column.

  Args:
    theData: Spark DataFrame containing the text column `col`.
    col: name of the text column.
    n: n-gram length.
    removeStopWords: when True the n-grams are built from stop-word filtered
      tokens; when False (default, the original behaviour) from raw tokens.
    top: number of n-grams to return (default 20, the original value).

  Returns:
    pandas DataFrame with columns "ngram" and "count", ordered by descending
    count. The columns are present even when no n-gram was found.
  """
  from collections import Counter

  ngram_string = col + "_" + str(n)+'-grams'
  stages = [Tokenizer(inputCol=col, outputCol="words")]
  ngramInput = "words"
  if removeStopWords:
    # StopWordsRemover uses the English list by default. The stage is only
    # added when its output is actually consumed by the NGram stage.
    stages.append(StopWordsRemover(inputCol="words", outputCol="filtered"))
    ngramInput = "filtered"
  stages.append(NGram(n=n, inputCol=ngramInput, outputCol=ngram_string))

  # Build the pipeline and transform the data.
  text_ngrams = Pipeline(stages=stages).fit(theData).transform(theData)

  # Only the n-gram column is needed on the driver.
  outPandas = text_ngrams.select(ngram_string).toPandas()

  counts = Counter()
  for ngramList in outPandas[ngram_string]:
    counts.update(ngramList)

  # most_common() orders by descending count and keeps first-seen order for ties.
  sorted_top = [{"ngram": ngram, "count": count}
                for ngram, count in counts.most_common(top)]

  return pandas.DataFrame(sorted_top, columns=["ngram", "count"])

In [65]:
import seaborn as sns
import matplotlib.pyplot as plt

In [66]:
# Set the seaborn theme before creating the figure so it also applies to the
# first figure's axes.
sns.set(style="darkgrid")

# Twelve most common trigrams in "response" for faculty-type perpetrators.
# The stray `viz_left_response = response_left_output[0:12]` line was removed:
# response_left_output is only defined much later in the notebook and was not
# used by this plot.
outResponse = generateNGramsFull(professors_spark, "response", 3)[0:12]

f, ax = plt.subplots(figsize=(12, 12))
sns.barplot(y="ngram",x="count",data=outResponse, ax=ax).set_title('Common trigrams in "response" descriptions for professor/faculty/chair perpetrators')

f.subplots_adjust(left=0.2)
display(f.figure)

In [67]:
generateNGramsFull(no_professors_spark, "response", 3)

In [68]:
generateNGramsFull(professors_spark, "career", 3)

In [69]:
prof_career_4 = generateNGramsFull(professors_spark, "career", 4)

# Plot the 4-grams computed above. The cell previously plotted
# `viz_left_response`, an unrelated feature-importance frame that is only
# defined later in the notebook, and left `prof_career_4` unused.
f, ax = plt.subplots(figsize=(12, 12))
sns.set(style="darkgrid")
sns.barplot(y="ngram",x="count",data=prof_career_4[0:12], ax=ax).set_title('Common 4-grams in "career" descriptions for professor/faculty/chair perpetrators')

f.subplots_adjust(left=0.2)
display(f.figure)

In [70]:
generateNGramsFull(professors_spark, "punishment", 3)

In [71]:
generateNGramsFull(professors_spark, "event", 3)

In [72]:
generateNGramsFull(professors_spark, "event", 4)

In [73]:
generateNGramsFull(no_professors_spark, "event", 4)

In [74]:
generateNGramsFull(professors_spark, "response", 4)

In [75]:
generateNGramsFull(professors_spark, "punishment", 4)

In [76]:
generateNGramsFull(no_professors_spark, "response", 4)

In [77]:
generateNGramsFull(no_professors_spark, "punishment", 5)

In [78]:
# Eight most common 4-grams in "event" for faculty-type perpetrators.
out_prof_event_4 = generateNGramsFull(professors_spark, "event", 4).head(8)

f, ax = plt.subplots(figsize=(9, 9))
sns.set(style="darkgrid")
sns.barplot(data=out_prof_event_4, x="count", y="ngram", ax=ax)
ax.set_title('Common 4-grams in "event" descriptions for professor/faculty/chair perpetrators')

# Leave room on the left for the long n-gram labels.
f.subplots_adjust(left=0.25)
display(f.figure)

In [79]:
# Eight most common 4-grams in "event" for non-faculty perpetrators.
out_noprof_event_4 = generateNGramsFull(no_professors_spark, "event", 4).head(8)

f, ax = plt.subplots(figsize=(9, 9))
sns.set(style="darkgrid")
sns.barplot(data=out_noprof_event_4, x="count", y="ngram", ax=ax)
ax.set_title('Common 4-grams in "event" descriptions for non-faculty perpetrators')

# Leave room on the left for the long n-gram labels.
f.subplots_adjust(left=0.25)
display(f.figure)

In [80]:
# Eight most common trigrams in "punishment" for faculty-type perpetrators.
out_prof_punishment_3 = generateNGramsFull(professors_spark, "punishment", 3).head(8)

f, ax = plt.subplots(figsize=(9, 9))
sns.set(style="darkgrid")
sns.barplot(data=out_prof_punishment_3, x="count", y="ngram", ax=ax)
ax.set_title('Common trigrams in "punishment" descriptions for Professor/Faculty/Chair perpetrators')

# Leave room on the left for the long n-gram labels.
f.subplots_adjust(left=0.25)
display(f.figure)

In [81]:
# Eight most common trigrams in "punishment" for non-faculty perpetrators.
out_noprof_punishment_3 = generateNGramsFull(no_professors_spark, "punishment", 3).head(8)

f, ax = plt.subplots(figsize=(9, 9))
sns.set(style="darkgrid")
sns.barplot(data=out_noprof_punishment_3, x="count", y="ngram", ax=ax)
ax.set_title('Common trigrams in "punishment" descriptions for non-faculty perpetrators')

# Leave room on the left for the long n-gram labels.
f.subplots_adjust(left=0.25)
display(f.figure)

In [82]:
generateNGramsFull(professors_spark, "career", 5)

In [83]:
generateNGramsFull(professors_spark, "mental", 5)

In [84]:
# Top n-grams for every text column and n in {3, 4, 5} (faculty-type perpetrators).
# Expression values inside a loop are not displayed by the notebook, so the
# resulting table is printed explicitly. Previously it was computed and discarded.
for c in ["event","punishment","response","mental","career","life"]:
  for n in [3,4,5]:
    print(c+ ',' + str(n)+'-grams,\n')
    print(generateNGramsFull(professors_spark, c, n).to_string(index=False))

In [85]:
all_with_indicators_spark.show(5)

# Training a Classifier

In [87]:
# #drop nulls!!!!

# from pyspark.sql.functions import col, isnan, when, trim

# print(all_with_indicators_spark.count())

# def to_null(c):
#     return when(~(col(c).isNull() | isnan(col(c)) | (trim(col(c)) == " ")), col(c))

# columnsToConsider = ['perpetrator', 'response', 'mental', 'life', 'event']

# all_with_indicators_spark.select([to_null(c).alias(c) for c in columnsToConsider]).na.drop()
# print(all_with_indicators_spark.count())


In [88]:
columnsToNullifyBlanks = ['perpetrator', 'career', 'response', 'mental', 'life', 'event']

from pyspark.sql.functions import col, when

# blank_as_null, blank_as_null_with_line and blank_as_null_with_2line are
# reused from the earlier cell that defines them; the identical copies that
# used to be re-declared here only shadowed those definitions.
for c in columnsToNullifyBlanks:
  for toNull in (blank_as_null, blank_as_null_with_line, blank_as_null_with_2line):
    all_with_indicators_spark = all_with_indicators_spark.withColumn(c, toNull(c))

# Keep the text columns plus the label, and drop rows with any null among them.
columnsToNullifyBlanks.append('IsProf')
keep = all_with_indicators_spark.select(columnsToNullifyBlanks)
all_with_indicators_spark = keep.na.drop(how="any")

In [89]:
all_with_indicators_spark.show(15)

In [90]:
all_with_indicators_spark.count()

In [91]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

def ClassifierWithVector(theData, col):
  """Train a random forest that predicts "IsProf" from one text column.

  The text is tokenised, stop words are removed, terms are counted
  (vocabSize=300, minDF=4.0) and a 64-tree random forest is fitted on an
  80/20 train/test split (split seed 1234). Prints the test-set accuracy and
  the feature-importance table.

  Args:
    theData: Spark DataFrame containing the text column `col` and the
      numeric label column "IsProf".
    col: name of the text column used as the only feature source.

  Returns:
    pandas DataFrame with columns "index" (vocabulary term) and
    "featureImportances", sorted by descending importance.
  """
  # 80% of the rows for training, 20% for testing.
  theData_train, theData_test = theData.randomSplit([0.8, 0.2], 1234)

  tokenizer = Tokenizer(inputCol=col, outputCol="words")

  # StopWordsRemover uses the English list by default. The previous call to
  # loadDefaultStopWords("english") only returned a list that was thrown away.
  stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")

  cv = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=300, minDF=4.0)
  rf = RandomForestClassifier(labelCol="IsProf", featuresCol="features", numTrees=64)

  pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, cv, rf])
  pipelineModel = pipeline.fit(theData_train)
  pipelinePredictions = pipelineModel.transform(theData_test)

  # Compute accuracy on the test set.
  evaluator = MulticlassClassificationEvaluator(labelCol="IsProf", predictionCol="prediction",metricName="accuracy")
  accuracy = evaluator.evaluate(pipelinePredictions)
  print("Test set accuracy = " + str(accuracy))

  # stages[-2] is the fitted CountVectorizerModel, stages[-1] the forest.
  vocabulary = pipelineModel.stages[-2].vocabulary
  # featureImportances is a Spark ML Vector; toArray() hands pandas a plain
  # NumPy array with one value per vocabulary term.
  importances = pipelineModel.stages[-1].featureImportances.toArray()
  featureImportances = pandas.DataFrame({"index": vocabulary, "featureImportances": importances}).sort_values("featureImportances", ascending=False)
  print(featureImportances)
  return featureImportances



In [92]:
event_prof_output= ClassifierWithVector(all_with_indicators_spark, "event")

In [93]:
response_prof_output = ClassifierWithVector(all_with_indicators_spark, "response")


In [94]:
# `.ix` was removed from pandas; with this integer index `.ix[0]` was a
# label lookup, so `.loc` is the equivalent (row labelled 0, column "index").
type(response_prof_output.loc[0, 'index'])

In [95]:
mental_prof_output = ClassifierWithVector(all_with_indicators_spark, "mental")

In [96]:
career_prof_output = ClassifierWithVector(all_with_indicators_spark, "career")


In [97]:
life_prof_output = ClassifierWithVector(all_with_indicators_spark, "life")

In [98]:
pandasDF2 = df.toPandas()
# Merge the three outcome descriptions. Every field is separated by a space so
# the last word of "life" and the first word of "mental" do not fuse together
# (the separator between those two was previously missing).
pandasDF2['combined'] = (pandasDF2['career'].astype(str) + ' '
                         + pandasDF2['life'].astype(str) + ' '
                         + pandasDF2['mental'].astype(str))
# Left is 1 when the merged text contains "left", "quit" or "change", else 0.
pandasDF2['Left'] = pandasDF2['combined'].str.contains("left|quit|change")
pandasDF2['Left'] = (pandasDF2['Left'] == True).astype(int)
pandasDF2['Left'].value_counts()

In [99]:
pandasDF2.rename(columns={'time':'time2'}, inplace=True)

In [100]:
left_with_indicators = sqlContext.createDataFrame(pandasDF2[['time2', 'Left']])
left_with_indicators_spark = df.join(left_with_indicators, df.time == left_with_indicators.time2, 'right')

In [101]:
columnsToNullifyBlanks = ['perpetrator', 'career', 'response', 'mental', 'life', 'event', 'punishment']

# blank_as_null, blank_as_null_with_line and blank_as_null_with_2line are
# reused from the earlier cells that define them; the identical copies that
# used to be re-declared here only shadowed those definitions.
for c in columnsToNullifyBlanks:
  for toNull in (blank_as_null, blank_as_null_with_line, blank_as_null_with_2line):
    left_with_indicators_spark = left_with_indicators_spark.withColumn(c, toNull(c))

# Keep the text columns plus the label, and drop rows with any null among them.
columnsToNullifyBlanks.append('Left')
keep = left_with_indicators_spark.select(columnsToNullifyBlanks)
left_with_indicators_spark = keep.na.drop(how="any")

In [102]:
def LeftClassifierWithVector(theData, col):
  """Train a random forest that predicts "Left" from one text column.

  The text is tokenised, stop words are removed, terms are counted
  (vocabSize=150, minDF=4.0) and a 64-tree random forest is fitted on an
  85/15 train/test split (split seed 123). Prints the test-set accuracy and
  the feature-importance table.

  Args:
    theData: Spark DataFrame containing the text column `col` and the
      numeric label column "Left".
    col: name of the text column used as the only feature source.

  Returns:
    pandas DataFrame with columns "index" (vocabulary term) and
    "featureImportances", sorted by descending importance.
  """
  # 85% of the rows for training, 15% for testing (the old comments said
  # 80/20, which did not match the split weights).
  theData_train, theData_test = theData.randomSplit([0.85, 0.15], 123)

  tokenizer = Tokenizer(inputCol=col, outputCol="words")

  # StopWordsRemover uses the English list by default. The previous call to
  # loadDefaultStopWords("english") only returned a list that was thrown away.
  stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")

  cv = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=150, minDF=4.0)
  rf = RandomForestClassifier(labelCol="Left", featuresCol="features", numTrees=64)

  pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, cv, rf])
  pipelineModel = pipeline.fit(theData_train)
  pipelinePredictions = pipelineModel.transform(theData_test)

  # Compute accuracy on the test set.
  evaluator = MulticlassClassificationEvaluator(labelCol="Left", predictionCol="prediction",metricName="accuracy")
  accuracy = evaluator.evaluate(pipelinePredictions)
  print("Test set accuracy = " + str(accuracy))

  # stages[-2] is the fitted CountVectorizerModel, stages[-1] the forest.
  vocabulary = pipelineModel.stages[-2].vocabulary
  # featureImportances is a Spark ML Vector; toArray() hands pandas a plain
  # NumPy array with one value per vocabulary term.
  importances = pipelineModel.stages[-1].featureImportances.toArray()
  featureImportances = pandas.DataFrame({"index": vocabulary, "featureImportances": importances}).sort_values("featureImportances", ascending=False)
  print(featureImportances)
  return featureImportances


In [103]:
response_left_output = LeftClassifierWithVector(left_with_indicators_spark, "response")

In [104]:
event_left_output = LeftClassifierWithVector(left_with_indicators_spark, "event")

In [105]:
punishment_left_output = LeftClassifierWithVector(left_with_indicators_spark, "punishment")

## Visualizations of important factors to predict outcomes (prof or no prof, left/quit or no)

In [107]:
import seaborn as sns
import matplotlib.pyplot as plt

# Twelve most important "event" terms for predicting the Left label.
viz_left_event = event_left_output.head(12)

f, ax = plt.subplots(figsize=(9, 9))
sns.set(style="darkgrid")
sns.barplot(data=viz_left_event, x="featureImportances", y="index", ax=ax)
ax.set_title('Event descriptors in sexual harassment reports that are predictive of "left"/"quit" outcomes')

# Leave room on the left for the term labels.
f.subplots_adjust(left=0.15)
display(f.figure)

In [108]:
import seaborn as sns
import matplotlib.pyplot as plt

# Twelve most important "response" terms for predicting the Left label.
viz_left_response = response_left_output.head(12)

f, ax = plt.subplots(figsize=(9, 9))
sns.set(style="darkgrid")
sns.barplot(data=viz_left_response, x="featureImportances", y="index", ax=ax)
ax.set_title('Response descriptors in sexual harassment reports that are predictive of "left"/"quit" outcomes')

# Leave room on the left for the term labels.
f.subplots_adjust(left=0.15)
display(f.figure)

In [109]:
import seaborn as sns
import matplotlib.pyplot as plt

# Twelve most important "punishment" terms for predicting the Left label.
viz_left_punishment = punishment_left_output.head(12)

f, ax = plt.subplots(figsize=(9, 9))
sns.set(style="darkgrid")
sns.barplot(data=viz_left_punishment, x="featureImportances", y="index", ax=ax)
ax.set_title('Punishment descriptors in sexual harassment reports that are predictive of "left"/"quit" outcomes')

# Leave room on the left for the term labels.
f.subplots_adjust(left=0.15)
display(f.figure)