In [1]:
# Investigating key research questions in the Sexual Harassment in Academia: Results of a Crowdsourced Survey dataset
# Using NLP (topic modeling, classification, etc.) techniques with Spark, MLLIB, pandas, NLTK, and Python. 
# Final project for SI 618.

In [2]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer 
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.clustering import LDA
from pyspark.ml.pipeline import Pipeline
import numpy as np
import nltk

In [5]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords as nltkstopwords

In [19]:
df = spark.read.load("harassment_mainrawdata_4_4_18.csv",
                     format="csv", sep=",", inferSchema="true", header="true")

In [4]:
df.head()

Row(time,event,target,perpetrator,itype raw,institution,discipline,oldq,response,punishment,career,mental,life,comments,perpgender,oldlabel,someresponse,gendersquash,oldcleandiscipline,coarsediscipline,cleantarget,coarsetarget,itype,oldinstitution,label,cleaninstitution,cleandiscipline='3/17/2018 12')

In [6]:
from pyspark.sql.types import ArrayType, StringType

def indices_to_terms(vocabulary):
    def indices_to_terms(xs):
        return [vocabulary[int(x)] for x in xs]
    return udf(indices_to_terms, ArrayType(StringType()))

In [20]:
print(df.columns)

['time', 'event', 'target', 'perpetrator', 'itype raw', 'institution', 'discipline', 'oldq', 'response', 'punishment', 'career', 'mental', 'life', 'comments', 'perpgender', 'oldlabel', 'someresponse', 'gendersquash', 'oldcleandiscipline', 'coarsediscipline', 'cleantarget', 'coarsetarget', 'itype', 'oldinstitution', 'label', 'cleaninstitution', 'cleandiscipline']


In [25]:
df = df.na.drop()

In [26]:
def fullPipeline(kTerm):
  k = kTerm

  tokenizer = Tokenizer(inputCol='response', outputCol="words")
  
    
  stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")
  stopWordsRemover.loadDefaultStopWords("english")

  vectorizer = CountVectorizer(inputCol="filtered", outputCol="features", minDF=2) 

  print ("k = ",k)
  lda = LDA(k=6, maxIter=10)

  pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, vectorizer, lda])
  pipelineModel = pipeline.fit(df)

  countVectorModel = pipelineModel.stages[-2]
  cmv = countVectorModel.vocabulary
  print("Vocab length is",len(cmv))

  ldaModel = pipelineModel.stages[-1]

  # Assess the model
  df_lda = pipelineModel.transform(df)

  lp = ldaModel.logPerplexity(df_lda)
  print("Log perplexity  (lower is better): ",lp)
  ll = ldaModel.logLikelihood(df_lda)
  print("Log likelihood (higher is better): ",ll)
  # Describe topics.

  topics = ldaModel.describeTopics(8)

  topics = topics.withColumn(
      "topicWords", indices_to_terms(countVectorModel.vocabulary)("termIndices"))
  topics.select("topicWords").show(10,truncate=False)

fullPipeline(7)


k =  7
Vocab length is 19
Log perplexity  (lower is better):  4.642170904376941
Log likelihood (higher is better):  -324.95196300305986


NameError: name 'udf' is not defined