# Natural Language Processing and Sentiment Analysis

## Set up and Start Spark session

In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [0]:
# Create (or reuse) the Spark session used by the rest of the notebook
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("MentalHealthNLP")
    .getOrCreate()
)

## Mount Google Drive into this runtime

In [0]:
# Mount Google Drive at /content/gdrive so the project folder and the shared
# output folder used later in the notebook are reachable from this runtime
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# Switch to the project folder on the mounted Drive.
# NOTE(review): the later `from config import ...` presumably relies on config.py living here — confirm.
%cd /content/gdrive/My Drive/data_final_project/mental_health_ML

## Query and load data from Postgres database

### Install and import dependencies

In [0]:
# Install SQLAlchemy (no version is pinned, so whatever pip resolves is used)
! pip install sqlalchemy

import sqlalchemy
# automap_base reflects existing tables into ORM classes; Session and
# create_engine are used in the following cells to connect and query.
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

### Connect to database

In [0]:
# Database credentials
from config import DB_USERNAME, DB_PASSWORD, DB_ENDPOINT

# Connect to database.
! pip install psycopg2-binary

rds_connection_string = f"{DB_USERNAME}:{DB_PASSWORD}@{DB_ENDPOINT}:5432/mental_health_tech_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [0]:
# Reflect the existing database schema into automapped ORM classes.
Base = automap_base()
try:
    # Current spelling (SQLAlchemy 1.4+). `reflect=True` is deprecated in 1.4
    # and no longer accepted in 2.0, which an unpinned install may provide.
    Base.prepare(autoload_with=engine)
except TypeError:
    # Older SQLAlchemy releases do not accept `autoload_with`; use the legacy call.
    Base.prepare(engine, reflect=True)

# Save reference to the survey table's mapped class and list everything that was mapped.
Survey = Base.classes.survey_responses
print(Base.classes.keys())

In [0]:
# Fetch (id, conversation_with_employer) for every survey response.
session = Session(engine)
try:
    surveys = session.query(
        Survey.id,
        Survey.conversation_with_employer,
    ).all()
finally:
    # Always release the connection, even if the query fails.
    session.close()

### Filter out null values (where survey question was unanswered)

In [0]:
# Keep only the responses where the question was answered. `is not None` is the
# idiomatic identity check for missing values (rather than `!= None`).
answered = [(survey[0], survey[1]) for survey in surveys if survey[1] is not None]

# Parallel lists: ids[i] is the survey id of conversations[i].
ids = [survey_id for survey_id, _ in answered]
conversations = [conversation for _, conversation in answered]

In [0]:
# Number of answered responses that will be scored
len(conversations)

## Sentiment Analysis Using VADER and TextBlob

Sentiment analysis tries to identify and extract opinions within a given text. Its goal is to gauge the attitudes, sentiments, evaluations, and emotions of a speaker or writer based on the computational treatment of subjectivity in the text.

For more information, see <https://medium.com/analytics-vidhya/simplifying-social-media-sentiment-analysis-using-vader-in-python-f9e6ec6fc52f>

### Calculate sentiment scores

In [0]:
# Install the two sentiment libraries used below
! pip install vaderSentiment
! pip install textblob

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Shared VADER analyser instance, used by sentiment_analyzer_scores()
analyser = SentimentIntensityAnalyzer()
from textblob import TextBlob

In [0]:
def sentiment_analyzer_scores(sentence):
    """Score ``sentence`` with the notebook's shared VADER analyser.

    Returns whatever ``analyser.polarity_scores`` produces; the rest of the
    notebook reads its "pos", "neg", "neu" and "compound" entries.
    """
    return analyser.polarity_scores(sentence)

In [0]:
# Quick sanity check of the scorer on a sample sentence
example_score = sentiment_analyzer_scores("The phone is super cool.")
example_score

In [0]:
# Compound-score cut-off separating positive / neutral / negative VADER classes
# (>= 0.05 positive, <= -0.05 negative, otherwise neutral).
VADER_THRESHOLD = 0.05

# Parallel result lists: index i corresponds to conversations[i].
positive_scores = []
negative_scores = []
neutral_scores = []
compound_scores = []
textblob_scores = []
conversation_classes_vader = []
conversation_classes_textblob = []

for conversation in conversations:
  # Run VADER once per conversation and read all four scores from that result.
  vader_scores = sentiment_analyzer_scores(conversation)
  compound_score = vader_scores["compound"]
  # First element of TextBlob's sentiment result is used as its score.
  textblob_score = TextBlob(conversation).sentiment[0]

  positive_scores.append(vader_scores["pos"])
  negative_scores.append(vader_scores["neg"])
  neutral_scores.append(vader_scores["neu"])
  compound_scores.append(compound_score)
  textblob_scores.append(textblob_score)

  # Exclusive branches guarantee a class is assigned on every iteration, so a
  # value from the previous conversation can never be carried over.
  if compound_score >= VADER_THRESHOLD:
    conversation_class_vader = 'positive'
  elif compound_score <= -VADER_THRESHOLD:
    conversation_class_vader = 'negative'
  else:
    conversation_class_vader = 'neutral'

  if textblob_score > 0:
    conversation_class_textblob = 'positive'
  elif textblob_score < 0:
    conversation_class_textblob = 'negative'
  else:
    conversation_class_textblob = 'neutral'

  conversation_classes_vader.append(conversation_class_vader)
  conversation_classes_textblob.append(conversation_class_textblob)

In [0]:
import pandas as pd

# One column per parallel list built in the scoring loop; row i describes conversations[i].
conversation_columns = {
    "id": ids,
    "conversation": conversations,
    "positive_score": positive_scores,
    "negative_score": negative_scores,
    "neutral_score": neutral_scores,
    "compound_score": compound_scores,
    "textblob_score": textblob_scores,
    "conversation_class_vader": conversation_classes_vader,
    "conversation_class_textblob": conversation_classes_textblob,
}
conversations_df = pd.DataFrame(conversation_columns)

conversations_df

In [0]:
# How many conversations fall into each VADER sentiment class
conversations_df["conversation_class_vader"].value_counts()

The positive, negative, and neutral scores represent the proportion of the text that falls into each of these categories. Together, the three scores should add up to 1.

The compound score is a metric that sums all the ratings and normalizes the result to lie between -1 (most extreme negative) and +1 (most extreme positive).

positive sentiment: compound score >= 0.05<br>
neutral sentiment: compound score > -0.05 and compound score < 0.05<br>
negative sentiment: compound score <= -0.05

In [0]:
# How many conversations fall into each TextBlob sentiment class
conversations_df["conversation_class_textblob"].value_counts()

In [0]:
# Convert the pandas DataFrame into a Spark DataFrame for the pyspark.ml steps below
conversations_spark_df = spark.createDataFrame(conversations_df)

conversations_spark_df.show()

### Feature Transformations

In [0]:
from pyspark.sql.functions import length

# Add the character length of each conversation as an extra model feature
conversation_length = length(conversations_spark_df['conversation'])
data_df = conversations_spark_df.withColumn('length', conversation_length)
data_df.show()

In [0]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

# Label stages: turn each sentiment-class string column into a numeric label column
pos_neg_to_num_vader = StringIndexer(
    inputCol='conversation_class_vader',
    outputCol='label_vader',
)
pos_neg_to_num_textblob = StringIndexer(
    inputCol='conversation_class_textblob',
    outputCol='label_textblob',
)

# Text stages, chained by column name:
# conversation -> token_text -> stop_tokens -> hash_token -> idf_token
tokenizer = Tokenizer(inputCol="conversation", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Combine the TF-IDF vector and the text length into a single 'features' column
feature_columns = ['idf_token', 'length']
clean_up = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [0]:
# Create the two data-processing pipelines (one per labelling method)
from pyspark.ml import Pipeline

# Both pipelines share the same text-processing stages and differ only in
# which sentiment-class column is indexed into the label.
text_feature_stages = [tokenizer, stopremove, hashingTF, idf, clean_up]

data_prep_pipeline_vader = Pipeline(stages=[pos_neg_to_num_vader] + text_feature_stages)
data_prep_pipeline_textblob = Pipeline(stages=[pos_neg_to_num_textblob] + text_feature_stages)

In [0]:
# Fit each pipeline on the full data set, then apply it to produce the
# label and feature columns (VADER labels first, then TextBlob labels)
cleaner_vader = data_prep_pipeline_vader.fit(data_df)
cleaned_vader = cleaner_vader.transform(data_df)

cleaner_textblob = data_prep_pipeline_textblob.fit(data_df)
cleaned_textblob = cleaner_textblob.transform(data_df)

In [0]:
# Rename the VADER label column to 'label' (the name the classifier and
# evaluator cells below rely on), then show label and resulting features
cleaned_vader = cleaned_vader.withColumnRenamed("label_vader","label")
cleaned_vader.select(['label', 'features']).show()

In [0]:
# Rename the TextBlob label column to 'label' (the name the classifier and
# evaluator cells below rely on), then show label and resulting features
cleaned_textblob = cleaned_textblob.withColumnRenamed("label_textblob","label")
cleaned_textblob.select(['label', 'features']).show()

In [0]:
from pyspark.ml.classification import NaiveBayes

# Fixed seed so the 90/10 split (and therefore the reported scores) can be
# reproduced when the notebook is re-run.
SPARK_SPLIT_SEED = 42

# Break data down into a training set and a testing set
training_vader, testing_vader = cleaned_vader.randomSplit([0.9, 0.1], seed=SPARK_SPLIT_SEED)
training_textblob, testing_textblob = cleaned_textblob.randomSplit([0.9, 0.1], seed=SPARK_SPLIT_SEED)

# Create a Naive Bayes estimator and fit one model per labelling method
nb = NaiveBayes()
predictor_vader = nb.fit(training_vader)
predictor_textblob = nb.fit(training_textblob)

In [0]:
# Apply the VADER-label model to the testing data and preview five predictions
test_results_vader = predictor_vader.transform(testing_vader)
test_results_vader.show(5)

In [0]:
# Apply the TextBlob-label model to the testing data and preview five predictions
test_results_textblob = predictor_textblob.transform(testing_textblob)
test_results_textblob.show(5)

In [0]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# The evaluator's default metric is F1, not accuracy; request accuracy
# explicitly so the number matches what the messages below report.
acc_eval = MulticlassClassificationEvaluator(metricName="accuracy")
acc_vader = acc_eval.evaluate(test_results_vader)
acc_textblob = acc_eval.evaluate(test_results_textblob)
print("Accuracy of model at predicting sentiment of conversation with VADER was: %f" % acc_vader)
print("Accuracy of model at predicting sentiment of conversation with textblob was: %f" % acc_textblob)

## Word Cloud

In [0]:
# Install the word-cloud rendering package (no version pinned)
! pip install wordcloud

In [0]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
% matplotlib inline

In [0]:
# Join every answered conversation into one string for the word cloud
text = " ".join(conversations)
# Count whitespace-separated words; len(text) would report characters instead
word_count = len(text.split())
print ("There are {} words in the combination of all conversations".format(word_count))

# Start from wordcloud's built-in stop words and add filler words from this survey
stopwords = set(STOPWORDS)
stopwords.update(["thing", "spoke", "day", "week", "make", "much", "something", "taking", "made", "go", "took", "take", "let", "going", "never", "things", "taken", "way", "manager"])

# Create and generate a word cloud image:
wordcloud = WordCloud(stopwords=stopwords, max_words=300, background_color="#333", width=3000, height=1000).generate(text)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [0]:
# Save the image in the shared Google Drive folder
# (requires the Drive mount at /content/gdrive from earlier in the notebook):
wordcloud.to_file("/content/gdrive/My Drive/shared_data_final_project/conversations_word_cloud.png")

## Naive Bayes Classifier

In [0]:
from textblob.classifiers import NaiveBayesClassifier, DecisionTreeClassifier

In [0]:
# Reminder of the columns available for the TextBlob classifiers
conversations_df.head()

In [0]:
# Keep only the text and its TextBlob-derived class: these become the
# (text, label) pairs the classifiers are trained on
naive_bayes_classifier_df = conversations_df[["conversation", "conversation_class_textblob"]]

naive_bayes_classifier_df.head()

In [0]:
# Drop neutral rows so only positive/negative examples remain
is_not_neutral = naive_bayes_classifier_df['conversation_class_textblob'] != "neutral"
naive_bayes_classifier_df = naive_bayes_classifier_df[is_not_neutral]

naive_bayes_classifier_df

In [0]:
# Shorten the class labels to "pos"/"neg". `.assign` returns a new DataFrame,
# which avoids writing into a filtered slice (SettingWithCopyWarning) and
# keeps the cell safe to re-run.
naive_bayes_classifier_df = naive_bayes_classifier_df.assign(
    conversation_class_textblob=naive_bayes_classifier_df["conversation_class_textblob"].replace({
        "positive": "pos",
        "negative": "neg"
    })
)

naive_bayes_classifier_df.head()

In [0]:
# One (conversation, label) tuple per row, the input shape used for the classifiers below
tuples = list(map(tuple, naive_bayes_classifier_df.to_numpy()))

In [0]:
# Number of labelled (non-neutral) examples available
len(tuples)

In [0]:
# Create training and testing data.
import random

# Fixed seed so the split (and the accuracies reported below) are reproducible.
SHUFFLE_SEED = 42

# Sample a shuffled copy instead of shuffling in place, so re-running this
# cell always yields the same split.
shuffled_tuples = random.Random(SHUFFLE_SEED).sample(tuples, len(tuples))

# Avoid naming this `length`: that would shadow pyspark.sql.functions.length,
# which an earlier cell imports and uses.
middle_index = len(shuffled_tuples) // 2

train = shuffled_tuples[:middle_index]
test = shuffled_tuples[middle_index:]

# Show both sizes (only a cell's last expression is displayed).
len(train), len(test)

In [0]:
# Upgrade textblob/nltk and download textblob's corpora.
# NOTE(review): textblob was already imported earlier in this notebook, so an
# upgraded version presumably only takes effect after a runtime restart — confirm.
! pip install -U textblob nltk
! python -m textblob.download_corpora

In [0]:
# Train TextBlob's Naive Bayes classifier on the (conversation, label) training tuples
cl = NaiveBayesClassifier(train)

In [0]:
# Classify a hand-written example sentence (training labels are "pos"/"neg")
cl.classify("I talked to my manager about mental health he was very supportive.")

In [0]:
# Accuracy of the Naive Bayes classifier on the held-out test tuples
cl.accuracy(test)

In [0]:
# List the 10 features the Naive Bayes classifier found most informative
cl.show_informative_features(10)

In [0]:
import joblib
# Persist the trained Naive Bayes classifier to the shared Drive folder (compress=9)
joblib.dump(cl, '/content/gdrive/My Drive/shared_data_final_project/nb_classifier2.pkl', compress=9)

## Decision Tree Classifier

In [0]:
# Train TextBlob's decision tree classifier on the same training tuples as the Naive Bayes model
dt_classifier = DecisionTreeClassifier(train)

In [0]:
# Accuracy of the decision tree classifier on the held-out test tuples
dt_classifier.accuracy(test)

In [0]:
# Classify a hand-written example sentence with the decision tree model
dt_classifier.classify('I talked to my manager about mental health he was very supportive')

In [0]:
import joblib
# Persist the decision tree classifier. Dumping `cl` here would write the
# Naive Bayes model into the decision-tree file.
joblib.dump(dt_classifier, '/content/gdrive/My Drive/shared_data_final_project/dt_classifier.pkl', compress=9)