# Import Libraries and Data

In [1]:
!pip install tweepy --upgrade
!pip install contractions
!pip install nltk
!pip install stop_words
!pip install vaderSentiment

import tweepy
import requests
import pandas as pd
import numpy as np

from google.colab import drive
drive.mount('/content/gdrive')
import sys
sys.path.insert(0,'/content/gdrive/My Drive/Colab Notebooks')
import config

import contractions
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from stop_words import get_stop_words
from nltk.tokenize.treebank import TreebankWordDetokenizer
import gensim.downloader as api
from nltk.stem import WordNetLemmatizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tweepy
  Downloading tweepy-4.12.1-py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.6/101.6 KB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests<3,>=2.27.0
  Downloading requests-2.28.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 KB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: requests, tweepy
  Attempting uninstall: requests
    Found existing installation: requests 2.25.1
    Uninstalling requests-2.25.1:
      Successfully uninstalled requests-2.25.1
  Attempting uninstall: tweepy
    Found existing installation: tweepy 3.10.0
    Uninstalling tweepy-3.10.0:
      Successfully uninstalled tweepy-3.10.0
Successfully installed requests-2.28.2 tweepy-4.12.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [2]:
!pip install keras-tuner
!pip install tensorflow-text
!pip install tensorflow
!pip install keras-preprocessing

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Input
from tensorflow.keras.layers import Embedding
from keras import optimizers
import keras_tuner
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.metrics import BinaryAccuracy, Precision, Recall

import tensorflow_hub as hub
import tensorflow_text as text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-tuner
  Downloading keras_tuner-1.2.1-py3-none-any.whl (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.6/169.6 KB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy
  Downloading kt_legacy-1.0.4-py3-none-any.whl (9.6 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: kt-legacy, jedi, keras-tuner
Successfully installed jedi-0.18.2 keras-tuner-1.2.1 kt-legacy-1.0.4
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-text
  Downloading tensorflow_text-2.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8

In [3]:
# Mute warnings
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [4]:
# Import twitter data
df3 = pd.read_csv('/content/gdrive/MyDrive/depressed_tweets.csv')
non_depressed = pd.read_csv('/content/gdrive/MyDrive/non_depressed.csv')
rdf2 = pd.read_csv('/content/gdrive/MyDrive/random_tweets.csv')

In [5]:
# Append datasets
rdf2 = rdf2[0:len(df3)-len(non_depressed)] # remove some tweets to balance data
non_depressed_tweets = rdf2[['tweet', 'depressed']].append(non_depressed[['tweet', 'depressed']])
tweets_df = df3[['tweet', 'depressed']].append(non_depressed_tweets)

In [6]:
#Show balanced data
tweets_df.depressed.value_counts()

1    1467
0    1467
Name: depressed, dtype: int64

# Data Preprocessing

The data preprocessing includes removing punctuation, lemmatizing tweets, and removing stopwords. Among the stopwords are the keywords used to identify tweets that indicate depression. 

In [7]:
# Remove punctuation
tweets_df["tweet"] = tweets_df['tweet'].str.replace('[^\w\s]|[0-9]','')
tweets_df.head()

# Tokenization #
tweets_df['tweet'] = tweets_df.apply(lambda row: nltk.word_tokenize(row['tweet']), axis=1)

# Stem words
lemmatizer = WordNetLemmatizer()

lemmatized_tweets = []
for tweet in tweets_df['tweet']:
  lemmatized_tweet = []
  for word in tweet:
    lemmatized_word = lemmatizer.lemmatize(word)
    lemmatized_tweet.append(lemmatized_word)
  lemmatized_tweets.append(lemmatized_tweet)
tweets_df['tweet'] = lemmatized_tweets

## Stopwords ##

# Combine get_stopwords with NLTK stop words
stop_words = ['feel', "like", 'depressed', 'depression', 'anxiety', 'antidepressant', 'antidepressants', 'feeling', 'felt'] #We used these words to search for tweets, thus we end up removing tweets
stop_words2 = list(get_stop_words('en'))
stop_words.extend(stop_words2)
nltk_stop_words = list(stopwords.words('english'))
stop_words.extend(nltk_stop_words)

# Remove stop words
sentences_list = []
for word_list in tweets_df['tweet']:
  filtered_sentence = []
  for word in word_list:
    if word not in stop_words:
      filtered_sentence.append(word)
  sentences_list.append(filtered_sentence)
tweets_df['tweet'] = sentences_list

  tweets_df["tweet"] = tweets_df['tweet'].str.replace('[^\w\s]|[0-9]','')


In [8]:
# Convert tweets back into text 
detokenized_tweets = []
for tweet in tweets_df.tweet:
  detokenized_tweets.append(TreebankWordDetokenizer().detokenize(tweet))
tweets_df['text'] = detokenized_tweets

In [9]:
#Convert to list
x = tweets_df['text'].to_list()
y = tweets_df['depressed'].to_list()

##Sentiment

In [10]:
# Append negative, neutral, positive, and composite sentiment scores to twitter data
sentiment_analyzer = SentimentIntensityAnalyzer() # Instantiate sentiment
tweets_df['sentiment_score'] = tweets_df['text'].apply(sentiment_analyzer.polarity_scores)
tweets_df = pd.concat([tweets_df.drop(['sentiment_score'], axis=1), tweets_df['sentiment_score'].apply(pd.Series)], axis=1)
tweets_df.head()

Unnamed: 0,tweet,depressed,text,neg,neu,pos,compound
0,"[working, home, today, making, corporate, smal...",1,working home today making corporate small task...,0.0,0.851,0.149,0.2732
1,"[need, one, night, go, skin]",1,need one night go skin,0.0,1.0,0.0,0.0
2,"[think, even, excited, antman, anymore]",1,think even excited antman anymore,0.0,0.625,0.375,0.34
3,"[fucking, extrovert, get, horribly, regular, i...",1,fucking extrovert get horribly regular interac...,0.508,0.492,0.0,-0.6326
4,"[severely, think, thing, help, current, state,...",1,severely think thing help current state mall p...,0.219,0.584,0.197,-0.0772


In [11]:
# Get the average sentiment scores by tweet classification
tweets_df.groupby(['depressed']).mean('compound')

Unnamed: 0_level_0,neg,neu,pos,compound
depressed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.123261,0.693561,0.182497,0.076347
1,0.189804,0.636767,0.170024,-0.019408


# Word Embedding

### Count Vectorizer (Bag of Words)

Takes into account the word frequencies in each tweet

In [12]:
# instantiate and fit/transform on count vectorizer
x_bow = CountVectorizer().fit_transform(x)

# Test and train split
x_train, x_test, y_train, y_test = train_test_split(x_bow, y, stratify = y, random_state=11)

###TF-IDF

Considers the important of a word in a tweet in relation to the total number of times the word appears in


In [13]:
x_tfidf = TfidfVectorizer().fit_transform(x)

# Test and train split
tfidf_x_train, tfidf_x_test, tfidf_y_train, tfidf_y_test = train_test_split(x_tfidf, y, stratify = y, random_state=11)

###GloVe

Uses a matrix factorization technique to account for the co-occurance between words at a global level


In [14]:
# Get word vectorizer from twitter
wv = api.load('glove-twitter-50')



In [15]:
# Function to convert text into vectorized form
def glove_vectorizer(text):
  vector_size = wv.vector_size
  wv_res = np.zeros(vector_size)
  count = 1
  for word in text:
    if word in wv:
      count +=1
      wv_res += wv[word]
  wv_res = wv_res/count
  return wv_res 
    
# Apply function to text
tweets_df['glove'] = tweets_df['text'].apply(glove_vectorizer)
tweets_df.head()

Unnamed: 0,tweet,depressed,text,neg,neu,pos,compound,glove
0,"[working, home, today, making, corporate, smal...",1,working home today making corporate small task...,0.0,0.851,0.149,0.2732,"[0.23994614269870979, 0.08996781630393787, 0.1..."
1,"[need, one, night, go, skin]",1,need one night go skin,0.0,1.0,0.0,0.0,"[0.36984789018568237, -0.17828817096979996, 0...."
2,"[think, even, excited, antman, anymore]",1,think even excited antman anymore,0.0,0.625,0.375,0.34,"[0.27604273209969205, -0.05447451062500477, 0...."
3,"[fucking, extrovert, get, horribly, regular, i...",1,fucking extrovert get horribly regular interac...,0.508,0.492,0.0,-0.6326,"[0.260807118652498, 0.12523734985905535, 0.148..."
4,"[severely, think, thing, help, current, state,...",1,severely think thing help current state mall p...,0.219,0.584,0.197,-0.0772,"[0.32182383696463973, 0.0006920882924036547, 0..."


In [16]:
# Test and train split
x2 = tweets_df['glove'].to_list()
y2 = tweets_df['depressed'].to_list()
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2, stratify = y2, random_state=11)

# Logistic Regression

###Count Vectorizer

In [18]:
## Grid Searching ##

#Get CV Score for Logistic Regression
log = LogisticRegression(max_iter = 1000)

params = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'penalty': ['none', 'l1', 'l2']}
grid = GridSearchCV(log, param_grid=params, cv=5)

#use meta model methods to fit score and predict model:
grid.fit(x_train, y_train)

#extract best score and parameter by calling objects "best_score_" and "best_params_"
print("Logistic Count Vectorizer best mean CV score: {:.3f}".format(grid.best_score_))
print("Logistic Count Vectorizer best parameters: {}".format(grid.best_params_))
print("Logistic Count Vectorizer test-set score: {:.3f}".format(grid.score(x_test, y_test)))

Logistic Count Vectorizer best mean CV score: 0.743
Logistic Count Vectorizer best parameters: {'penalty': 'l2', 'solver': 'lbfgs'}
Logistic Count Vectorizer test-set score: 0.749


20 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py", line 464, in _check_solver
    raise ValueError("penalty='none' is not supported for the liblinear solver")
ValueError: penalty='none' is not supported for the liblinear so

In [19]:
# Get Score for Logistic Regression
log = LogisticRegression(penalty= 'l2', solver= 'saga')
log.fit(x_train, y_train)

In [20]:
# Full report
log_predicted = log.predict(x_test)
print("Prediction report for Logistic Bag of Words Model ~")
print(classification_report(y_test, log_predicted))

              precision    recall  f1-score   support

           0       0.75      0.73      0.74       367
           1       0.74      0.75      0.74       367

    accuracy                           0.74       734
   macro avg       0.74      0.74      0.74       734
weighted avg       0.74      0.74      0.74       734



###TF-IDF Vectorizer

In [21]:
# Grid Search

log = LogisticRegression(max_iter = 1000)
params = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'penalty': ['none', 'l1', 'l2']}
grid = GridSearchCV(log, param_grid=params, cv=5)

# Use meta model methods to fit score and predict model:
grid.fit(tfidf_x_train, tfidf_y_train)

# Extract best score and parameter by calling objects "best_score_" and "best_params_"
print("Logistic TF IDF best mean CV score: {:.3f}".format(grid.best_score_))
print("Logistic TF IDF best parameters: {}".format(grid.best_params_))
print("Logistic TF IDF test-set score: {:.3f}".format(grid.score(tfidf_x_test, tfidf_y_test)))

Logistic TF IDF best mean CV score: 0.760
Logistic TF IDF best parameters: {'penalty': 'l2', 'solver': 'newton-cg'}
Logistic TF IDF test-set score: 0.770


20 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py", line 464, in _check_solver
    raise ValueError("penalty='none' is not supported for the liblinear solver")
ValueError: penalty='none' is not supported for the liblinear so

In [22]:
log = LogisticRegression(penalty= 'l2', solver= 'newton-cg')
log.fit(tfidf_x_train, tfidf_y_train)

Logistic TF IDF Vectorizer Prediction Score ->  75.89158345221112


In [23]:
# Full report
log2_predicted = log.predict(tfidf_x_test)
print("Prediction report for Logistic TF IDF Model ~")
print(classification_report(tfidf_y_test, log2_predicted))

              precision    recall  f1-score   support

           0       0.75      0.81      0.78       367
           1       0.80      0.72      0.76       367

    accuracy                           0.77       734
   macro avg       0.77      0.77      0.77       734
weighted avg       0.77      0.77      0.77       734



###GloVe

In [24]:
# Grid Search 
log = LogisticRegression(max_iter = 1000)
params = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'penalty': ['none', 'l1', 'l2']}
grid = GridSearchCV(log, param_grid=params, cv=5)

# Fit score and predict model:
grid.fit(x_train2, y_train2)

# Extract best score and parameter by calling objects "best_score_" and "best_params_"
print("Logistic GloVe best mean CV score: {:.3f}".format(grid.best_score_))
print("Logistic GloVe best parameters: {}".format(grid.best_params_))
print("Logistic GloVe test set score: {:.3f}".format(grid.score(x_test2, y_test2)))

20 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py", line 464, in _check_solver
    raise ValueError("penalty='none' is not supported for the liblinear solver")
ValueError: penalty='none' is not supported for the liblinear so

Logistic GloVe best mean CV score: 0.586
Logistic GloVe best parameters: {'penalty': 'none', 'solver': 'sag'}
Logistic GloVe test set score: 0.557


In [25]:
log = LogisticRegression(penalty= 'l2', solver= 'lbfgs')
log.fit(x_train2, y_train2)

Logistic Glove Prediction Score ->  52.22381635581061


In [26]:
# Full report
log3_predicted = log.predict(x_test2)
print("Prediction report for Logistic GloVe Model ~")
print(classification_report(y_test2, log3_predicted))

              precision    recall  f1-score   support

           0       0.54      0.60      0.57       367
           1       0.55      0.50      0.52       367

    accuracy                           0.55       734
   macro avg       0.55      0.55      0.55       734
weighted avg       0.55      0.55      0.55       734



# Support Vector Machine

###Count Vectorizer (bag of words)

Grid Searching

In [27]:
# SVM GridSearch 
SVM = svm.SVC()
params = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [100, 10, 1.0, 0.1, 0.001], 'gamma': [1,0.1,0.01,0.001]}
grid = GridSearchCV(SVM, param_grid=params, cv=5)

# Fit score and predict model:
grid.fit(x_train, y_train)

print("SVM Count Vect best mean CV score: {:.3f}".format(grid.best_score_))
print("SVM Count Vect best parameters: {}".format(grid.best_params_))
print("SVM Count Vect test set score: {:.3f}".format(grid.score(x_test, y_test)))

SVM Count Vect best mean CV score: 0.746
SVM Count Vect best parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
SVM Count Vect test set score: 0.741


In [28]:
# Fit the training dataset and predict on text
SVM = svm.SVC(C = 10, gamma = 0.01, kernel='rbf')
SVM.fit(x_train,y_train)

SVM Count Vectorizer Prediction Score ->  73.01136363636364


In [29]:
# Full report
svm_predicted = SVM.predict(x_test)
print("Classification report for SVM Count Vectorizer Model ~")
print(classification_report(y_test, svm_predicted))

              precision    recall  f1-score   support

           0       0.72      0.78      0.75       367
           1       0.76      0.70      0.73       367

    accuracy                           0.74       734
   macro avg       0.74      0.74      0.74       734
weighted avg       0.74      0.74      0.74       734



###TF-IDF Vectorizer

In [30]:
# TF IDF GridSearch
SVM = svm.SVC()
params = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [100, 10, 1.0, 0.1, 0.001], 'gamma': [1,0.1,0.01,0.001]}
grid = GridSearchCV(SVM, param_grid=params, cv=5)

# Fit and predict model:
grid.fit(tfidf_x_train, tfidf_y_train)

print("SVM TF IDF best mean CV score: {:.3f}".format(grid.best_score_))
print("SVM TF IDF best parameters: {}".format(grid.best_params_))
print("SVM TF IDF test-set score: {:.3f}".format(grid.score(tfidf_x_test, tfidf_y_test)))

SVM TF IDF best mean CV score: 0.760
SVM TF IDF best parameters: {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
SVM TF IDF test-set score: 0.760


In [31]:
# Fit the training dataset on the classifier
SVM = svm.SVC(C = 10, gamma = 1, kernel='rbf')
SVM.fit(tfidf_x_train,tfidf_y_train)

SVM TF-IDF Vectorizer Prediction Score ->  75.2112676056338


In [32]:
# Full report
svm_predicted2 = SVM.predict(tfidf_x_test)
print("Classification report for SVM TF-IDF Model ~")
print(classification_report(tfidf_y_test, svm_predicted2))

              precision    recall  f1-score   support

           0       0.74      0.79      0.77       367
           1       0.78      0.73      0.75       367

    accuracy                           0.76       734
   macro avg       0.76      0.76      0.76       734
weighted avg       0.76      0.76      0.76       734



###Glove

In [33]:
# Glove SVM GridSearch
SVM = svm.SVC()
params = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [100, 10, 1.0, 0.1, 0.001], 'gamma': [1,0.1,0.01,0.001]}
grid = GridSearchCV(SVM, param_grid=params, cv=5)
grid.fit(x_train2, y_train2)

# Extract best score and parameter by calling objects "best_score_" and "best_params_"
print("SVM best mean CV score: {:.3f}".format(grid.best_score_))
print("SVM best parameters: {}".format(grid.best_params_))
print("SVM test-set score: {:.3f}".format(grid.score(x_test2, y_test2)))

SVM best mean CV score: 0.623
SVM best parameters: {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
SVM test-set score: 0.604


In [35]:
# fit the training dataset on the classifier
SVM = svm.SVC(kernel='linear')
SVM.fit(x_train2, y_train2)
# Predict and score on test
svm_predictions2 = SVM.predict(x_test2)

SVM GloVe Score ->  52.67727930535456


In [36]:
# Full report
print("Classification report for SVM GloVe Model ~")
svm_predicted3 = SVM.predict(x_test2)
print(classification_report(y_test2, svm_predicted3))

              precision    recall  f1-score   support

           0       0.55      0.61      0.58       367
           1       0.56      0.50      0.53       367

    accuracy                           0.55       734
   macro avg       0.56      0.55      0.55       734
weighted avg       0.56      0.55      0.55       734



# Deep Neural Networks

### Keras Embedding

In [14]:
x3 = tweets_df['text']
y3 = tweets_df['depressed']

In [15]:
max_words = 100 # Defines the top n number of words in the dictionary
max_length = 10 # Length of each sequence
 
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(tweets_df['text'])
sequences = tokenizer.texts_to_sequences(tweets_df['text']) # Process the tweets into sequences

x3 = pad_sequences(sequences, maxlen=max_length) # Padding sequences converts characters to the same length

#Test and train split
x_train3, x_test3, y_train3, y_test3 = train_test_split(x3, tweets_df['depressed'], stratify = tweets_df['depressed'], random_state=11)

In [16]:
output_dim = 32

# Build model
model = Sequential() 
model.add(Embedding(input_dim = max_words, output_dim = output_dim, input_length=max_length)) # Dense classification layer
model.add(Flatten()) # Reshape embedding output
model.add(Dense(512, activation='relu')) #128 nodes <------- TUNE THIS
model.add(Dense(1, activation='sigmoid')) # 2 classes

# Fit and compile
metrics = [
    BinaryAccuracy(name='accuracy'),
    Precision(name='precision'),
    Recall(name='recall')]
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=metrics) 

results = model.fit(x_train3, 
                 y_train3, 
                 epochs=20, 
                 batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Neural Net W/3 Hidden Layers + 1 Dropout

In [17]:
hp = keras_tuner.HyperParameters()
metrics = [
    BinaryAccuracy(name='accuracy'),
    Precision(name='precision'),
    Recall(name='recall')]

In [18]:
x3 = tweets_df['text']
y3 = tweets_df['depressed']
 
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x3)
sequences = tokenizer.texts_to_sequences(x3) # Process the tweets into sequences

x3 = pad_sequences(sequences, maxlen=max_length) # Padding sequences converts characters to the same length

#Test and train split
x_train3, x_test3, y_train3, y_test3 = train_test_split(x3, y3, stratify = y3, random_state=11)

#Build model, compile, and fit
output_dim = 32
best_model = Sequential() 
best_model.add(Embedding(input_dim = max_words, output_dim = output_dim, input_length=max_length)) # Dense classification layer
best_model.add(Flatten()) # Reshape embedding output
best_model.add(Dense(units=hp.Choice('num_units', values=[16, 64, 32, 128, 256, 512, 1024, 2048], default=64), 
                activation='relu')) # tuning nodes
best_model.add(Dense(units=hp.Choice('num_units', values=[16, 64, 32, 128, 256, 512, 1024, 2048], default=64), 
                activation='relu'))
best_model.add(Dense(units=hp.Choice('num_units', values=[16, 64, 32, 128, 256, 512, 1024, 2048], default=64), 
                activation='relu'))       
best_model.add(Dropout(0.2))
best_model.add(Dense(1, activation='sigmoid')) # 2 classes

best_model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=metrics) 

results = best_model.fit(x_train3, 
                 y_train3, 
                 epochs=20, 
                 batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


##BERT w/3 hidden layers, 1 dropout layer

Treats the same words differently when in under different contexts

In [25]:
# Preprocess text
bert_preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3" # URL to preprocess text
encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4" #URL to encode

bert_preprocess_model = hub.KerasLayer(bert_preprocess_url) # BERT preprocessor
bert_encoder_model = hub.KerasLayer(encoder_url) # BERT encoder

# Initialize input layer using BERT preprocessed text
input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text') # input layer using preprocessed text
preprocessed_text = bert_preprocess_model(input)
output = bert_encoder_model(preprocessed_text)
# Initialize NN layers 
l = tf.keras.layers.Dense(units=hp.Choice('num_units', values=[16, 64, 32, 128, 256, 512, 1024, 2048], default=64), 
                activation='relu') # tuning nodes
l = tf.keras.layers.Dense(units=hp.Choice('num_units', values=[16, 64, 32, 128, 256, 512, 1024, 2048], default=64), 
                activation='relu')
l = tf.keras.layers.Dropout(0.2, name="dropout")(output['pooled_output']) # drop nodes to prevent overfitting; input BERT layer
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

model = tf.keras.Model(inputs=[input], outputs = [l])

model.summary()

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [26]:
# Split data
x_train4, x_test4, y_train4, y_test4 = train_test_split(tweets_df['text'], tweets_df['depressed'], stratify = tweets_df['depressed'], random_state=11)

model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=metrics)

results = model.fit(x_train4, 
                 y_train4, 
                 epochs=20, 
                 batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
