##Preparation

In [1]:
EnglishShortened_path = 'content/EnglishShortened.csv'

sentimental_depressed_path = 'content/sentimental_depressed.csv'
sentimental_non_depressed_path = 'content/sentimental_non_depressed.csv'

non_depressed_tweets_path = 'content/non_depressed_tweets.csv'
depressed_tweets_path = 'content/depressed_tweets.csv'

Import all necessary libraries

In [2]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('universal_tagset')

[nltk_data] Downloading package punkt to /home/eclipse/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/eclipse/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to /home/eclipse/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eclipse/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/eclipse/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/eclipse/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
import pandas as pd

Function to clean tweet (remove all punctuations, abbreviations of words, capital letters, etc.)

In [4]:
import re
import nltk

def clean_tweet(tweet):
  tweet = re.sub(r"\d+", "", tweet)
  
  tweet = tweet.replace(".", "")
  tweet = tweet.replace("(", "")
  tweet = tweet.replace(")", "")
  tweet = tweet.replace("'m", " am")
  tweet = tweet.replace("'s", " is")
  tweet = tweet.replace("'ve", " have")
  tweet = tweet.replace("n't", " not")
  tweet = tweet.replace("'re", " are")
  tweet = tweet.replace("'d", " would")
  tweet = tweet.replace("'ll", " will")
  tweet = tweet.replace("\r", " ")
  tweet = tweet.replace("\n", " ")
  tweet = tweet.strip().lower()
  
  tweet = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", tweet)
  return tweet

##Metric functions

In [5]:
import re
import nltk

def find_words_frequency(tweet, words_dict):
  tweet_clean = clean_tweet(tweet)
  token = re.findall('\w+', tweet_clean)
  words_freq = nltk.FreqDist(token)
  total_freq = 0
  for item in words_dict:
    item_freq = words_freq.freq(item)
    total_freq += item_freq
  return total_freq

Amount of absolutist words

In [6]:
def absolutist_words_metric(tweet):
  tweet_clean = clean_tweet(tweet)
  absolutist_words_dict = ["absolutely", "all", "always", "complete", "completely", "constant", "constantly", "definitely", "entire",
                           "ever", "every", "everyone", "everything", "full", "must", "never", "nothing", "totally", "whole"]
  return find_words_frequency(tweet_clean, absolutist_words_dict)

Amount of first-person pronouns (increases with depression) + amount of second & third pronouns (decreases with depression) 

In [7]:
def first_pronouns_metric(tweet):
  tweet_clean = clean_tweet(tweet)
  first_person_pronoun_dict = ["i", "me", "my", "mine", "we", "us", "our", "ours"]
  return find_words_frequency(tweet_clean, first_person_pronoun_dict)

def second_third_pronouns_metric(tweet):
  tweet_clean = clean_tweet(tweet)
  second_third_person_pronoun_dict = ["you", "your", "yours",
                                      "he", "she", "it", "him", "her", "his", "its", "hers",
                                      "they", "them", "their", "theirs"]
  return find_words_frequency(tweet_clean, second_third_person_pronoun_dict)

Polarity + subjectivity

In [8]:
from textblob import TextBlob

def TextBlob_metrics(tweet):
  tweet_clean = clean_tweet(tweet)
  blob = TextBlob(tweet_clean)
  for sentence in blob.sentences:
    polarity = sentence.sentiment.polarity
    subjectivity = sentence.sentiment.subjectivity
  return polarity, subjectivity

Level of emotions

In [9]:
import csv

def create_anew_dict():
  anew_dict = {}
  with open(EnglishShortened_path, mode='r') as infile:
    reader = csv.reader(infile)
    anew_dict = {rows[0]:[rows[1], rows[2], rows[3]] for rows in reader}
  return anew_dict

In [10]:
from nltk.stem import WordNetLemmatizer

def lemmatize_tweet(tweet_clean):
  token = re.findall('\w+', tweet_clean)
  lemmatized_words = []
  lemmatizer = WordNetLemmatizer()
  for word in token:
    lemmatized_words.append(lemmatizer.lemmatize(word))
  return lemmatized_words

In [11]:
def anew_metric(tweet):
  anew_dict = create_anew_dict()
  valence, arousal, dominance = 0, 0, 0
   
  tweet_clean = clean_tweet(tweet)
  tweet_words = lemmatize_tweet(tweet_clean)
  N_words_total  = len(tweet_words) 
   
  for index in range(N_words_total):
    # check for negation in 3 words before current word
    j = index-1
    neg = False
    while j >= 0 and j >= index-3:
      if tweet_words[j] == 'not' or tweet_words[j] == 'no':
        neg = True
        break
      j -= 1

    # search for lemmatized word in ANEW
    if tweet_words[index] in anew_dict.keys():
      if neg:
        valence += float(anew_dict[tweet_words[index]][0])
        arousal += float(anew_dict[tweet_words[index]][1])
        dominance +=  float(anew_dict[tweet_words[index]][2])
      else:
        valence += (10 - float(anew_dict[tweet_words[index]][0]))
        arousal += (10 - float(anew_dict[tweet_words[index]][1]))
        dominance += (10 - float(anew_dict[tweet_words[index]][2]))

  if N_words_total == 0:
    return 0, 0, 0
  else:
    return valence/N_words_total, arousal/N_words_total, dominance/N_words_total

Ratio of pronouns to nouns, Ratio of verbs to nouns, Ratio of the number of punctuation to the number of sentences

In [12]:
import nltk
import re

def pronominalisation_index(tweet):
  tweet_clean = clean_tweet(tweet)
  tokens = re.findall('\w+', tweet_clean)
  tags = nltk.pos_tag(tokens, tagset='universal')
  tags_freq = nltk.FreqDist(tag for (word, tag) in tags)
  if tags_freq['NOUN'] == 0:
    return 0
  else:
    return tags_freq['PRON']/tags_freq['NOUN']

def readiness_to_action_index(tweet):
  tweet_clean = clean_tweet(tweet)
  tokens = re.findall('\w+', tweet_clean)
  tags = nltk.pos_tag(tokens, tagset='universal')
  tags_freq = nltk.FreqDist(tag for (word, tag) in tags)
  if tags_freq['NOUN'] == 0:
    return 0
  else:
    return tags_freq["VERB"]/tags_freq["NOUN"]

from nltk.tokenize import sent_tokenize
import string

def punctuation_metric(tweet):
  count = lambda l1,l2: sum([1 for x in l1 if x in l2])
  num_punct = count(tweet,set(string.punctuation))          
  num_sentences = len(sent_tokenize(tweet))
  if num_sentences == 0:
    return 0
  else:
    return num_punct/num_sentences

Get vector based on all metrics

In [13]:
def get_tweet_vector(tweet):
  weight_abolutist_metric = 4
  weight_pronouns_metric = 2
  weight_textblob = 1
  weight_anew_metric = 1
  weight_pos_metric = 1

  absolutist = [absolutist_words_metric(tweet)]*weight_abolutist_metric
  pronouns = [first_pronouns_metric(tweet)]*weight_pronouns_metric + [second_third_pronouns_metric(tweet)]*weight_pronouns_metric
  textblob = [TextBlob_metrics(tweet)[0]]*weight_textblob + [TextBlob_metrics(tweet)[1]]*weight_textblob
  anew = [anew_metric(tweet)[0]]*weight_anew_metric + [anew_metric(tweet)[1]]*weight_anew_metric + [anew_metric(tweet)[2]]*weight_anew_metric
  pos = [pronominalisation_index(tweet)]*weight_pos_metric + [readiness_to_action_index(tweet)]*weight_pos_metric + [punctuation_metric(tweet)]*weight_pos_metric

  tweet_vector = absolutist + pronouns + textblob + anew + pos
  
  return tweet_vector               

In [14]:
import datetime
import ast

def get_tweet_data_vector(tweet_vector, timestamp, likes, retweets, sentimental):
  timestamp = timestamp[:19]
  timestamp = datetime.datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
  time_vector =  [timestamp.month, timestamp.day, timestamp.hour, timestamp.minute]

  tweet_vector = ast.literal_eval(str(tweet_vector))
  
  metric = tweet_vector + time_vector + [likes, retweets] + [sentimental]*5
  return metric

##Data preparation for ML

In [15]:
#read files
sentimental_depressed = pd.read_csv(sentimental_depressed_path)
sentimental_non_depressed = pd.read_csv(sentimental_non_depressed_path)

depressed_tweets = pd.read_csv(depressed_tweets_path)
non_depressed_tweets = pd.read_csv(non_depressed_tweets_path)

In [16]:
depressed_tweets['tweet_vector'] = depressed_tweets['tweet'].apply(get_tweet_vector)
non_depressed_tweets['tweet_vector'] = non_depressed_tweets['tweet'].apply(get_tweet_vector)

In [26]:
#add sentimental_value column to data frames
depressed_tweets['sentimental_value'] = sentimental_depressed['0']
non_depressed_tweets['sentimental_value'] = sentimental_non_depressed
#create status column to separate depressed and non-depressed cases
depressed_tweets['status'] = 1
non_depressed_tweets['status'] = 0

In [28]:
#merge dataframes with depressed and non depressed tweets
all_tweets = depressed_tweets.append(non_depressed_tweets, ignore_index=True)

  all_tweets = depressed_tweets.append(non_depressed_tweets, ignore_index=True)


In [29]:
#create vector for each tweet based on metrics + timestamp + sentimental analysis + amount of likes and retweets
all_tweets['tweet_vector'] = all_tweets.apply(lambda x: get_tweet_data_vector(x.tweet_vector, x.timestamp, x.favorite_count, x.retweet_count, x.sentimental_value), axis=1)

In [30]:
#cretating several columns for tweet vector values
all_tweets['tweet_vector'] = all_tweets.apply(lambda x: ast.literal_eval(str(x.tweet_vector)), axis='columns')
all_tweets = all_tweets.join(all_tweets['tweet_vector'].apply(pd.Series))

In [31]:
#calculate average for one user tweets
all_tweets_average = pd.DataFrame(all_tweets.groupby('userID').mean())

##Classification Model

Prepare dataset for classification model

In [32]:
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import ast

In [33]:
X  = all_tweets_average[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]]
y = all_tweets_average['status']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, shuffle=True)

Model

In [35]:
rfc = RandomForestClassifier(random_state=0, max_features='auto', n_estimators=500, max_depth=8, criterion='entropy')

In [36]:
rfc.fit(X_train, y_train)
y_pred= rfc.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy of Random Forest Classifier is %s"%acc)

  warn(


Accuracy of Random Forest Classifier is 0.978515625


In [37]:
cross_validate(rfc, X, y, scoring='accuracy')

  warn(
  warn(
  warn(
  warn(
  warn(


{'fit_time': array([2.06772327, 2.10412908, 2.08652258, 2.06098294, 2.06627655]),
 'score_time': array([0.05565143, 0.05115366, 0.04776049, 0.05018139, 0.04881406]),
 'test_score': array([0.98046875, 0.96484375, 0.96868885, 0.9667319 , 0.98238748])}

In [38]:
rfc.predict_proba(X_test)

array([[9.94085324e-01, 5.91467638e-03],
       [9.99940573e-01, 5.94268477e-05],
       [8.35223394e-01, 1.64776606e-01],
       ...,
       [9.97875356e-01, 2.12464417e-03],
       [9.79178588e-01, 2.08214125e-02],
       [1.00680950e-01, 8.99319050e-01]])

In [2]:
import pickle

In [1]:
with open('../web/model/model_pkl', 'wb') as files:
    pickle.dump(rfc, files)

NameError: name 'pickle' is not defined

#One test user scenario

As input we have dataframe with 20 rows (tweets) and next columns: userID, lang (only english (en)), tweet, timestamp, favorite_count, retweet_count.

We need to do implement sentimental analysis for this tweets and get another dataframe with 1 column and value of sentimental analysis result

In [3]:
user_tweets_path = "/content/jesseayye.csv"
sentimental_path = "/content/sent_jesseayye.csv"

tweets = pd.read_csv(user_tweets_path)
sentimental = pd.read_csv(sentimental_path)

NameError: name 'pd' is not defined

Calculate metrics for each tweet

In [None]:
#get column with list of metrics
tweets['tweet_vector'] = tweets['tweet'].apply(get_tweet_vector)

In [None]:
#add sentimental analysis value
tweets['sentimental_value'] = sentimental['0']

In [None]:
#get vector based on metric vector + timestamp, sentimental analysis, likes and retweets
tweets['tweet_vector'] = tweets.apply(lambda x: get_tweet_data_vector(x.tweet_vector, x.timestamp, x.favorite_count, x.retweet_count, x.sentimental_value), axis=1)

In [None]:
#cretae several columns for each vector value
tweets['tweet_vector'] = tweets.apply(lambda x: ast.literal_eval(str(x.tweet_vector)), axis='columns')
tweets = tweets.join(tweets['tweet_vector'].apply(pd.Series))

In [None]:
#count average values for this user
tweets_average = pd.DataFrame(tweets.groupby('userID').mean())

Fit classifier

In [None]:
X = tweets_average[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]]

In [None]:
depr_prob = rfc.predict_proba(X)

In [None]:
print("Depression probability: " + str(round(depr_prob[0][1]*100, 2)) + "%")

Depression probability: 0.53%
