#Sentiment Analysis


In [None]:
#import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re
import string

In [None]:
df= pd.read_csv('mental_health.csv')

In [None]:
df.head(5)

Unnamed: 0,text,label
0,dear american teens question dutch person hear...,0
1,nothing look forward lifei dont many reasons k...,1
2,music recommendations im looking expand playli...,0
3,im done trying feel betterthe reason im still ...,1
4,worried year old girl subject domestic physic...,1


## Data Cleaning

In [None]:
#check null values
df.isnull().sum()

text     0
label    0
dtype: int64

In [None]:
# check duplicates
df.duplicated().sum()

5

In [None]:
#drop duplicates
df.drop_duplicates(inplace=True)

Spelling mistakes can be corrected. left for later if acuracy is low

In [None]:
def strip_emoji(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese characters
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

#Remove punctuations, links, mentions and \r\n new line characters
def strip_all_entities(text):
    text = text.replace('\r', '').replace('\n', ' ').replace('\n', ' ').lower() #remove \n and \r and lowercase
    text = re.sub(r"(?:\@|https?\://)\S+", "", text) #remove links and mentions
    text = re.sub(r'[^\x00-\x7f]',r'', text) #remove non utf8/ascii characters such as '\x9a\x91\x97\x9a\x97'
    banned_list= string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
    table = str.maketrans('', '', banned_list)
    text = text.translate(table)
    return text

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the # symbol
def clean_hashtags(text):
    new_text = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', text)) #remove last hashtags
    new_text2 = " ".join(word.strip() for word in re.split('#|_', new_text)) #remove hashtags symbol from words in the middle of the sentence
    return new_text2

#Filter special characters such as & and $ present in some words
def filter_chars(a):
    sent = []
    for word in a.split(' '):
        if ('$' in word) | ('&' in word):
            sent.append('')
        else:
            sent.append(word)
    return ' '.join(sent)

def remove_mult_spaces(text): # remove multiple spaces
    return re.sub("\s\s+" , " ", text)

In [None]:
df['text1'] = (df['text']
                     .apply(strip_emoji)
                     .apply(strip_all_entities)
                     .apply(clean_hashtags)
                     .apply(filter_chars)
                     .apply(remove_mult_spaces))

## Data Preprocessing

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
stopwords = set(['im'] + stopwords.words('english'))
def remove_stopwords(sentence):


    # Sentence converted to lowercase-only
    sentence = sentence.lower()

    words = sentence.split()
    no_words = [w for w in words if w not in stopwords]
    sentence = " ".join(no_words)

    return sentence

In [None]:
df['text1'] = df['text1'].apply(remove_stopwords)

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
df['text1'] = df['text1'].apply(lemmatizer.lemmatize)

In [None]:
#tokenize
import nltk
nltk.download("punkt")
nltk.download("wordnet")
#df['text1'] = df['text1'].apply(nltk.word_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
df.head(5)

Unnamed: 0,text,label,text1
0,dear american teens question dutch person hear...,0,dear american teens question dutch person hear...
1,nothing look forward lifei dont many reasons k...,1,nothing look forward lifei dont many reasons k...
2,music recommendations im looking expand playli...,0,music recommendations looking expand playlist ...
3,im done trying feel betterthe reason im still ...,1,done trying feel betterthe reason still alive ...
4,worried year old girl subject domestic physic...,1,worried year old girl subject domestic physica...


In [None]:
from sklearn.model_selection import train_test_split
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=15)

## Results

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import f1_score

def printResults(y_true, y_predicted):
  print("Accuracy= ", accuracy_score(y_true, y_predicted))

  columns=['false', 'true']

  ConfusionMatrixDisplay(confusion_matrix(y_true, y_predicted))

  precision, recall, fscore, support = score(y_true, y_predicted)

  print('###########################################')
  print('precision: {}'.format(precision))
  print('recall: {}'.format(recall))
  print('fscore: {}'.format(fscore))
  print('support: {}'.format(support))
  print('###########################################3')

  print('Macro F1 ',f1_score(y_true, y_predicted, average='macro'))

  print('Micro F1 ', f1_score(y_true, y_predicted, average='micro'))


## TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tf = TfidfVectorizer()

In [None]:
tf_x_train = tf.fit_transform(X_train)
tf_x_test = tf.transform(X_test)

In [None]:
X_train.shape

(25174,)

In [None]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

clfRF = RandomForestClassifier(n_estimators = 1000)
# Train the model on training data
clfRF.fit(tf_x_train, y_train);

In [None]:
testLabelsPredicted=list(clfRF.predict(tf_x_test))
printResults(testLabelsPredicted, y_test)

Accuracy=  0.8963545389563974
###########################################
precision: [0.88170564 0.91220238]
recall: [0.91571429 0.8769671 ]
fscore: [0.89838823 0.89423778]
support: [1400 1398]
###########################################3
Macro F1  0.8963130048450816
Micro F1  0.8963545389563973


In [None]:
new = ['I am using too  much social media']
pred = tf.transform(new)
clfRF.predict(pred)

array([0])

In [None]:
new = ["""iterally staying alive one thing onlythe reason
im still dog cant leave much want every single day feel like owe stick around take care god guarantee someone love take care way id gone instant"""]

pred = tf.transform(new)
clfRF.predict(pred)

array([1])