In [41]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
%%capture
import numpy as np
import pandas as pd

# Importing sklearn library for Classification Model, Cross Validation, Text Features Extraction and Ranking of best features.
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import shuffle
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_validate
#from sklearn.externals import joblib 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from  sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer

# Importing nltk library for Sentiment score and POS tagging 
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus.reader import TaggedCorpusReader 
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
nltk.download('stopwords')
from nltk.corpus import stopwords

import re

# For plotting graphs
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pickle

# To ignore warnings
import warnings
warnings.filterwarnings("ignore")

import string

import statsmodels.api as sm


In [43]:
dataframe = pd.read_excel('/content/drive/MyDrive/Mental_health_prediction/Data_preparation/Dataset-raw.xlsx')

print(dataframe.columns)
happy = dataframe[dataframe.label == 1]
angry = dataframe[dataframe.label == 2]
depressed = dataframe[dataframe.label == 3]
fear = dataframe[dataframe.label == 4]

happy = happy.iloc[0:2100, ]
angry = angry.iloc[0:2100, ]
depressed = depressed.iloc[0:2100, ]
fear = fear.iloc[0:2100, ]

frames = [ happy, angry,depressed, fear]

dataframe = pd.concat(frames)
dataframe = shuffle(dataframe)
dataframe.reset_index(inplace=True, drop=True)
print(dataframe.head())


Index(['alt', 'en', 'firstComment', 'imageUrl', 'likesCount', 'locationName',
       'ownerUsername', 'timestamp', 'url', 'faceCount', 'H', 'S', 'V',
       'label', 'followersCount', 'followsCount', 'bio', 'postsCount', 'lc/fc',
       'fc/foc', 'animal', 'in-outdoor', 'text', 'people', 'other'],
      dtype='object')
                                                 alt  en  ... people other
0                           Image may contain: shoes  en  ...      0   NaN
1                            Image may contain: food  en  ...      0   NaN
2                    No photo description available.  en  ...      0   NaN
3  Image may contain: one or more people, people ...  en  ...      1   0.0
4                    No photo description available.  en  ...      0   NaN

[5 rows x 25 columns]


In [44]:
# dropping column
col = ['locationName', 'ownerUsername', 'alt', 'en',  'imageUrl', 'timestamp', 'url', "bio"]
dataframe = dataframe.drop(col, axis=1)

cols = [ 'label','firstComment', 'textWeight', 'H', 'S','V', 'followersCount', 'fc/foc', 'likesCount',
        'faceCount',  'followsCount', 'postsCount', 'lc/fc', 'in-outdoor']
dataframe = dataframe.reindex(columns = cols)

#print(dataframe.head(3))
print(dataframe.columns)


Index(['label', 'firstComment', 'textWeight', 'H', 'S', 'V', 'followersCount',
       'fc/foc', 'likesCount', 'faceCount', 'followsCount', 'postsCount',
       'lc/fc', 'in-outdoor'],
      dtype='object')


In [45]:
dataframe['S']=dataframe['S'].map(lambda x: x*100)

In [47]:
#text weight from tf-idf
v = TfidfVectorizer(analyzer=lambda d: d.split(', ')).fit(dataframe['firstComment'])
tfidf = v.fit_transform(dataframe['firstComment'])
#print(v.get_feature_names())
transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(tfidf)
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': v.get_feature_names(), 'weight': weights})

dataframe['textWeight'] = weights_df.weight
print(dataframe.head())
#.sort_values(by='weight', ascending=True)
print(weights_df.head())

   label  ... in-outdoor
0      1  ...          0
1      3  ...          0
2      3  ...          0
3      4  ...          2
4      3  ...          0

[5 rows x 14 columns]
                                                term    weight
0                                                     0.000144
1  \nBut this insecurities will creep in. \nI'm r...  0.000042
2  \nHow could you downgrade a signature of Kokot...  0.000042
3  \nMemories are made. \nMemories are worth more...  0.000060
4  \nNAKED AND NOT ASHAMED.\n\nA must read for ev...  0.000049


In [48]:
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)


In [49]:
#removal of stop words, punctutions, emoji
dataframe["text_lower"] = dataframe["firstComment"].str.lower()
PUNCT_TO_REMOVE = string.punctuation
dataframe["text_wo_punct"] = dataframe["text_lower"].apply(lambda text: remove_punctuation(text))
#print(dataframe.head())

", ".join(stopwords.words('english'))
STOPWORDS = set(stopwords.words('english'))
dataframe["text_wo_stop"] = dataframe["text_wo_punct"].apply(lambda text: remove_stopwords(text))
#print(dataframe.head())

dataframe["text_wo_emoji"]=dataframe["text_wo_stop"].apply(lambda text: remove_emoji(text))
print(dataframe.head())


   label  ...                                      text_wo_emoji
0      1  ...  got get fake adidas holiday danang vietnam fam...
1      3  ...  crap disgusting ingredients tomatoe sauce pine...
2      3  ...  trash doggo strikes took dogs run around large...
3      4  ...  disney spooky season • • • disneyland disney d...
4      3  ...  beer labels everyone handlettered handletterin...

[5 rows x 18 columns]


In [50]:
#POS Tagging
for index,row in dataframe.iterrows():
      parag=dataframe.loc[index,'text_wo_emoji']
      para=nltk.tokenize.word_tokenize(parag)
      tagged = nltk.pos_tag(para)
      adverb=0
      adjective=0
      verb=0
      noun=0
      mylist=[]
      for i in range(len(tagged)):
        if tagged[i][1]=='NN' or  tagged[i][1]=='NNS' or tagged[i][1]=='NNP' or tagged[i][1]=='NNPS':
          noun=noun+1
        elif tagged[i][1]=='VB' or tagged[i][1]=='VBD' or tagged[i][1]=='VBN' or tagged[i][1]=='VBP' or tagged[i][1]=='VBZ' or tagged[i][1]=='VBG':
          verb=verb+1
        elif tagged[i][1]=='JJ' or tagged[i][1]=='JJR' or tagged[i][1]=='JJS':
          adjective=adjective+1
        elif tagged[i][1]=='RB' or tagged[i][1]=='RBR' or tagged[i][1]=='RBS':
          adverb=adverb+1
        else:
          pass
      dataframe.loc[index,'noun']=noun
      dataframe.loc[index,'verb']=verb
      dataframe.loc[index,'adjective']=adjective
      dataframe.loc[index,'adverb']=adverb
   
dataframe.head()

Unnamed: 0,label,firstComment,textWeight,H,S,V,followersCount,fc/foc,likesCount,faceCount,followsCount,postsCount,lc/fc,in-outdoor,text_lower,text_wo_punct,text_wo_stop,text_wo_emoji,noun,verb,adjective,adverb
0,1,Got to get me some fake Adidas on holiday #dan...,0.000144,115.294434,25.465935,56.902005,248.0,1.059829,27,0,234.0,20.0,0.108871,0,got to get me some fake adidas on holiday #dan...,got to get me some fake adidas on holiday dana...,got get fake adidas holiday danang vietnam fam...,got get fake adidas holiday danang vietnam fam...,7.0,2.0,3.0,0.0
1,3,This crap is disgusting Ingredients: Tomatoe ...,4.2e-05,91.174774,46.178553,60.101157,51.0,5.666667,3,1,9.0,5.0,0.058824,0,this crap is disgusting ingredients: tomatoe ...,this crap is disgusting ingredients tomatoe s...,crap disgusting ingredients tomatoe sauce pine...,crap disgusting ingredients tomatoe sauce pine...,8.0,5.0,3.0,0.0
2,3,Trash doggo strikes again!\n\nI took the dogs ...,4.2e-05,245.955231,33.422497,20.834526,661.0,1.188849,14,1,556.0,620.0,0.02118,0,trash doggo strikes again!\n\ni took the dogs ...,trash doggo strikes again\n\ni took the dogs t...,trash doggo strikes took dogs run around large...,trash doggo strikes took dogs run around large...,64.0,26.0,24.0,5.0
3,4,Disney at spooky season \n•\n•\n•\n#disneyland...,6e-05,121.398659,21.388584,70.055434,422.0,1.661417,33,0,254.0,762.0,0.078199,2,disney at spooky season \n•\n•\n•\n#disneyland...,disney at spooky season \n•\n•\n•\ndisneyland ...,disney spooky season • • • disneyland disney d...,disney spooky season • • • disneyland disney d...,14.0,2.0,2.0,0.0
4,3,Beer labels for everyone!\n\n#handlettered #ha...,4.9e-05,123.905922,16.080843,69.243296,59.0,0.808219,10,0,73.0,33.0,0.169492,0,beer labels for everyone!\n\n#handlettered #ha...,beer labels for everyone\n\nhandlettered handl...,beer labels everyone handlettered handletterin...,beer labels everyone handlettered handletterin...,7.0,2.0,0.0,0.0


In [51]:
#sentiment analyser for analysing positivity score and negativity score
sid = SentimentIntensityAnalyzer()
for index,row in dataframe.iterrows():
  parag=dataframe.loc[index,'text_wo_emoji']
  scores = sid.polarity_scores(parag)
  dataframe.loc[index,'compound']=scores['compound']
  dataframe.loc[index,'neg']=scores['neg']
  dataframe.loc[index,'neu']=scores['neu']
  dataframe.loc[index,'pos']=scores['pos']
print(dataframe.head())

   label                                       firstComment  ...    neu    pos
0      1  Got to get me some fake Adidas on holiday #dan...  ...  0.633  0.171
1      3  This crap is disgusting  Ingredients: Tomatoe ...  ...  0.539  0.000
2      3  Trash doggo strikes again!\n\nI took the dogs ...  ...  0.794  0.130
3      4  Disney at spooky season \n•\n•\n•\n#disneyland...  ...  0.778  0.000
4      3  Beer labels for everyone!\n\n#handlettered #ha...  ...  0.677  0.000

[5 rows x 26 columns]


In [52]:
#dropping superfluous columns
dataframe = dataframe.drop([ 'text_lower', 'text_wo_punct', 'text_wo_stop'], axis=1)
#print(dataframe.head())
dataframe.head()

Unnamed: 0,label,firstComment,textWeight,H,S,V,followersCount,fc/foc,likesCount,faceCount,followsCount,postsCount,lc/fc,in-outdoor,text_wo_emoji,noun,verb,adjective,adverb,compound,neg,neu,pos
0,1,Got to get me some fake Adidas on holiday #dan...,0.000144,115.294434,25.465935,56.902005,248.0,1.059829,27,0,234.0,20.0,0.108871,0,got get fake adidas holiday danang vietnam fam...,7.0,2.0,3.0,0.0,-0.1027,0.196,0.633,0.171
1,3,This crap is disgusting Ingredients: Tomatoe ...,4.2e-05,91.174774,46.178553,60.101157,51.0,5.666667,3,1,9.0,5.0,0.058824,0,crap disgusting ingredients tomatoe sauce pine...,8.0,5.0,3.0,0.0,-0.8555,0.461,0.539,0.0
2,3,Trash doggo strikes again!\n\nI took the dogs ...,4.2e-05,245.955231,33.422497,20.834526,661.0,1.188849,14,1,556.0,620.0,0.02118,0,trash doggo strikes took dogs run around large...,64.0,26.0,24.0,5.0,0.8316,0.075,0.794,0.13
3,4,Disney at spooky season \n•\n•\n•\n#disneyland...,6e-05,121.398659,21.388584,70.055434,422.0,1.661417,33,0,254.0,762.0,0.078199,2,disney spooky season • • • disneyland disney d...,14.0,2.0,2.0,0.0,-0.6124,0.222,0.778,0.0
4,3,Beer labels for everyone!\n\n#handlettered #ha...,4.9e-05,123.905922,16.080843,69.243296,59.0,0.808219,10,0,73.0,33.0,0.169492,0,beer labels everyone handlettered handletterin...,7.0,2.0,0.0,0.0,-0.6486,0.323,0.677,0.0


In [53]:
dataframe.to_excel('/content/drive/MyDrive/Mental_health_prediction/Data_preparation/DatasetFinal.xlsx', sheet_name='sheet1', index = False)