In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from textblob import TextBlob # text processing
from textblob import Blobber
import nltk
nltk.download('stopwords') # Download stopwords list
from nltk.corpus import stopwords

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/depression-reddit-cleaned/depression_dataset_reddit_cleaned.csv


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Notebook adapted from NLP Depression classifier (local Kaggle import of data).

In [2]:
data = pd.read_csv("/kaggle/input/depression-reddit-cleaned/depression_dataset_reddit_cleaned.csv")
df = data.copy()
df.head()

Unnamed: 0,clean_text,is_depression
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1


#### **Get column integer location**

In [3]:
df.columns.get_loc('clean_text')

0

### **Stop Words**  
Although the data set is relatively clean, we can apply stop words for further processing (words that do not count in linguistic analysis). The most common SEO stop words are pronouns, articles, prepositions, and conjunctions.

In [4]:
# datacheck for stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
type(stopwords)

nltk.corpus.reader.wordlist.WordListCorpusReader

In [6]:
# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
stop = stopwords.words('english')
df['aaj_clean_text'] = df['clean_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

df.head()

Unnamed: 0,clean_text,is_depression,aaj_clean_text
0,we understand that most people who reply immed...,1,understand people reply immediately op invitat...
1,welcome to r depression s check in post a plac...,1,welcome r depression check post place take mom...
2,anyone else instead of sleeping more when depr...,1,anyone else instead sleeping depressed stay ni...
3,i ve kind of stuffed around a lot in my life d...,1,kind stuffed around lot life delaying inevitab...
4,sleep is my greatest and most comforting escap...,1,sleep greatest comforting escape whenever wake...


In [7]:
# get new coloumn location
df.columns.get_loc('aaj_clean_text')

2

### **Add Sentiment**

In [8]:
def add_sentiment_to_df(df):
    sentiment_tuple = []
    
    for x in range(0, df.shape[0]):
        QuantTextBlob = TextBlob(df.iloc[x][2])
        measures = QuantTextBlob.sentiment
        sentiment_tuple.append(measures)
    df['textScore'] = sentiment_tuple
    return df

In [9]:
add_sentiment_to_df(df)

Unnamed: 0,clean_text,is_depression,aaj_clean_text,textScore
0,we understand that most people who reply immed...,1,understand people reply immediately op invitat...,"(0.08423097251585628, 0.47574083861874544)"
1,welcome to r depression s check in post a plac...,1,welcome r depression check post place take mom...,"(-0.03949074074074075, 0.5401388888888888)"
2,anyone else instead of sleeping more when depr...,1,anyone else instead sleeping depressed stay ni...,"(0.09444444444444444, 0.18888888888888888)"
3,i ve kind of stuffed around a lot in my life d...,1,kind stuffed around lot life delaying inevitab...,"(0.205, 0.7)"
4,sleep is my greatest and most comforting escap...,1,sleep greatest comforting escape whenever wake...,"(0.1630952380952381, 0.35793650793650794)"
...,...,...,...,...
7726,is that snow,0,snow,"(0.0, 0.0)"
7727,moulin rouge mad me cry once again,0,moulin rouge mad cry,"(-0.625, 1.0)"
7728,trying to shout but can t find people on the list,0,trying shout find people list,"(0.0, 0.0)"
7729,ughh can t find my red sox hat got ta wear thi...,0,ughh find red sox hat got ta wear creepy nick ...,"(-0.25, 0.5)"


In [10]:
print(df['aaj_clean_text'][2])

anyone else instead sleeping depressed stay night avoid next day coming sooner may social anxiety life much peaceful everyone else asleep expecting thing


### **Get POLARITY** [-1.0,1.0]

In [11]:
def add_polarity_to_df(df):
    polarity_list = []
    
    for x in range (0, df.shape[0]):
        QuantTextBlob = TextBlob(df.iloc[x][2])
        measures = QuantTextBlob.sentiment.polarity
        polarity_list.append(measures)
    df['textPolarity'] = polarity_list
    return df

In [12]:
add_polarity_to_df(df)

Unnamed: 0,clean_text,is_depression,aaj_clean_text,textScore,textPolarity
0,we understand that most people who reply immed...,1,understand people reply immediately op invitat...,"(0.08423097251585628, 0.47574083861874544)",0.084231
1,welcome to r depression s check in post a plac...,1,welcome r depression check post place take mom...,"(-0.03949074074074075, 0.5401388888888888)",-0.039491
2,anyone else instead of sleeping more when depr...,1,anyone else instead sleeping depressed stay ni...,"(0.09444444444444444, 0.18888888888888888)",0.094444
3,i ve kind of stuffed around a lot in my life d...,1,kind stuffed around lot life delaying inevitab...,"(0.205, 0.7)",0.205000
4,sleep is my greatest and most comforting escap...,1,sleep greatest comforting escape whenever wake...,"(0.1630952380952381, 0.35793650793650794)",0.163095
...,...,...,...,...,...
7726,is that snow,0,snow,"(0.0, 0.0)",0.000000
7727,moulin rouge mad me cry once again,0,moulin rouge mad cry,"(-0.625, 1.0)",-0.625000
7728,trying to shout but can t find people on the list,0,trying shout find people list,"(0.0, 0.0)",0.000000
7729,ughh can t find my red sox hat got ta wear thi...,0,ughh find red sox hat got ta wear creepy nick ...,"(-0.25, 0.5)",-0.250000


In [13]:
# not getting an error but worked previously before applying stopwords

def getPolarity(score):
    if score < 0: 
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'
        
    df['Polarity'] = df['textPolarity'].apply(getPolarity)
    return df

In [14]:
df

Unnamed: 0,clean_text,is_depression,aaj_clean_text,textScore,textPolarity
0,we understand that most people who reply immed...,1,understand people reply immediately op invitat...,"(0.08423097251585628, 0.47574083861874544)",0.084231
1,welcome to r depression s check in post a plac...,1,welcome r depression check post place take mom...,"(-0.03949074074074075, 0.5401388888888888)",-0.039491
2,anyone else instead of sleeping more when depr...,1,anyone else instead sleeping depressed stay ni...,"(0.09444444444444444, 0.18888888888888888)",0.094444
3,i ve kind of stuffed around a lot in my life d...,1,kind stuffed around lot life delaying inevitab...,"(0.205, 0.7)",0.205000
4,sleep is my greatest and most comforting escap...,1,sleep greatest comforting escape whenever wake...,"(0.1630952380952381, 0.35793650793650794)",0.163095
...,...,...,...,...,...
7726,is that snow,0,snow,"(0.0, 0.0)",0.000000
7727,moulin rouge mad me cry once again,0,moulin rouge mad cry,"(-0.625, 1.0)",-0.625000
7728,trying to shout but can t find people on the list,0,trying shout find people list,"(0.0, 0.0)",0.000000
7729,ughh can t find my red sox hat got ta wear thi...,0,ughh find red sox hat got ta wear creepy nick ...,"(-0.25, 0.5)",-0.250000


In [15]:
df.head()

Unnamed: 0,clean_text,is_depression,aaj_clean_text,textScore,textPolarity
0,we understand that most people who reply immed...,1,understand people reply immediately op invitat...,"(0.08423097251585628, 0.47574083861874544)",0.084231
1,welcome to r depression s check in post a plac...,1,welcome r depression check post place take mom...,"(-0.03949074074074075, 0.5401388888888888)",-0.039491
2,anyone else instead of sleeping more when depr...,1,anyone else instead sleeping depressed stay ni...,"(0.09444444444444444, 0.18888888888888888)",0.094444
3,i ve kind of stuffed around a lot in my life d...,1,kind stuffed around lot life delaying inevitab...,"(0.205, 0.7)",0.205
4,sleep is my greatest and most comforting escap...,1,sleep greatest comforting escape whenever wake...,"(0.1630952380952381, 0.35793650793650794)",0.163095


#### Export

In [16]:
df.to_csv('Reddit_Depression_aaj.csv',index=False)

In [17]:
pip install keybert

Collecting keybert
  Downloading keybert-0.6.0-py2.py3-none-any.whl (22 kB)
Collecting sentence-transformers>=0.3.8
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m242.4 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=8fefbcf9d99b56128fba781d17adaf308f0b65f725bb93eb2a47ed212ee2a626
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence-transformers
Installing collected packages: sentence-transformers, keybert
Successfully installed keybert-0.6.0 sentence-transformers-2.2.2
[0mNote: you may need to resta

In [18]:
from keybert import KeyBERT

doc = df['aaj_clean_text']
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(doc)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]