In [1]:
# Depression Analysis in Bangla
# Create depression 'arff' dataset
# copyright (c) ABDUL HASIB UDDIN <abdulhasibuddin@gmail.com>
# LICENSE: GNU General Public License v3.0

In [2]:
import numpy as np
from timeit import default_timer as timer
from collections import Counter

In [3]:
# Base name of the dataset variant being processed.
# NOTE(review): no other cell visible in this notebook reads `fileName`; the
# input and output paths are hard-coded instead. Confirm whether it is still needed.
fileName = "data_all_unique_dnd_stratified_1"

In [4]:
# Load the corpus. The text file holds one sample per line; the labels file
# holds whitespace-separated class names, positionally matching the samples.
with open('data_all_unique_dnd_stratified_text.txt', 'r', encoding="utf8") as f:
    # Keep the explicit '\n' split: a trailing newline yields one empty sample,
    # which the length check and filter cells further down report and remove.
    text = f.read().split('\n')
with open('data_all_unique_dnd_stratified_labels.txt', 'r', encoding="utf8") as f:
    # split() with no argument splits on any whitespace run and drops empty
    # tokens. split(' ') would leave a trailing newline glued to the last label
    # (or create '' tokens on doubled spaces), and any token that is not exactly
    # 'depressive' is later written out as 'non_depressive'.
    labels = f.read().split()

In [5]:
# Flatten the whole corpus into a single whitespace-separated token stream.
# `word_list` is the input of the frequency count that builds the vocabulary.
words = ' '.join(text)
word_list = words.split()

In [6]:
# Token frequencies over the whole corpus.
num_words = Counter(word_list)

# Vocabulary ordered from most to least frequent; the sort is stable, so
# equally frequent tokens keep their first-seen order.
vocab = sorted(num_words, key=lambda token: num_words[token], reverse=True)

# Integer ids start at 1, which leaves 0 free to be used as a padding value.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

# Replace every token of every sample by its integer id.
text_encoded = [
    [vocab_to_int[token] for token in sample.split()]
    for sample in text
]

In [7]:
# Materialise the (token, id) pairs as a list so they can be sliced.
vocab_to_int_items = [(token, token_id) for token, token_id in vocab_to_int.items()]

# Show the container types of the mapping and of its list form.
for container in (vocab_to_int, vocab_to_int_items):
    print(type(container))

# Peek at the five most frequent tokens and their ids.
print(vocab_to_int_items[:5])

<class 'dict'>
<class 'list'>
[('।', 1), (',', 2), ('!', 3), ('না', 4), ('করে', 5)]


In [8]:
# Histogram of sample lengths (number of encoded tokens per sample).
token_counts = []
for encoded_row in text_encoded:
    token_counts.append(len(encoded_row))
text_lens = Counter(token_counts)

# min()/max() iterate over the Counter's keys, i.e. the distinct lengths.
empty_samples = text_lens[0]
print("Zero-length text: {}".format(empty_samples))
shortest_sample = min(text_lens)
print("Minimum text length: {}".format(shortest_sample))
longest_sample = max(text_lens)
print("Maximum text length: {}".format(longest_sample))

Zero-length text: 1
Minimum text length: 0
Maximum text length: 63


In [9]:
# Filter out samples with zero length.
# `labels` is matched to `text_encoded` by position, so a row dropped here must
# take its label with it; otherwise every later sample gets its neighbour's label.
keep_row = [len(r) > 0 for r in text_encoded]
if len(labels) == len(text_encoded):
    # One label per row (empty rows included): drop the labels of empty rows too.
    labels = [label for label, keep in zip(labels, keep_row) if keep]
# When the lengths differ (e.g. the only empty row is a label-less artefact of a
# trailing newline) `labels` is left untouched, exactly as before.
# The slice copies each row; max(text_lens) is the longest length, so nothing is cut.
text_encoded = [r[0:max(text_lens)] for r, keep in zip(text_encoded, keep_row) if keep]

In [10]:
# Recompute the length histogram now that empty samples have been removed.
text_lens = Counter(map(len, text_encoded))

# The Counter's keys are the distinct lengths, so min()/max() give the extremes.
remaining_empty = text_lens[0]
print("Zero-length text: {}".format(remaining_empty))
min_tokens = min(text_lens)
print("Minimum text length: {}".format(min_tokens))
max_tokens = max(text_lens)
print("Maximum text length: {}".format(max_tokens))

Zero-length text: 0
Minimum text length: 1
Maximum text length: 63


In [11]:
# Build the final rows: token ids, right-padded with 0 up to the longest
# sample, followed by the class name as the last column.
row_width = max(text_lens)  # longest sample length (largest key of the histogram)

data_list = []
for i, encoded in enumerate(text_encoded):
    # A non-positive repeat count yields an empty list, so rows that are
    # already at full width get no padding.
    padding = [0] * (row_width - len(encoded))
    # Any label that is not exactly 'depressive' is treated as 'non_depressive'.
    class_name = 'depressive' if labels[i] == 'depressive' else 'non_depressive'
    # Concatenation creates a new list, so `text_encoded` is left untouched and
    # re-running this cell cannot append padding or labels a second time.
    data_list.append(encoded + padding + [class_name])

In [12]:
# Sanity check on the assembled rows: histogram of row widths.
data_lens = Counter(map(len, data_list))

empty_rows = data_lens[0]
print("Zero-length data: {}".format(empty_rows))
narrowest_row = min(data_lens)
print("Minimum data length: {}".format(narrowest_row))
widest_row = max(data_lens)
print("Maximum data length: {}".format(widest_row))

# Show the first and the last row in full.
for sample_row in (data_list[0], data_list[-1]):
    print(sample_row)

Zero-length data: 0
Minimum data length: 64
Maximum data length: 64
[15, 139, 67, 852, 475, 1139, 194, 2833, 476, 2, 553, 206, 181, 56, 477, 271, 63, 14, 238, 1140, 419, 1649, 2834, 26, 2, 238, 326, 150, 1650, 1651, 26, 2, 38, 151, 8, 4, 1, 420, 2835, 175, 8, 102, 1, 33, 6, 26, 1, 12, 139, 10, 1652, 3, 2836, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 'depressive']
[8004, 2832, 2832, 8005, 8006, 129, 1579, 2, 129, 1579, 8007, 103, 15, 83, 853, 1, 12, 2098, 8008, 8009, 8010, 8011, 2809, 83, 239, 421, 8012, 8013, 136, 1182, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 'non_depressive']


In [35]:
# Assemble the ARFF header: relation name, one NUMERIC attribute per token
# position (named 1, 2, ...), then the nominal class attribute and the
# @data marker.
column_count = len(data_list[0])  # token positions plus the class column

header_parts = ['@relation DepressionDataset\n\n']
for position in range(1, column_count):
    header_parts.append('@attribute '+str(position)+' NUMERIC\n')

# The last column is the class label; it is only declared when the row has
# at least one column.
if column_count >= 1:
    header_parts.append('@attribute class {depressive,non_depressive}\n\n')
    header_parts.append('@data\n')

arff = ''.join(header_parts)

In [36]:
#arff_file.close()

In [37]:
# Write the ARFF header followed by one comma-separated line per sample.
# Mode "w" (instead of the previous "a+") recreates the file on every run;
# appending duplicated the header and data on re-runs and produced an invalid
# ARFF file. The `with` block closes the handle even if a write fails.
with open("depression_dataset.arff", "w", encoding="utf8") as arff_file:
    arff_file.write(arff)
    for data in data_list:
        # Values are joined with ',' and no trailing separator.
        arff_file.write(','.join(str(value) for value in data))
        arff_file.write('\n')

In [24]:
# Display the generated ARFF header (relation, attribute declarations, @data marker).
print(arff)

@relation DepressionDataset

@attribute 1 NUMERIC
@attribute 2 NUMERIC
@attribute 3 NUMERIC
@attribute 4 NUMERIC
@attribute 5 NUMERIC
@attribute 6 NUMERIC
@attribute 7 NUMERIC
@attribute 8 NUMERIC
@attribute 9 NUMERIC
@attribute 10 NUMERIC
@attribute 11 NUMERIC
@attribute 12 NUMERIC
@attribute 13 NUMERIC
@attribute 14 NUMERIC
@attribute 15 NUMERIC
@attribute 16 NUMERIC
@attribute 17 NUMERIC
@attribute 18 NUMERIC
@attribute 19 NUMERIC
@attribute 20 NUMERIC
@attribute 21 NUMERIC
@attribute 22 NUMERIC
@attribute 23 NUMERIC
@attribute 24 NUMERIC
@attribute 25 NUMERIC
@attribute 26 NUMERIC
@attribute 27 NUMERIC
@attribute 28 NUMERIC
@attribute 29 NUMERIC
@attribute 30 NUMERIC
@attribute 31 NUMERIC
@attribute 32 NUMERIC
@attribute 33 NUMERIC
@attribute 34 NUMERIC
@attribute 35 NUMERIC
@attribute 36 NUMERIC
@attribute 37 NUMERIC
@attribute 38 NUMERIC
@attribute 39 NUMERIC
@attribute 40 NUMERIC
@attribute 41 NUMERIC
@attribute 42 NUMERIC
@attribute 43 NUMERIC
@attribute 44 NUMERIC
@attribute 4