In [2]:
# Use the %pip magic rather than !pip: it installs into the environment of the
# running kernel, whereas !pip runs whichever pip is first on the shell PATH.
%pip install scikit-learn
%pip install pandas
%pip install emoji
%pip install num2words
%pip install nltk
%pip install matplotlib



In [3]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import emoji
#from num2words import num2words
import nltk
#from nltk.corpus import stopwords
#from nltk.stem import WordNetLemmatizer
#from nltk.stem import PorterStemmer
#nltk.download('stopwords')
#nltk.download('wordnet')
#import matplotlib.pyplot as plt
from collections import defaultdict
from collections import Counter
#from matplotlib import colors
#from wordcloud import WordCloud

In [44]:
def read_df_from_json(doc, label):
    """Load one JSON dump of users and flatten their posts into a labelled frame.

    The file is expected to hold a list of user objects, each with a ``posts``
    list whose items carry ``title`` and ``selftext`` fields.

    Args:
        doc: Path of the JSON file to read.
        label: Class label assigned to every post found in ``doc``.

    Returns:
        A DataFrame with the columns ``label`` and ``text`` (one row per post),
        where ``text`` is the title and the body joined by a single space.
        The two columns are present even when the file contains no posts.

    Raises:
        KeyError: If a user object has no ``posts`` entry.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding (the posts contain non-ASCII text).
    with open(doc, 'r', encoding='utf-8') as file:
        users = json.load(file)

    records = []
    for user in users:
        for post in user['posts']:
            # Defensive: a missing or null title/body is treated as empty
            # text instead of failing the string concatenation.
            title = post.get("title") or ""
            body = post.get("selftext") or ""
            records.append({"label": label, "text": title + " " + body})

    # Naming the columns keeps the schema stable for an empty record list.
    return pd.DataFrame(records, columns=["label", "text"])

In [45]:
def concatenate_dfs(docs, labels):
    """Read every JSON document with its label and stack the results.

    ``docs[i]`` is loaded through ``read_df_from_json`` with ``labels[i]``.
    The per-file frames are concatenated in order and the combined frame is
    given a fresh 0..n-1 index.
    """
    frames = [read_df_from_json(docs[i], labels[i]) for i in range(len(docs))]
    return pd.concat(frames, ignore_index=True)

In [46]:
# Source files and their class labels, paired by position.
docs = [
    'data/reddit_data_with_depression.json',
    'data/reddit_data_with_true_depression.json',
    'data/reddit_data_without_depression.json',
]
labels = [
    'possible_depression',
    'true_depression',
    'no_depression',
]
df = concatenate_dfs(docs, labels)

In [47]:
# Show the combined label/text frame as the cell output.
df

Unnamed: 0,label,text
0,possible_depression,"Stare mentala nasoala - attachment issues, fri..."
1,possible_depression,"Problema serioasa - transpiratia Salut,\n\nAm ..."
2,possible_depression,Ce peeling chimic imi recomandati? Buna fetelo...
3,possible_depression,Ce produse de skincare bune reusesc sa va ia o...
4,possible_depression,"Relatia cu sora mea mai mica \nBuna, am si eu ..."
...,...,...
4086,no_depression,Rosiile alea la 40lei/kg sunt ieftine Poza fac...
4087,no_depression,Luminati-ma sau fiti luminati. Care e faza cu ...
4088,no_depression,De ce e asa molipsitor accentul ardelenesc? Su...
4089,no_depression,Va simtiti amenintati de AI? Cum priviti acest...


In [48]:
# Tally the rows per class once, then report each class size.
label_counts = df["label"].value_counts()
print("Possible Depression:", label_counts["possible_depression"])
print("True Depression:", label_counts["true_depression"])
print("No Depression:", label_counts["no_depression"])

Possible Depression: 949
True Depression: 211
No Depression: 2931


In [53]:
def split_and_write_csv(input_df, train_size_per_class=None, test_size_per_class=None, validation_size_per_class=None, random_state=None, output_dir='data'):
    """Split ``input_df`` per class into train/test/validation sets and write them as CSV.

    Every class (value of the ``label`` column) is shuffled and cut into three
    disjoint parts of the requested sizes. Rows beyond the requested total are
    left out. The following files are written to ``output_dir``, all without
    the index:

    * ``shuffled_train_data.csv`` / ``shuffled_train_labels.csv`` - the
      training rows in random order; row *i* of the data file (column
      ``text``) belongs to row *i* of the labels file (column ``label``).
    * ``test_data.csv`` / ``test_labels.csv`` and ``validation_data.csv`` /
      ``validation_labels.csv`` - same layout, rows grouped by class.
    * ``train_data_plot.csv``, ``test_data_plot.csv`` and
      ``validation_data_plot.csv`` - the complete rows of each split, grouped
      by class.

    Args:
        input_df: DataFrame with at least the columns ``label`` and ``text``.
        train_size_per_class: Mapping label -> number of training rows.
            ``None`` selects 500 / 100 / 1500 for ``possible_depression`` /
            ``true_depression`` / ``no_depression``.
        test_size_per_class: Mapping label -> number of test rows.
            ``None`` selects 250 / 75 / 1000.
        validation_size_per_class: Mapping label -> number of validation rows.
            ``None`` selects 199 / 36 / 431.
        random_state: Optional seed (or random generator) forwarded to
            ``DataFrame.sample``; pass an int to get a reproducible split.
        output_dir: Directory receiving the CSV files; created when missing.

    Raises:
        KeyError: If a label present in ``input_df`` is missing from one of
            the size mappings.
        ValueError: If a class has fewer rows than the three sizes together.
    """
    import os  # local import: the notebook's import cell does not provide ``os``

    # ``None`` sentinels avoid sharing mutable dict defaults between calls.
    if train_size_per_class is None:
        train_size_per_class = {'possible_depression': 500, 'true_depression': 100, 'no_depression': 1500}
    if test_size_per_class is None:
        test_size_per_class = {'possible_depression': 250, 'true_depression': 75, 'no_depression': 1000}
    if validation_size_per_class is None:
        validation_size_per_class = {'possible_depression': 199, 'true_depression': 36, 'no_depression': 431}

    def _stack(parts):
        # pd.concat rejects an empty list; fall back to an empty frame that
        # still carries the input's columns.
        return pd.concat(parts) if parts else input_df.iloc[0:0]

    train_parts, test_parts, validation_parts = [], [], []
    for group_name, group_df in input_df.groupby('label'):
        n_train = train_size_per_class[group_name]
        n_test = test_size_per_class[group_name]
        n_validation = validation_size_per_class[group_name]
        n_requested = n_train + n_test + n_validation
        if n_requested > len(group_df):
            raise ValueError(
                f"class {group_name!r} has {len(group_df)} rows but "
                f"{n_requested} were requested across the three splits"
            )

        # One shuffle followed by positional slices yields three disjoint
        # random subsets with exactly the requested sizes.
        shuffled_group = group_df.sample(frac=1, random_state=random_state)
        train_parts.append(shuffled_group.iloc[:n_train])
        test_parts.append(shuffled_group.iloc[n_train:n_train + n_test])
        validation_parts.append(shuffled_group.iloc[n_train + n_test:n_requested])

    # Collect the pieces once instead of growing frames inside the loop.
    train_plot = _stack(train_parts)
    test_plot = _stack(test_parts)
    validation_plot = _stack(validation_parts)

    # Shuffle whole rows so that text and label stay paired; shuffling the two
    # columns separately would assign labels to the wrong texts.
    train_shuffled = train_plot.sample(frac=1, random_state=random_state)

    outputs = {
        'shuffled_train_data.csv': train_shuffled[['text']],
        'shuffled_train_labels.csv': train_shuffled[['label']],
        'validation_data.csv': validation_plot[['text']],
        'validation_labels.csv': validation_plot[['label']],
        'test_data.csv': test_plot[['text']],
        'test_labels.csv': test_plot[['label']],
        'train_data_plot.csv': train_plot,
        'test_data_plot.csv': test_plot,
        'validation_data_plot.csv': validation_plot,
    }

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    for filename, frame in outputs.items():
        frame.to_csv(os.path.join(output_dir, filename), index=False)


In [54]:
split_and_write_csv(df)

In [55]:
# Reload the splits from disk, one (data, labels) pair per split.
train_data, train_labels = (
    pd.read_csv(csv_path, index_col=None)
    for csv_path in ('data/shuffled_train_data.csv', 'data/shuffled_train_labels.csv')
)

test_data, test_labels = (
    pd.read_csv(csv_path, index_col=None)
    for csv_path in ('data/test_data.csv', 'data/test_labels.csv')
)

val_data, val_labels = (
    pd.read_csv(csv_path, index_col=None)
    for csv_path in ('data/validation_data.csv', 'data/validation_labels.csv')
)

# Full-row versions of the train and test splits.
train_data_plot, test_data_plot = (
    pd.read_csv(csv_path, index_col=None)
    for csv_path in ('data/train_data_plot.csv', 'data/test_data_plot.csv')
)

train_data.shape

(2100, 1)