# Cleaning Datasets

In [1]:
import pandas as pd
import re
import regex

In [2]:
# Load the datasets
depression_data = pd.read_excel('C:/Users/Acer/Desktop/thesis/tweets/Original_Depression.xlsx', header=1)
non_depression_data = pd.read_excel('C:/Users/Acer/Desktop/thesis/tweets/Original_NonDepression.xlsx', header=1)

In [3]:
# Function to clean text data
def clean_text_data(text):
    # Remove URLs
    text = regex.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove non-English characters
    text = regex.sub(r'[^\p{Latin}\s]', '', text)
    # Remove various symbols and emojis
    symbols_to_remove = ['♥️', '\n', '=', '"', '#', '?', '{', '}', '()', '\\', '[]', '🍉', '/', '!', '.', '😘']
    for symbol in symbols_to_remove:
        text = text.replace(symbol, '')
    # Remove usernames
    text = re.sub(r'@\w+\b', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    return text

In [4]:
# Apply cleaning functions, add labels, and keep only relevant columns
def process_data(data, label):
    # Ensure to work on a copy to avoid SettingWithCopyWarning
    data = data.copy()
    # Drop rows where 'text' is NaN
    data = data.dropna(subset=['text'])
    # Apply cleaning to the 'text' column
    data.loc[:, 'text'] = data['text'].apply(clean_text_data)
    # Add label
    data.loc[:, 'label'] = label
    # Keep only 'text' and 'label' columns
    return data[['text', 'label']]

In [5]:
# Process both datasets
cleaned_depression_data = process_data(depression_data, 'Depression')
cleaned_non_depression_data = process_data(non_depression_data, 'Non depression')


In [6]:
# Save cleaned data
cleaned_depression_data.to_excel('cleaned_depression_data.xlsx', index=False)
cleaned_non_depression_data.to_excel('cleaned_non_depression_data.xlsx', index=False)


In [7]:
df1 = cleaned_depression_data
df1

Unnamed: 0,text,label
4,RT EichinChangLim In Talking About Adolescence...,Depression
5,RT levelsio One counterintuitive thing about s...,Depression
6,RT dlhampton The Neuroscience of How Affirmati...,Depression
7,autism adhd anxiety depression and bod I won t...,Depression
8,JustJul GeoffField I know a colleague of mine...,Depression
...,...,...
1317,RT anxietymsgs Having anxiety and depression i...,Depression
1318,snj To be honest with you no meds had ever cur...,Depression
1319,RT SincerelyAO For everyone dealing with anxie...,Depression
1320,JessicaUSAF Warningvery dark humor aheadnDo no...,Depression


In [8]:
df2 = cleaned_non_depression_data
df2

Unnamed: 0,text,label
4,SharronS Hello there Thanks for reaching out I...,Non depression
5,RT ANI WATCH In Sandeshkhali West Bengal LoP ...,Non depression
6,RT AerthH Retinol Happy with Retinol Ant...,Non depression
7,Hahahahahahha happy ruharu aaaaa,Non depression
8,RT JacksonJason GM Happy Tuesday,Non depression
...,...,...
973,RT vernonlovebot THEYRE STILL FRIENDS IM SO HA...,Non depression
974,RT softinseo one cute happy familyyy,Non depression
975,RT justluciano Sucking big cocks makes me so h...,Non depression
976,RT hourlyssw HAPPY WENDY DAYTHIRTYSEXYWENDY,Non depression


In [9]:
# Merge both datasets
merged_data = pd.concat([cleaned_depression_data, cleaned_non_depression_data], ignore_index=True)

In [10]:
# Shuffle the merged data randomly
merged_data = merged_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [11]:
# Save shuffled merged data
merged_data.to_excel('merged_cleaned_data.xlsx', index=False)
# Print a success message
print("Data processing complete. Files saved.")

Data processing complete. Files saved.


In [12]:
merged_df = merged_data
merged_df

Unnamed: 0,text,label
0,RT felpsENG after a donate saying that he had ...,Depression
1,RT WinBunny Morning Win nnHappy Friday my su...,Depression
2,AndreaB Antidepressants almost ruined my life ...,Depression
3,RT GuntherEagleman Happy Presidents Day dad,Non depression
4,THIS JOB MY YEARS DAUGHTERI HAPPY WITH THIS G...,Non depression
...,...,...
1215,RT FentyStats Happy th birthday to the one and...,Non depression
1216,RT nSSignofficial TikTok NaNaNa nSSignHap...,Non depression
1217,RT idoldotst The lyric video for Kozue Tsuzuri...,Non depression
1218,ayeshakhan n nAmeen nnStay happy nS...,Non depression


In [13]:
# Calculate the count of each category in the 'label' column
counts = merged_df['label'].value_counts()

# Calculate the percentage of each category
percentages = (counts / len(merged_df)) * 100

# Print the results
print("Percentage of Depression Tweets: {:.2f}%".format(percentages['Depression']))
print("Percentage of Non-Depression Tweets: {:.2f}%".format(percentages['Non depression']))


Percentage of Depression Tweets: 59.02%
Percentage of Non-Depression Tweets: 40.98%


# SPLITTING

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming you have already loaded your dataset into merged_df

# Split the data into train and test sets
train_df, test_df = train_test_split(merged_df, test_size=0.25, random_state=42)

# Optionally, if you need to reset the index of the splits
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)


In [15]:
test_df

Unnamed: 0,text,label
0,RT WayVofficial WayZenNies We will attend the ...,Depression
1,Happy questing Just got uncommon insight Yoga...,Depression
2,a psychologist diagnosed me with depression ye...,Depression
3,RT thecoopertom FWA a young girl was attendin...,Non depression
4,RT fkaannabanana Happy Tuesday yall Lets make...,Non depression
...,...,...
300,DaveLeghorn Antidepressants almost ruined my l...,Depression
301,RT xoxo HAPPY ZAYYAN DAYnn XODIAC ZAYYAN HAPP...,Depression
302,WendySonVN WRCupsleevePH RVsmtown HAPPY WENDY ...,Non depression
303,RT belswen kita dipaksa melewati stages of gr...,Depression
