In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
infamouscoder_mental_health_social_media_path = kagglehub.dataset_download('infamouscoder/mental-health-social-media')

print('Data source import complete.')


Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

Get the dataset

In [None]:
# Read the CSV from the directory returned by kagglehub.dataset_download in the
# first cell, instead of a hard-coded Kaggle mount point that only exists when
# the notebook runs on Kaggle itself.
df = pd.read_csv(f"{infamouscoder_mental_health_social_media_path}/Mental-Health-Twitter.csv")
df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

There are no missing values in the dataset. If missing values did occur, we could handle them with methods such as:
- deleting the rows with missing values
- deleting the column with too many missing values
- filling the missing values with mean, median or mode of the column
- using machine learning algorithms to predict the missing values, for example using regression to predict the missing values of a column.
- using clustering algorithms to group the data points with missing values and then using the mean of the cluster to fill the missing values.
- flag the missing values as a separate category.

 Now let's drop the irrelevant columns from the dataset. 'Unnamed: 0' is an extra column that is not required for our analysis, so we will drop it. The 'post_id' and 'user_id' columns are also not required, as they are just identifiers, so we will drop them too.

In [None]:
# Drop the extra 'Unnamed: 0' column and the two identifier columns.
# Reassigning the result (instead of inplace=True) keeps the step an explicit
# "new frame from old frame" transformation.
df = df.drop(columns=['Unnamed: 0', 'post_id', 'user_id'])
df.head()

Convert the post creation time to datetime format and create separate columns for year, month and day.

In [None]:
# Parse the creation timestamps for the whole column in one call rather than
# invoking pd.to_datetime once per row through apply().
df["post_created"] = pd.to_datetime(df["post_created"])

# Break the timestamp into calendar components.
df["month"] = df["post_created"].dt.month
df["year"] = df["post_created"].dt.year
df["day"] = df["post_created"].dt.day

# DataFrame.drop returns a new frame. The original cell discarded that result,
# so 'post_created' was never actually removed; assign it back.
df = df.drop(columns="post_created")
df.head()

Find the Pearson correlation coefficient between the variables.

- For r = 1, there is a perfect positive correlation between the variables.
- For 0 < r < 1 there is a positive correlation between the variables.
- For r = 0, there is no correlation between the variables.
- For -1 < r < 0 there is a negative correlation between the variables.
- For r = -1, there is a perfect negative correlation between the variables.

In [None]:
# Correlate numeric columns only: the frame still holds the string column
# 'post_text', and pandas >= 2.0 raises on non-numeric data instead of
# silently skipping it.
corr_matrix = df.corr(numeric_only=True)
corr_matrix

In [None]:
# Draw the annotated correlation heatmap on an explicitly created Axes.
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    Only numeric and boolean columns take part; text or other non-numeric
    columns are ignored instead of making ``DataFrame.corr`` raise (as it does
    in pandas >= 2.0).

    Args:
        dataset: pandas DataFrame to inspect.
        threshold: a column is flagged when the absolute correlation with any
            column positioned before it is strictly greater than this value.

    Returns:
        set of column names. For each correlated pair the later column is the
        one reported, so the earlier one is kept if the result is dropped.
    """
    corr_matrix = dataset.select_dtypes(include=["number", "bool"]).corr()
    columns = corr_matrix.columns
    col_corr = set()  # names of columns judged redundant
    for i in range(len(columns)):
        # Only the lower triangle is scanned, so each pair is visited once.
        for j in range(i):
            # The sign is irrelevant: strong negative correlation counts too.
            if abs(corr_matrix.iloc[i, j]) > threshold:
                col_corr.add(columns[i])
                break  # column i is already flagged; further hits add nothing
    return col_corr

In [None]:
# Columns whose absolute correlation with an earlier column exceeds 0.85.
corr_features = correlation(df, 0.85)
corr_features

In [None]:
# Remove the columns flagged by the correlation check above.
df = df.drop(columns=corr_features)
df

Since Friends and Followers are highly correlated, we have dropped the Friends column.

In [None]:
# Summary statistics of the frame after the correlated columns were dropped.
df.describe()

PREPROCESSING

In [None]:
# Lower-case every post and collapse runs of whitespace to single spaces.
df["post_text"] = df["post_text"].str.lower().str.split().str.join(" ")

# Remove punctuation. regex=True is required: since pandas 2.0,
# Series.str.replace treats the pattern as a literal string by default, which
# turned the original call into a no-op. A raw string avoids invalid-escape
# warnings for \w and \s.
df["post_text"] = df["post_text"].str.replace(r"[^\w\s]", "", regex=True)

# Remove digits (same regex=True requirement as above).
df["post_text"] = df["post_text"].str.replace(r"\d", "", regex=True)

In [None]:
#remove stopwords
#nltk.download('stopwords')
# Use a set for constant-time membership tests; with a list, the whole
# stop-word list was scanned for every token of every post.
sw = set(stopwords.words("english"))
df["post_text"] = df["post_text"].apply(
    lambda text: " ".join(word for word in text.split() if word not in sw)
)

In [None]:
#Stemming

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stem_text(text):
    """Stem each whitespace-separated word of ``text`` with the notebook's ``stemmer``.

    Returns the stemmed words joined back together with single spaces.
    """
    stems = [stemmer.stem(token) for token in text.split()]
    return " ".join(stems)

df['post_text'] = df['post_text'].apply(stem_text)


In [None]:
#tokenization
from textblob import TextBlob
#nltk.download("punkt")
# Store TextBlob's word list for each cleaned post in a new "tokens" column.
df["tokens"] = df["post_text"].apply(lambda x: TextBlob(x).words)

In [None]:
#TF-IDF
# Vectoriser limited to 1000 features, with scikit-learn's built-in English
# stop-word list applied.
vectorizer_tf = TfidfVectorizer(stop_words="english", max_features=1000)
# Learn the vocabulary from the cleaned posts and build the TF-IDF matrix.
X_tf = vectorizer_tf.fit_transform(df["post_text"])


In [None]:
# Mapping from each learned term to its index in the vocabulary.
vectorizer_tf.vocabulary_

In [None]:
#idf of each word
all_feature_names = vectorizer_tf.get_feature_names_out()

# Feature names are listed in vocabulary-index order, so each name lines up
# with the IDF weight stored at the same position.
for word, idf_score in zip(all_feature_names, vectorizer_tf.idf_):
    print(f"{word} : {idf_score}")

In [None]:
#output of tf-idf
# Densify only the first few rows for inspection; converting the whole sparse
# matrix allocates a full rows x features dense array just to display it.
X_tf[:5].toarray()

Cosine Similarity measure

In [None]:
#from sklearn.metrics.pairwise import cosine_similarity
#
#cos_sim = cosine_similarity(X_tf[:1000])
#
#df1 = df[:1000]
#for i in range(len(df1['post_text'])):
#    for j in range(len(df1['post_text'])):
#        print(f"Cosine similarity between document {i+1} and document {j+1}: {cos_sim[i][j]}")


Implementing the Similarity Measure for Text Processing (SMTP).

In [None]:
#getting the unique features in the document(tweet)and creating another column

def get_unique_words(tweet):
    """Return the distinct whitespace-separated words of ``tweet`` as a list.

    The order follows set iteration and is therefore not guaranteed.
    """
    return list(set(tweet.split()))

# Work on an explicit copy so that adding a column does not write into a slice
# of df (which triggers SettingWithCopyWarning).
df1 = df[:1000].copy()
# A single pass is enough: the original loop repeated this identical
# whole-column assignment 1000 times.
df1['unique_words'] = df1['post_text'].apply(get_unique_words)

df1.head()

In [None]:
def get_features(lists):
    """Flatten ``lists`` and keep only the words longer than three characters."""
    features = []
    for sublist in lists:
        features.extend(word for word in sublist if len(word) > 3)
    return features

# Working subset: the first 500 and the last 500 rows of the frame.
# tail(500) replaces the hard-coded df[19500:], which only yields 500 rows when
# the frame has exactly 20000 rows.
df2 = df.head(500)
df3 = df.tail(500)

print(df2.head())

df4 = pd.concat([df2, df3], axis=0)
print(df4.head())

# Token lists of the subset, in the same row order as df4.
req_lst = df4['tokens'].tolist()
req_lst[:5]

In [None]:
# Turn each token list back into a single space-separated string.
tweet_column = [' '.join(sublist) for sublist in req_lst]

print(len(tweet_column))
tweet_column[:5]

In [None]:
from collections import Counter

# Calculate word count vectors for pairs of two documents
def calculate_word_count_vectors(documents):
    """Build word-count vectors for every ordered pair of documents.

    For each pair (i, j) - including i == j and both orderings - the counts of
    every word occurring in either document are concatenated into a digit
    string, and two entries are appended to the result:
    ``"d{i+1} = <counts of document i>"`` followed by
    ``"d{j+1} = <counts of document j>"``. Both strings of a pair use the same
    word order, so position k refers to the same word in each.

    Args:
        documents: sequence of strings; each is tokenised with ``str.split()``.

    Returns:
        list of ``2 * len(documents) ** 2`` strings.

    Note:
        Counts are concatenated without a separator, so a word occurring 10 or
        more times occupies several characters. A consumer that reads one
        character per word will then misalign the two vectors of that pair.
    """
    # Count each document once up front. The original rebuilt a Counter for
    # every feature of every pair, and read the global ``tweet_column`` instead
    # of the ``documents`` argument.
    counters = [Counter(document.split()) for document in documents]

    word_count_vectors = []
    for i, counts_i in enumerate(counters):
        for j, counts_j in enumerate(counters):
            # Vocabulary shared by the pair: words present in either document.
            words = counts_i.keys() | counts_j.keys()

            # Counter returns 0 for words the document does not contain.
            d1_formatted = "".join(str(counts_i[word]) for word in words)
            d2_formatted = "".join(str(counts_j[word]) for word in words)

            word_count_vectors.append(f"d{i+1} = {d1_formatted}")
            word_count_vectors.append(f"d{j+1} = {d2_formatted}")

    return word_count_vectors

# Two entries are produced for every ordered pair of tweets.
document_pairs = calculate_word_count_vectors(tweet_column)
print(len(document_pairs))
document_pairs[:10]

In [None]:
#F1(di,dj) function to calculate

import math

def calculate_similarity_score(d1, d2, sigma, lambd):
    """Compute F(d1, d2): summed per-feature N* divided by summed per-feature N-union.

    Args:
        d1, d2: count vectors, compared position by position (zip stops at the
            shorter one).
        sigma: scale passed through to ``calculate_N_star``.
        lambd: penalty passed through to ``calculate_N_star``.

    Returns:
        The ratio of the two sums, or 0 when the denominator is 0 (no feature
        is present in either vector).
    """
    numerator = 0
    denominator = 0

    for left, right in zip(d1, d2):
        numerator += calculate_N_star(left, right, sigma, lambd)
        denominator += calculate_N_union(left, right)

    if denominator != 0:
        return numerator / denominator
    return 0

def calculate_N_star(d1j, d2j, sigma, lambd):
    """Per-feature contribution to the similarity numerator.

    Returns:
        * ``0.5 * (1 + exp(-((d1j - d2j) / sigma) ** 2))`` when both counts are
          positive,
        * ``0`` when both counts are zero,
        * ``-lambd`` in every other case.
    """
    if d1j > 0 and d2j > 0:
        scaled_gap = (d1j - d2j) / sigma
        return 0.5 * (1 + math.exp(-(scaled_gap ** 2)))

    if d1j == 0 and d2j == 0:
        return 0

    return -lambd

def calculate_N_union(d1j, d2j):
    """Per-feature contribution to the denominator: 0 if both counts are zero, else 1."""
    both_absent = d1j == 0 and d2j == 0
    return 0 if both_absent else 1

In [None]:
# Example usage
# Two small count vectors to sanity-check the F(d1, d2) implementation.
d1 = [0, 2, 1, 1, 0, 0, 1]
d2 = [3, 1, 1, 1, 1, 0, 0]
# sigma and lambd are notebook-level values; the later cell that scores all
# document pairs passes these same names to calculate_SMTP.
sigma = 2
lambd = 1

result = calculate_similarity_score(d1, d2, sigma, lambd)
print(result)

In [None]:
#SMTP

def calculate_SMTP(d1, d2, sigma, lambd):
    """Return the SMTP score: ``(F(d1, d2) + lambd) / (1 + lambd)``.

    ``F`` is the value of ``calculate_similarity_score`` for the same arguments.
    """
    return (calculate_similarity_score(d1, d2, sigma, lambd) + lambd) / (1 + lambd)

In [None]:
# SMTP score for the same example vectors used above.
result = calculate_SMTP(d1, d2, sigma, lambd)
print(result)

In [None]:
# Number of encoded vectors (two per ordered document pair).
len(document_pairs)

In [None]:
# Score every ordered document pair with SMTP. Consecutive entries
# (2k, 2k + 1) of document_pairs hold the two count strings of one pair.
# The unused helper lists lst1/lst2 and the dead commented-out code were removed.
#
# NOTE: each character of a count string is decoded as one count, so this is
# only correct while every word count is a single digit (0-9). A word that
# occurs 10+ times in a tweet would be split into several positions.
lst = []

for i in range(0, len(document_pairs), 2):
    dx = document_pairs[i].split(' = ')[1]
    dy = document_pairs[i + 1].split(' = ')[1]

    dxi = [int(x) for x in dx]
    dyi = [int(y) for y in dy]

    smtp_score = calculate_SMTP(dxi, dyi, sigma, lambd)
    lst.append(smtp_score)

print(len(lst))
lst[:5]

In [None]:
# One row and one column per document. The side length is derived from the
# number of documents that were paired up, instead of a hard-coded 1000 that
# breaks as soon as the subset size changes.
n_docs = len(tweet_column)
lst_matrix = np.array(lst).reshape(n_docs, n_docs)
lst_matrix

In [None]:
# Render the pairwise SMTP matrix as a heatmap on an explicitly created Axes.
fig, ax = plt.subplots()
sns.heatmap(lst_matrix, cmap='hot', ax=ax)
plt.show()

SVM Model

In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split

# Labels of the same df4 rows (in the same order) whose tokens produced the
# similarity matrix. Series.to_numpy() is already one-dimensional, so the
# hard-coded reshape((1000,)) - which fails for any other subset size - is
# not needed.
label_matrix = df4['label'].to_numpy()

unique_val = np.unique(label_matrix)
print(unique_val)


In [None]:
# Hold out 20% of the documents. Each row of lst_matrix is one document's SMTP
# similarity to every document in the subset; the split is seeded for
# repeatability. X_train/X_test/y_train/y_test are reused by the model cells
# that follow.
X_train, X_test, y_train, y_test = train_test_split(lst_matrix, label_matrix, test_size=0.2, random_state=42)
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)

pred = clf.predict(X_test)
# Accuracy on the held-out rows.
svm_acc = clf.score(X_test, y_test)
print("Accuracy of the model is:", svm_acc)

Naive Bayes Classifier

In [None]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Fit on the same train/test split created in the SVM cell.
# NOTE(review): MultinomialNB expects non-negative features - confirm the SMTP
# scores stay >= 0 for the chosen lambd.
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

nb_pred = nb_model.predict(X_test)
nb_acc = accuracy_score(y_test, nb_pred)
print("Accuracy of the model is:", nb_acc)

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings, fitted on the same
# train/test split created in the SVM cell.
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

log_pred = log_model.predict(X_test)
log_acc = accuracy_score(y_test, log_pred)

print("Accuracy of the model is:", log_acc)


Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are randomised; pin random_state (42, the same seed the
# train/test split uses) so the reported accuracy is reproducible across
# Restart & Run All.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

rf_pred = rf_model.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)

print("Accuracy of the model is:", rf_acc)