Steps:
- extarct from MongoDB
- clean and transform 
- saved LDA model for topic
- sentiment analysis (VADER)
- ML model for prediction

In [71]:
import pymongo
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import models
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pickle
import pandas as pd

In [72]:

# Initializing the client
client = pymongo.MongoClient("mongodb://localhost:27017/")

# Database names containing the posts
database_names = ["healthcare_data_stream"]

# List to store documents
all_documents = []

for db_name in database_names:
    db = client[db_name]
    for collection_name in db.list_collection_names():
        collection = db[collection_name]
        cursor = collection.find({})
        for document in cursor:
            document["subreddit"] = db_name
            all_documents.append({
                "title": document.get("title"),
                "created_utc": pd.to_datetime(document.get("created_utc"), unit='s'),
                "selftext": document.get("selftext")
            })

# Converting list of dicts into a DataFrame
healthcare_df = pd.DataFrame(all_documents)
# Shape
print("Shape of the DataFrame:", healthcare_df.shape)


Shape of the DataFrame: (120, 3)


In [73]:
healthcare_df.head()

Unnamed: 0,title,created_utc,selftext
0,"Please, if you are in Cairo, hit me up.",2024-05-14 13:35:59,"I don't want to do it alone, so, please, lets ..."
1,It just takes over me,2024-05-14 13:33:02,"It will vary on days like sadness, disappointm..."
2,Days like today I wish my chronic illness was ...,2024-05-14 13:32:22,"\n3 years of this shit. Even if I recover, I’m..."
3,Meet people,2024-05-14 13:25:12,"Hello. Is there abydoby I can speak to? i,'m f..."
4,L-theanine helped me get out of bed,2024-05-14 13:07:20,So on what I thought was a long shot I bought ...


In [74]:
#check nulls
healthcare_df.isna().sum()

title          0
created_utc    0
selftext       0
dtype: int64

In [75]:
healthcare_df['text'] = healthcare_df['title'] + ' ' + healthcare_df['selftext']

In [76]:
# healthcare_df[healthcare_df['text']==" "]  # check if there is any empty string

In [77]:
#preprocessing and cleaning 
import re
import string

def clean_text(text):
    #Check NaN
    if pd.isna(text):
        return ''
    
    #Convert lowercase
    text = text.lower()
    
    #Remove text within brackets
    text = re.sub(r'\[.*?\]', '', text)
    
    #Remove emojis
    text = text.encode('ascii', 'ignore').decode('utf-8')
    
    #Remove additional parentheses
    text = re.sub(r'\(+\)', '', text)
    
    #Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    #Remove newline characters and extra whitespaces
    text = re.sub(r'\s+', ' ', text.replace('\n', ' ').strip())
    
    #Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    #Remove hashtags (words starting with '#')
    text = re.sub(r'#\w+', '', text)
    
    #Remove mentions (words starting with '@')
    text = re.sub(r'@\w+', '', text)
    
    return text

In [78]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import warnings 
warnings.filterwarnings('ignore')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

#Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sssri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sssri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sssri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [79]:
#data cleaning 
healthcare_df['text_cleaned'] = healthcare_df['text'].apply(clean_text)
#text pre procesing
healthcare_df['text_processed'] = healthcare_df['text_cleaned'].apply(preprocess_text)

In [80]:
healthcare_df.head()

Unnamed: 0,title,created_utc,selftext,text,text_cleaned,text_processed
0,"Please, if you are in Cairo, hit me up.",2024-05-14 13:35:59,"I don't want to do it alone, so, please, lets ...","Please, if you are in Cairo, hit me up. I don'...",please if you are in cairo hit me up i dont wa...,please cairo hit dont want alone please let to...
1,It just takes over me,2024-05-14 13:33:02,"It will vary on days like sadness, disappointm...",It just takes over me It will vary on days li...,it just takes over me it will vary on days lik...,take vary day like sadness disappointment lone...
2,Days like today I wish my chronic illness was ...,2024-05-14 13:32:22,"\n3 years of this shit. Even if I recover, I’m...",Days like today I wish my chronic illness was ...,days like today i wish my chronic illness was ...,day like today wish chronic illness terminal y...
3,Meet people,2024-05-14 13:25:12,"Hello. Is there abydoby I can speak to? i,'m f...",Meet people Hello. Is there abydoby I can spea...,meet people hello is there abydoby i can speak...,meet people hello abydoby speak im feeling lonely
4,L-theanine helped me get out of bed,2024-05-14 13:07:20,So on what I thought was a long shot I bought ...,L-theanine helped me get out of bed So on what...,ltheanine helped me get out of bed so on what ...,ltheanine helped get bed thought long shot bou...


In [81]:
from gensim import corpora, models
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Load the dictionary and LDA model
dictionary = corpora.Dictionary.load("dictionary.gensim")
lda_model = models.LdaModel.load("lda_model.gensim")

# Function to preprocess a single post
def preprocess_post(post):
    tokenized_post = word_tokenize(post.lower())
    tokenized_post = [word for word in tokenized_post if word not in stopwords.words('english')]
    return tokenized_post

# Apply LDA model to each post using apply function
def infer_topic(post):
    tokenized_post = preprocess_post(post)
    bow_post = dictionary.doc2bow(tokenized_post)
    topic_distribution = lda_model.get_document_topics(bow_post)
    dominant_topic = max(topic_distribution, key=lambda x: x[1])
    return dominant_topic[0], dominant_topic[1]

# Apply the function to each row in the dataframe
healthcare_df[['topic', 'topic_probability']] = healthcare_df['text_processed'].apply(infer_topic).apply(pd.Series)

healthcare_df.head()


Unnamed: 0,title,created_utc,selftext,text,text_cleaned,text_processed,topic,topic_probability
0,"Please, if you are in Cairo, hit me up.",2024-05-14 13:35:59,"I don't want to do it alone, so, please, lets ...","Please, if you are in Cairo, hit me up. I don'...",please if you are in cairo hit me up i dont wa...,please cairo hit dont want alone please let to...,3.0,0.629661
1,It just takes over me,2024-05-14 13:33:02,"It will vary on days like sadness, disappointm...",It just takes over me It will vary on days li...,it just takes over me it will vary on days lik...,take vary day like sadness disappointment lone...,3.0,0.649418
2,Days like today I wish my chronic illness was ...,2024-05-14 13:32:22,"\n3 years of this shit. Even if I recover, I’m...",Days like today I wish my chronic illness was ...,days like today i wish my chronic illness was ...,day like today wish chronic illness terminal y...,3.0,0.664479
3,Meet people,2024-05-14 13:25:12,"Hello. Is there abydoby I can speak to? i,'m f...",Meet people Hello. Is there abydoby I can spea...,meet people hello is there abydoby i can speak...,meet people hello abydoby speak im feeling lonely,3.0,0.739658
4,L-theanine helped me get out of bed,2024-05-14 13:07:20,So on what I thought was a long shot I bought ...,L-theanine helped me get out of bed So on what...,ltheanine helped me get out of bed so on what ...,ltheanine helped get bed thought long shot bou...,0.0,0.774321


In [82]:
healthcare_df.topic.value_counts()

topic
3.0    52
0.0    35
1.0    23
2.0    10
Name: count, dtype: int64

In [83]:
# apply sentiment analysis using VADER

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon if not already downloaded
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()


# Function to classify sentiment as positive, negative, or neutral
def get_sentiment_label(text):
    sentiment_score = analyzer.polarity_scores(text)['compound']
    if sentiment_score >= 0.05:
        return 'Positive'
    elif sentiment_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\sssri\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [84]:
# Apply sentiment analysis to each row of 'selftext_cleaned' and 'title_cleaned' columns
healthcare_df['sentiment'] = healthcare_df['text_cleaned'].apply(get_sentiment_label)
healthcare_df.head()

Unnamed: 0,title,created_utc,selftext,text,text_cleaned,text_processed,topic,topic_probability,sentiment
0,"Please, if you are in Cairo, hit me up.",2024-05-14 13:35:59,"I don't want to do it alone, so, please, lets ...","Please, if you are in Cairo, hit me up. I don'...",please if you are in cairo hit me up i dont wa...,please cairo hit dont want alone please let to...,3.0,0.629661,Positive
1,It just takes over me,2024-05-14 13:33:02,"It will vary on days like sadness, disappointm...",It just takes over me It will vary on days li...,it just takes over me it will vary on days lik...,take vary day like sadness disappointment lone...,3.0,0.649418,Negative
2,Days like today I wish my chronic illness was ...,2024-05-14 13:32:22,"\n3 years of this shit. Even if I recover, I’m...",Days like today I wish my chronic illness was ...,days like today i wish my chronic illness was ...,day like today wish chronic illness terminal y...,3.0,0.664479,Negative
3,Meet people,2024-05-14 13:25:12,"Hello. Is there abydoby I can speak to? i,'m f...",Meet people Hello. Is there abydoby I can spea...,meet people hello is there abydoby i can speak...,meet people hello abydoby speak im feeling lonely,3.0,0.739658,Negative
4,L-theanine helped me get out of bed,2024-05-14 13:07:20,So on what I thought was a long shot I bought ...,L-theanine helped me get out of bed So on what...,ltheanine helped me get out of bed so on what ...,ltheanine helped get bed thought long shot bou...,0.0,0.774321,Positive


In [85]:
healthcare_df.sentiment.value_counts()

sentiment
Negative    79
Positive    35
Neutral      6
Name: count, dtype: int64

In [86]:
healthcare_df.head()

Unnamed: 0,title,created_utc,selftext,text,text_cleaned,text_processed,topic,topic_probability,sentiment
0,"Please, if you are in Cairo, hit me up.",2024-05-14 13:35:59,"I don't want to do it alone, so, please, lets ...","Please, if you are in Cairo, hit me up. I don'...",please if you are in cairo hit me up i dont wa...,please cairo hit dont want alone please let to...,3.0,0.629661,Positive
1,It just takes over me,2024-05-14 13:33:02,"It will vary on days like sadness, disappointm...",It just takes over me It will vary on days li...,it just takes over me it will vary on days lik...,take vary day like sadness disappointment lone...,3.0,0.649418,Negative
2,Days like today I wish my chronic illness was ...,2024-05-14 13:32:22,"\n3 years of this shit. Even if I recover, I’m...",Days like today I wish my chronic illness was ...,days like today i wish my chronic illness was ...,day like today wish chronic illness terminal y...,3.0,0.664479,Negative
3,Meet people,2024-05-14 13:25:12,"Hello. Is there abydoby I can speak to? i,'m f...",Meet people Hello. Is there abydoby I can spea...,meet people hello is there abydoby i can speak...,meet people hello abydoby speak im feeling lonely,3.0,0.739658,Negative
4,L-theanine helped me get out of bed,2024-05-14 13:07:20,So on what I thought was a long shot I bought ...,L-theanine helped me get out of bed So on what...,ltheanine helped me get out of bed so on what ...,ltheanine helped get bed thought long shot bou...,0.0,0.774321,Positive


In [87]:
finaldf=healthcare_df[['text_processed','topic','sentiment']]
finaldf.head()

Unnamed: 0,text_processed,topic,sentiment
0,please cairo hit dont want alone please let to...,3.0,Positive
1,take vary day like sadness disappointment lone...,3.0,Negative
2,day like today wish chronic illness terminal y...,3.0,Negative
3,meet people hello abydoby speak im feeling lonely,3.0,Negative
4,ltheanine helped get bed thought long shot bou...,0.0,Positive


In [88]:
# with open('best_model_split_0.8.pkl', 'rb') as f:
#     model = pickle.load(f)

In [89]:
# finaldf.rename(columns={'sentiment':'sentiment_1'},inplace=True)
# finaldf.columns

In [90]:
finaldf.head()

Unnamed: 0,text_processed,topic,sentiment
0,please cairo hit dont want alone please let to...,3.0,Positive
1,take vary day like sadness disappointment lone...,3.0,Negative
2,day like today wish chronic illness terminal y...,3.0,Negative
3,meet people hello abydoby speak im feeling lonely,3.0,Negative
4,ltheanine helped get bed thought long shot bou...,0.0,Positive


In [91]:
from sklearn.preprocessing import LabelEncoder
# # encoding the cat to num 
# # neg- 0 , pos- 2, neutral-1
encoder = LabelEncoder()
finaldf['sentiment'] = encoder.fit_transform(finaldf['sentiment'])
finaldf.head()



# # use the sentiment_encoder saved model
# # Load the model
# with open('sentiment_encoder_final.pkl', 'rb') as f:
#     sentiment_encoder_model = pickle.load(f)

Unnamed: 0,text_processed,topic,sentiment
0,please cairo hit dont want alone please let to...,3.0,2
1,take vary day like sadness disappointment lone...,3.0,0
2,day like today wish chronic illness terminal y...,3.0,0
3,meet people hello abydoby speak im feeling lonely,3.0,0
4,ltheanine helped get bed thought long shot bou...,0.0,2


In [92]:
import joblib
# vectorizer
vectorizer = joblib.load('vectorizer.pkl')

In [93]:
X_text = vectorizer.transform(finaldf['text_processed'])

In [94]:
# change the types to str
finaldf['topic']=finaldf['topic'].astype(str)
finaldf['sentiment']=finaldf['sentiment'].astype(str)

In [95]:
finaldf.dtypes

text_processed    object
topic             object
sentiment         object
dtype: object

In [96]:
X_numerical = pd.concat([pd.DataFrame(X_text.toarray()), pd.DataFrame(finaldf, columns=['sentiment','topic'])],
                        axis=1)

# X_numerical

In [97]:
X_numerical.columns = X_numerical.columns.astype(str)

In [98]:
best_model = joblib.load('final_model_split_0.8.pkl')  # load the model

predictions = best_model.predict(X_numerical)

In [99]:

# Add predictions to the dataframe
finaldf['prediction'] = predictions


In [100]:
finaldf

Unnamed: 0,text_processed,topic,sentiment,prediction
0,please cairo hit dont want alone please let to...,3.0,2,1
1,take vary day like sadness disappointment lone...,3.0,0,1
2,day like today wish chronic illness terminal y...,3.0,0,1
3,meet people hello abydoby speak im feeling lonely,3.0,0,0
4,ltheanine helped get bed thought long shot bou...,0.0,2,1
...,...,...,...,...
115,really last word first time second time realized,1.0,1,1
116,actually brushed teeth today actually brushed ...,1.0,0,1
117,relationship withdrawal weight alive feel much...,3.0,0,1
118,soon get close someone trauma dump share title...,3.0,0,1


In [None]:
# neg- 0 , pos- 2, neutral-1
# Label 0: anxiety
# Label 1: depression
# Topic 0 : Seeking Support
# Topic 1 : Life Events and Relationships
# Topic 2 : Social Anxiety and Work Challenges
# Topic 3 : Difficulty with Relationships and Life in General

In [103]:
finaldf.topic[0]

'3.0'

In [106]:
finaldf.prediction.value_counts()

prediction
anxiety       62
depression    58
Name: count, dtype: int64

In [104]:
# Define mappings
label_mapping = {0: "anxiety", 1: "depression"}
topic_mapping = {'0.0': "Seeking Support", '1.0': "Life Events and Relationships", '2.0': "Social Anxiety and Work Challenges", '3.0': "Difficulty with Relationships and Life in General"}
sentiment_mapping = {'0': "Negative", '1': "Neutral", '2': "Positive"}

In [105]:
# Map labels, topics, and sentiments to their corresponding names
finaldf["prediction"] = finaldf["prediction"].map(label_mapping)
finaldf["topic"] = finaldf["topic"].map(topic_mapping)
finaldf["sentiment"] = finaldf["sentiment"].map(sentiment_mapping)


finaldf.head()

Unnamed: 0,text_processed,topic,sentiment,prediction
0,please cairo hit dont want alone please let to...,Difficulty with Relationships and Life in General,Positive,depression
1,take vary day like sadness disappointment lone...,Difficulty with Relationships and Life in General,Negative,depression
2,day like today wish chronic illness terminal y...,Difficulty with Relationships and Life in General,Negative,depression
3,meet people hello abydoby speak im feeling lonely,Difficulty with Relationships and Life in General,Negative,anxiety
4,ltheanine helped get bed thought long shot bou...,Seeking Support,Positive,depression


In [107]:
finaldf.topic.value_counts()

topic
Difficulty with Relationships and Life in General    52
Seeking Support                                      35
Life Events and Relationships                        23
Social Anxiety and Work Challenges                   10
Name: count, dtype: int64

In [108]:
finaldf.sentiment.value_counts()

sentiment
Negative    79
Positive    35
Neutral      6
Name: count, dtype: int64

In [145]:
finaldf.head()

Unnamed: 0,text_processed,topic,sentiment,prediction
0,please cairo hit dont want alone please let to...,Difficulty with Relationships and Life in General,Positive,depression
1,take vary day like sadness disappointment lone...,Difficulty with Relationships and Life in General,Negative,depression
2,day like today wish chronic illness terminal y...,Difficulty with Relationships and Life in General,Negative,depression
3,meet people hello abydoby speak im feeling lonely,Difficulty with Relationships and Life in General,Negative,anxiety
4,ltheanine helped get bed thought long shot bou...,Seeking Support,Positive,depression


#### single input string 

In [142]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from gensim import corpora, models
from sklearn.preprocessing import LabelEncoder
import joblib
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import string

# Function to clean text data
def clean_text(text):
    # Check for NaN
    if pd.isna(text):
        return ''
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove text within brackets
    text = re.sub(r'\[.*?\]', '', text)
    
    # Remove emojis
    text = text.encode('ascii', 'ignore').decode('utf-8')
    
    # Remove additional parentheses
    text = re.sub(r'\(+\)', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove newline characters and extra whitespaces
    text = re.sub(r'\s+', ' ', text.replace('\n', ' ').strip())
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove hashtags (words starting with '#')
    text = re.sub(r'#\w+', '', text)
    
    # Remove mentions (words starting with '@')
    text = re.sub(r'@\w+', '', text)
    
    return text

# Function to preprocess text data
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Load the pre-trained model and vectorizer
vectorizer = joblib.load('vectorizer.pkl')
best_model = joblib.load('final_model_split_0.8.pkl')
lda_model = models.LdaModel.load("lda_model.gensim")
dictionary = corpora.Dictionary.load("dictionary.gensim")
# Load sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Define mappings
label_mapping = {0: "anxiety", 1: "depression"}
topic_mapping = {'0': "Seeking Support", '1': "Life Events and Relationships", '2': "Social Anxiety and Work Challenges", '3': "Difficulty with Relationships and Life in General"}
sentiment_mapping = {'0': "Negative", '1': "Neutral", '2': "Positive"}

# Function to classify sentiment as positive, negative, or neutral
def get_sentiment_label(text):
    sentiment_score = analyzer.polarity_scores(text)['compound']
    if sentiment_score >= 0.05:
        return 'Positive'
    elif sentiment_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# User text
user_text = "i wanna die"

# Data preprocessing
cleaned_text = clean_text(user_text)
preprocessed_text = preprocess_text(cleaned_text)

# Sentiment analysis
sentiment = get_sentiment_label(preprocessed_text)
encoder = LabelEncoder()
sentiment = encoder.fit_transform([sentiment])[0]

# Apply LDA model to each post using apply function
def infer_topic(post):
    tokenized_post = preprocess_text(post)
    bow_post = dictionary.doc2bow(tokenized_post.split())
    topic_distribution = lda_model.get_document_topics(bow_post)
    dominant_topic = max(topic_distribution, key=lambda x: x[1])
    return dominant_topic[0], dominant_topic[1]

# Apply the function to the user text
topic, topic_probability = infer_topic(preprocessed_text)

# Prediction using pre-trained model
X_text = vectorizer.transform([preprocessed_text])
X_text_df = pd.DataFrame(X_text.toarray())
X_numerical = pd.concat([X_text_df, pd.DataFrame({'sentiment': sentiment, 'topic': str(topic)}, index=[0])], axis=1)
X_numerical.columns = X_numerical.columns.astype(str)

# Predict label
prediction = best_model.predict(X_numerical)
label = label_mapping[prediction[0]]

# Display result
print("Analysis Result:")
print(f"Emotional State: {label}")
print(f"Sentiment: {sentiment_mapping[str(sentiment)]}")
print(f"Topic: {topic_mapping[str(topic)]} (Probability: {topic_probability:.2f})")



Analysis Result:
Emotional State: depression
Sentiment: Negative
Topic: Difficulty with Relationships and Life in General (Probability: 0.81)


In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from gensim import corpora, models
from sklearn.preprocessing import LabelEncoder
import joblib
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import string

# Function to clean text data
def clean_text(text):
    # Check for NaN
    if pd.isna(text):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove text within brackets
    text = re.sub(r'\[.*?\]', '', text)

    # Remove emojis
    text = text.encode('ascii', 'ignore').decode('utf-8')

    # Remove additional parentheses
    text = re.sub(r'\(+\)', '', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove newline characters and extra whitespaces
    text = re.sub(r'\s+', ' ', text.replace('\n', ' ').strip())

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove hashtags (words starting with '#')
    text = re.sub(r'#\w+', '', text)

    # Remove mentions (words starting with '@')
    text = re.sub(r'@\w+', '', text)

    return text

# Function to preprocess text data
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Load the pre-trained model and vectorizer
vectorizer = joblib.load('vectorizer.pkl')
best_model = joblib.load('final_model_split_0.8.pkl')
lda_model = models.LdaModel.load("lda_model.gensim")
dictionary = corpora.Dictionary.load("dictionary.gensim")
# Load sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Define mappings
label_mapping = {0: "anxiety", 1: "depression"}
topic_mapping = {'0': "Seeking Support", '1': "Life Events and Relationships", '2': "Social Anxiety and Work Challenges", '3': "Difficulty with Relationships and Life in General"}
sentiment_mapping = {'0': "Negative", '1': "Neutral", '2': "Positive"}

# Function to classify sentiment as positive, negative, or neutral
def get_sentiment_label(text):
    sentiment_score = analyzer.polarity_scores(text)['compound']
    if sentiment_score >= 0.05:
        return 'Positive'
    elif sentiment_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# User text
user_text = "i am happy to be here to share my experience realted to my past depressin "

# Data preprocessing
cleaned_text = clean_text(user_text)
preprocessed_text = preprocess_text(cleaned_text)

# Define words not related to depression/anxiety (adjust this list)
irrelevant_words = ["happy", "joyful", "excited", "good", "great", "celebrating", "achievement"]

# Check for irrelevant words (early exit if present)
if any(word in preprocessed_text.split() for word in irrelevant_words):
    print("Analysis Result:")
    print(f"Emotional State: Not Depression/Not Anxiety")
    exit()  

# Sentiment analysis
sentiment = get_sentiment_label(preprocessed_text)
encoder = LabelEncoder()
sentiment = encoder.fit_transform([sentiment])[0]
# Apply LDA model to each post using apply function
def infer_topic(post):
    tokenized_post = preprocess_text(post)
    bow_post = dictionary.doc2bow(tokenized_post.split())
    topic_distribution = lda_model.get_document_topics(bow_post)
    dominant_topic = max(topic_distribution, key=lambda x: x[1])
    return dominant_topic[0], dominant_topic[1]

# Apply the function to the user text
topic, topic_probability = infer_topic(preprocessed_text)

# Prediction using pre-trained model
X_text = vectorizer.transform([preprocessed_text])
X_text_df = pd.DataFrame(X_text.toarray())
X_numerical = pd.concat([X_text_df, pd.DataFrame({'sentiment': sentiment, 'topic': str(topic)}, index=[0])], axis=1)
X_numerical.columns = X_numerical.columns.astype(str)

# Predict label
prediction = best_model.predict(X_numerical)
label = label_mapping[prediction[0]]

# Display result
print("Analysis Result:")
print(f"Emotional State: {label}")
print(f"Sentiment: {sentiment_mapping[str(sentiment)]}")
print(f"Topic: {topic_mapping[str(topic)]} (Probability: {topic_probability:.2f})")

Analysis Result:
Emotional State: Not Depression/Not Anxiety
Analysis Result:
Emotional State: depression
Sentiment: Negative
Topic: Social Anxiety and Work Challenges (Probability: 0.51)


In [165]:
a=finaldf[finaldf['sentiment']=="Neutral"][['prediction','text_processed']]
a.text_processed

12                                     could reason girl
17     taking total xanax everyday much take morning ...
37      really last word first time second time realized
55                                     could reason girl
94     taking total xanax everyday much take morning ...
115     really last word first time second time realized
Name: text_processed, dtype: object

In [166]:
a.text_processed[94]

'taking total xanax everyday much take morning night much'

In [167]:
finaldf.shape

(120, 4)

Unnamed: 0,text_processed,topic,sentiment,prediction
0,please cairo hit dont want alone please let to...,Difficulty with Relationships and Life in General,Positive,depression
1,take vary day like sadness disappointment lone...,Difficulty with Relationships and Life in General,Negative,depression
2,day like today wish chronic illness terminal y...,Difficulty with Relationships and Life in General,Negative,depression
3,meet people hello abydoby speak im feeling lonely,Difficulty with Relationships and Life in General,Negative,anxiety
4,ltheanine helped get bed thought long shot bou...,Seeking Support,Positive,depression
...,...,...,...,...
108,relationship im year old adult nil experience ...,Social Anxiety and Work Challenges,Positive,anxiety
109,feeling threatened trauma response replay flip...,Difficulty with Relationships and Life in General,Positive,anxiety
112,anyone sh option normally used cut got boring ...,Difficulty with Relationships and Life in General,Negative,depression
118,soon get close someone trauma dump share title...,Difficulty with Relationships and Life in General,Negative,depression
