In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("Combined Data.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [4]:
df.isnull().sum()

Unnamed: 0      0
statement     362
status          0
dtype: int64

In [5]:
df.dtypes

Unnamed: 0     int64
statement     object
status        object
dtype: object

In [6]:
df.describe()

Unnamed: 0.1,Unnamed: 0
count,53043.0
mean,26521.0
std,15312.339501
min,0.0
25%,13260.5
50%,26521.0
75%,39781.5
max,53042.0


In [7]:
# Drop rows where 'statement' is null
df.dropna(subset=['statement'], inplace=True)

# Verify missing values have been handled
print("\nNull Values After Cleaning:\n", df.isnull().sum())



Null Values After Cleaning:
 Unnamed: 0    0
statement     0
status        0
dtype: int64


In [8]:
# Define a function to clean text
def clean_text(text):
    import re
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Apply the cleaning function to the 'statement' column
df['cleaned_statement'] = df['statement'].apply(clean_text)

# Display cleaned statements
print("\nCleaned Statements:\n", df[['statement', 'cleaned_statement']].head())



Cleaned Statements:
                                            statement  \
0                                         oh my gosh   
1  trouble sleeping, confused mind, restless hear...   
2  All wrong, back off dear, forward doubt. Stay ...   
3  I've shifted my focus to something else but I'...   
4  I'm restless and restless, it's been a month n...   

                                   cleaned_statement  
0                                         oh my gosh  
1  trouble sleeping confused mind restless heart ...  
2  all wrong back off dear forward doubt stay in ...  
3  i ve shifted my focus to something else but i ...  
4  i m restless and restless it s been a month no...  


In [9]:
from textblob import TextBlob

In [10]:
# Define a function to calculate sentiment polarity
def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

# Apply the function to calculate sentiment score
df['sentiment_score'] = df['cleaned_statement'].apply(get_sentiment)

# Display sentiment scores
print("\nSentiment Scores:\n", df[['cleaned_statement', 'sentiment_score']].head())



Sentiment Scores:
                                    cleaned_statement  sentiment_score
0                                         oh my gosh           0.0000
1  trouble sleeping confused mind restless heart ...          -0.3000
2  all wrong back off dear forward doubt stay in ...          -0.2500
3  i ve shifted my focus to something else but i ...           0.0000
4  i m restless and restless it s been a month no...          -0.3125


In [11]:
# Define a function to classify sentiment
def classify_sentiment(score):
    if score > 0:
        return 'Positive'
    elif score < 0:
        return 'Negative'
    else:
        return 'Neutral'

# Apply the function to classify sentiment
df['sentiment'] = df['sentiment_score'].apply(classify_sentiment)

# Display classified sentiments
print("\nClassified Sentiments:\n", df[['cleaned_statement', 'sentiment', 'sentiment_score']].head())



Classified Sentiments:
                                    cleaned_statement sentiment  \
0                                         oh my gosh   Neutral   
1  trouble sleeping confused mind restless heart ...  Negative   
2  all wrong back off dear forward doubt stay in ...  Negative   
3  i ve shifted my focus to something else but i ...   Neutral   
4  i m restless and restless it s been a month no...  Negative   

   sentiment_score  
0           0.0000  
1          -0.3000  
2          -0.2500  
3           0.0000  
4          -0.3125  


In [13]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
     ------------------------------------ 126.0/126.0 kB 494.5 kB/s eta 0:00:00
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2



[notice] A new release of pip available: 22.2.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Define a function for VADER sentiment analysis
def vader_sentiment(text):
    scores = analyzer.polarity_scores(text)
    return scores['compound'], scores['pos'], scores['neu'], scores['neg']

# Apply VADER analysis
df['vader_compound'], df['vader_positive'], df['vader_neutral'], df['vader_negative'] = zip(
    *df['cleaned_statement'].apply(vader_sentiment)
)

# Display results
print("\nVADER Sentiment Analysis:\n", df[['cleaned_statement', 'vader_compound', 'vader_positive', 'vader_neutral', 'vader_negative']].head())



VADER Sentiment Analysis:
                                    cleaned_statement  vader_compound  \
0                                         oh my gosh          0.0000   
1  trouble sleeping confused mind restless heart ...         -0.2263   
2  all wrong back off dear forward doubt stay in ...         -0.7351   
3  i ve shifted my focus to something else but i ...         -0.4215   
4  i m restless and restless it s been a month no...         -0.4939   

   vader_positive  vader_neutral  vader_negative  
0           0.000          1.000           0.000  
1           0.243          0.347           0.410  
2           0.121          0.421           0.458  
3           0.000          0.811           0.189  
4           0.000          0.769           0.231  


In [16]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
     -------------------------------------- 10.0/10.0 MB 148.8 kB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.23.2
  Downloading huggingface_hub-0.26.3-py3-none-any.whl (447 kB)
     ------------------------------------ 447.6/447.6 kB 120.6 kB/s eta 0:00:00
Collecting safetensors>=0.4.1
  Downloading safetensors-0.4.5-cp310-none-win_amd64.whl (285 kB)
     ------------------------------------ 285.9/285.9 kB 114.6 kB/s eta 0:00:00
Collecting tokenizers<0.21,>=0.20
  Downloading tokenizers-0.20.3-cp310-none-win_amd64.whl (2.4 MB)
     ---------------------------------------- 2.4/2.4 MB 101.8 kB/s eta 0:00:00
Collecting fsspec>=2023.5.0
  Downloading fsspec-2024.10.0-py3-none-any.whl (179 kB)
     ------------------------------------ 179.6/179.6 kB 135.5 kB/s eta 0:00:00
Installing collected packages: safetensors, fsspec, huggingface-hub, tokenizers, transformers
Successfully installed fsspec-2024.10


[notice] A new release of pip available: 22.2.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Vectorize text data
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['cleaned_statement']).toarray()

# Prepare labels (Assume 'status' contains the sentiment labels)
y = df['status']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Classification Report:
                       precision    recall  f1-score   support

             Anxiety       0.83      0.75      0.79       755
             Bipolar       0.88      0.70      0.78       527
          Depression       0.69      0.74      0.72      3016
              Normal       0.86      0.95      0.90      3308
Personality disorder       0.87      0.43      0.58       237
              Stress       0.71      0.50      0.58       536
            Suicidal       0.71      0.67      0.69      2158

            accuracy                           0.77     10537
           macro avg       0.79      0.68      0.72     10537
        weighted avg       0.77      0.77      0.77     10537



In [19]:
# Define a custom lexicon
custom_lexicon = {
    'happy': 1, 'joy': 1, 'sad': -1, 'anxious': -1, 'relaxed': 1, 'depressed': -1
}

# Function to calculate sentiment score based on custom lexicon
def lexicon_sentiment(text):
    score = 0
    words = text.split()
    for word in words:
        score += custom_lexicon.get(word, 0)  # Default score is 0 for unknown words
    return score

# Apply lexicon-based sentiment analysis
df['lexicon_sentiment'] = df['cleaned_statement'].apply(lexicon_sentiment)

# Display results
print("\nLexicon Sentiment Analysis:\n", df[['cleaned_statement', 'lexicon_sentiment']].head())
# Define a custom lexicon
custom_lexicon = {
    'happy': 1, 'joy': 1, 'sad': -1, 'anxious': -1, 'relaxed': 1, 'depressed': -1
}

# Function to calculate sentiment score based on custom lexicon
def lexicon_sentiment(text):
    score = 0
    words = text.split()
    for word in words:
        score += custom_lexicon.get(word, 0)  # Default score is 0 for unknown words
    return score

# Apply lexicon-based sentiment analysis
df['lexicon_sentiment'] = df['cleaned_statement'].apply(lexicon_sentiment)

# Display results
print("\nLexicon Sentiment Analysis:\n", df[['cleaned_statement', 'lexicon_sentiment']].head())



Lexicon Sentiment Analysis:
                                    cleaned_statement  lexicon_sentiment
0                                         oh my gosh                  0
1  trouble sleeping confused mind restless heart ...                  0
2  all wrong back off dear forward doubt stay in ...                  0
3  i ve shifted my focus to something else but i ...                  0
4  i m restless and restless it s been a month no...                  0

Lexicon Sentiment Analysis:
                                    cleaned_statement  lexicon_sentiment
0                                         oh my gosh                  0
1  trouble sleeping confused mind restless heart ...                  0
2  all wrong back off dear forward doubt stay in ...                  0
3  i ve shifted my focus to something else but i ...                  0
4  i m restless and restless it s been a month no...                  0


In [21]:
# Define rules
def rule_based_sentiment(text):
    if 'not good' in text or 'bad' in text:
        return 'Negative'
    elif 'great' in text or 'good' in text:
        return 'Positive'
    else:
        return 'Neutral'

# Apply rule-based sentiment analysis
df['rule_based_sentiment'] = df['cleaned_statement'].apply(rule_based_sentiment)

# Display results
print("\nRule-Based Sentiment Analysis:\n", df[['cleaned_statement', 'rule_based_sentiment']].head())





Rule-Based Sentiment Analysis:
                                    cleaned_statement rule_based_sentiment
0                                         oh my gosh              Neutral
1  trouble sleeping confused mind restless heart ...              Neutral
2  all wrong back off dear forward doubt stay in ...              Neutral
3  i ve shifted my focus to something else but i ...              Neutral
4  i m restless and restless it s been a month no...              Neutral


In [25]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd

# Assuming df is your dataframe with a 'cleaned_statement' and 'status' column

# Preprocessing
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['cleaned_statement'])
X = tokenizer.texts_to_sequences(df['cleaned_statement'])
X = pad_sequences(X, maxlen=100)

# Encode the labels
num_classes = len(df['status'].unique())
y = pd.get_dummies(df['status']).values

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build LSTM Model
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax')  # Updated to match the number of classes
])

# Compile Model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train Model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Predict and Evaluate
predictions = model.predict(X_test)
print("\nLSTM Predictions:\n", predictions)
 


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

LSTM Predictions:
 [[4.82181029e-04 1.57473347e-04 1.29887706e-03 ... 1.05253508e-04
  1.61512871e-04 3.47547466e-04]
 [2.09757229e-04 2.39789821e-04 9.92298126e-04 ... 3.83309525e-04
  8.61486697e-05 4.35040973e-04]
 [1.20682886e-03 9.96161580e-01 1.86011381e-03 ... 4.57796152e-04
  2.16886416e-04 4.64421064e-05]
 ...
 [7.30681495e-05 2.18324582e-04 8.84424210e-01 ... 6.76309646e-05
  4.35286274e-05 1.15138784e-01]
 [8.52357130e-04 3.23792687e-03 8.58530045e-01 ... 8.50513112e-04
  2.35125865e-03 1.31450042e-01]
 [3.89861455e-03 2.94903014e-02 8.68626013e-02 ... 8.35646331e-01
  2.99037900e-03 4.04080115e-02]]
