### Data Cleaning
(Removing Duplicates, Incomplete Rows, Stopwords)

In [2]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources used below: the stopword corpus and the
# 'punkt' / 'punkt_tab' tokenizer packages.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load the dataset, then
#   1. drop rows whose 'text' value is missing, and
#   2. drop rows whose 'text' duplicates an earlier row.
df = (
    pd.read_csv('mental_health.csv')
    .dropna(subset=['text'])
    .drop_duplicates(subset=['text'])
)

# 3. Text Preprocessing

# Negation-style words that must survive stopword removal
negative_words = {
    "not", "no", "nor", "never", "nothing", "nowhere", "neither",
    "cannot", "n't", "without", "barely", "hardly", "scarcely",
}

# Define a function to clean the text
# Lazily filled cache so the NLTK stopword list is loaded only once.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once.

    ``stopwords.words('english')`` returns a list; calling it for every token
    repeats that lookup and makes each membership test a linear scan.  The
    list is therefore fetched on first use only and kept as a frozenset.
    """
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(text):
    """Normalize one raw text entry for vectorization.

    Steps, in order: strip URLs, strip @mentions, drop every character that
    is not an ASCII letter or whitespace, lowercase, tokenize, and remove
    English stopwords while keeping any word listed in ``negative_words``.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty
        string when nothing survives).
    """
    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)

    # Remove special characters, numbers, and punctuation.
    # NOTE: apostrophes are removed here as well, so no token can contain
    # one; the "n't" entry in negative_words can therefore never match.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Convert text to lowercase
    text = text.lower()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords, but keep negative words
    stop_words = _english_stopwords()
    kept = [
        word for word in tokens
        if word not in stop_words or word in negative_words
    ]

    # Join the tokens back into a single string
    return ' '.join(kept)

# Apply the cleaning function to the 'text' column
df['cleaned_text'] = df['text'].apply(clean_text)

# 4. Feature Extraction using TF-IDF Vectorization

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=10000)  # You can adjust the max_features

# Fit and transform the cleaned text data
X = vectorizer.fit_transform(df['cleaned_text'])

# Convert the result to a DataFrame for easier understanding (optional)
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# You now have a cleaned and vectorized dataset.
print(X_df.head())

# Save the preprocessed dataset (optional)
df.to_csv('preprocessed_mental_health.csv', index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


    aa  aaa  aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa   ab  abandon  abandoned  \
0  0.0  0.0                                 0.0  0.0      0.0        0.0   
1  0.0  0.0                                 0.0  0.0      0.0        0.0   
2  0.0  0.0                                 0.0  0.0      0.0        0.0   
3  0.0  0.0                                 0.0  0.0      0.0        0.0   
4  0.0  0.0                                 0.0  0.0      0.0        0.0   

   abandoning  abandonment  abc  abdomen  ...  zero  zoloft  zombie  zombies  \
0         0.0          0.0  0.0      0.0  ...   0.0     0.0     0.0      0.0   
1         0.0          0.0  0.0      0.0  ...   0.0     0.0     0.0      0.0   
2         0.0          0.0  0.0      0.0  ...   0.0     0.0     0.0      0.0   
3         0.0          0.0  0.0      0.0  ...   0.0     0.0     0.0      0.0   
4         0.0          0.0  0.0      0.0  ...   0.0     0.0     0.0      0.0   

   zone  zones  zoning  zoom  zuckerberg  zyprexa  
0   0.0   

In [None]:
# Export this notebook to HTML by running nbconvert through an IPython
# shell escape ("!" prefix).
!jupyter nbconvert --to html 02_Data_Preprocessing.ipynb
#

[NbConvertApp] Converting notebook 02_Data_Preprocessing.ipynb to html
[NbConvertApp] Writing 280507 bytes to 02_Data_Preprocessing.html
