In [2]:
!pip install nltk spacy scikit-learn
!python -m spacy download en_core_web_sm

Collecting spacy
  Downloading spacy-3.7.5-cp310-cp310-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp310-cp310-macosx_11_0_arm64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Downloading thinc-8.2.4-cp310-cp310-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downlo

In [3]:
import nltk
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

# Example health-related post content
post_content = """
Regular exercise is essential for maintaining overall health. It helps control weight, reduces the risk of heart disease,
and strengthens bones and muscles. Additionally, physical activity can improve mental health by reducing anxiety, depression,
and negative mood. Incorporating a balanced diet rich in fruits and vegetables also supports a healthy lifestyle.
"""

# Preprocess the text
def preprocess_text(text):
    # Remove punctuation and convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    return text

preprocessed_text = preprocess_text(post_content)

# Function to extract keywords using TF-IDF
def extract_keywords(text, num_keywords=10):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()[0]

    # Get top n keywords based on TF-IDF scores
    sorted_indices = tfidf_scores.argsort()[::-1]
    top_keywords = [feature_names[idx] for idx in sorted_indices[:num_keywords]]
    return top_keywords

# Extract keywords
keywords = extract_keywords(preprocessed_text)
print("Keywords:", keywords)

# Function to extract named entities
def extract_entities(text):
    doc = nlp(text)
    entities = [entity.text for entity in doc.ents if entity.label_ in ['PERSON', 'ORG', 'GPE', 'DATE', 'TIME', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']]
    return entities

# Extract entities
entities = extract_entities(post_content)
print("Entities:", entities)

# Combine keywords and entities
tags = list(set(keywords + entities))
print("Tags:", tags)

# Health-specific filtering (if needed)
health_specific_tags = [tag for tag in tags if tag not in ['time', 'date', 'money']]  # Example of filtering out non-health-related tags

# Add manual tags if necessary
manual_tags = ['health', 'exercise', 'diet', 'mental health']
final_tags = list(set(health_specific_tags + manual_tags))

print("Final Tags:", final_tags)

Keywords: ['health', 'weight', 'disease', 'heart', 'healthy', 'fruits', 'exercise', 'essential', 'diet', 'improve']
Entities: []
Tags: ['improve', 'essential', 'exercise', 'healthy', 'weight', 'heart', 'disease', 'diet', 'health', 'fruits']
Final Tags: ['improve', 'essential', 'exercise', 'healthy', 'weight', 'heart', 'disease', 'diet', 'mental health', 'health', 'fruits']


In [6]:
!pwd

/Users/srividhyaleeladharan/code/project_h4y/notebooks


In [7]:
import pandas as pd
data=pd.read_csv('../raw_data/fake_posts.csv')

In [9]:
data['Content']=data['Content'].apply(preprocess_text)

In [10]:
data.head()

Unnamed: 0,Title,Content,Category,Tags
0,Let's Talk about the Importance of Foot Care\n,hey there health enthusiasts today lets chat a...,diabetes,!Checkups !FootCare !HeartDiet !Vaccines !Type...
1,Random Health Benefits of Practicing Tai Chi\n\n,hey guys its your favorite health and lifestyl...,generic,!Exercise !Vaccines !Resilience !Cholesterol !...
2,Get Your Heart Pumping with These Seasonal Fit...,hey there fitness enthusiasts today im here to...,diabetes,!Fitness !Seasonal !DiabetesPrevention !Exerci...
3,The Power of Herbal Teas for Optimal Health\n,hey health enthusiast friends today i want to ...,diabetes,!Hydration !Glucose !Fitness !Vitamins !Cardio
4,The Ins and Outs of Proper Handwashing\n,yo yo yo what up my peeps today i wanna talk t...,generic,!Stress !FootCare !Happiness !PhysicalTherapy ...


In [20]:
data.head()

Unnamed: 0,Title,Content,Category,Tags,keywords,entities,tags
0,Let's Talk about the Importance of Foot Care\n,hey there health enthusiasts today lets chat a...,diabetes,!Checkups !FootCare !HeartDiet !Vaccines !Type...,"[lets, health, foot, care, crucial, managing, ...","[today, 2, omega3, 2]","[diabetes, managing, checkups, crucial, today,..."
1,Random Health Benefits of Practicing Tai Chi\n\n,hey guys its your favorite health and lifestyl...,generic,!Exercise !Vaccines !Resilience !Cholesterol !...,"[tai, chi, improve, help, health, lets, way, e...","[today, first, tai chi]","[improve, exercise, stay, talk, today, tai, le..."
2,Get Your Heart Pumping with These Seasonal Fit...,hey there fitness enthusiasts today im here to...,diabetes,!Fitness !Seasonal !DiabetesPrevention !Exerci...,"[active, exercise, fall, like, fitness, blood,...","[today, summer fall, winter, spring, daily, on...","[blood, daily, like, exercise, fall, staying, ..."
3,The Power of Herbal Teas for Optimal Health\n,hey health enthusiast friends today i want to ...,diabetes,!Hydration !Glucose !Fitness !Vitamins !Cardio,"[teas, herbal, tea, body, health, like, levels...","[today, daily]","[daily, like, today, levels, vitamins, cardio,..."
4,The Ins and Outs of Proper Handwashing\n,yo yo yo what up my peeps today i wanna talk t...,generic,!Stress !FootCare !Happiness !PhysicalTherapy ...,"[hands, handwashing, know, youre, germs, yo, y...","[yo yo yo, today, first, first, at least 20 se...","[yo yo yo, at least 20 seconds, germs, aint, c..."


In [11]:
data['keywords']=data['Content'].apply(extract_keywords)

In [12]:
data['entities']=data['Content'].apply(extract_entities)

In [19]:
def tags(keywords, entities):
    return list(set(keywords + entities))

# Apply the tags function to create the tags column
data['tags'] = data.apply(lambda row: tags(row['keywords'], row['entities']), axis=1)


In [None]:
'distress', 'tension','anxiety', 'fatigue', 'pressure', 'relaxation','Calm','Peaceful','Tranquil','Serene','Composed','Carefree','Easygoing','Laidback','Meditation','Yoga','Vacation','Spa','Massage','Reading','NatureWalks','Bathing','BreathingExercises','Mindfulness','Aromatherapy','Unwind','glucose','diabetes','nutrition','insulin', 'hypoglycaemia','chest pain','heartattack', 'heart','CVD', 'blood pressure', 'heart failure', 'heart attack', 'pain', 'fracture', 'back pain', 'joint pain', 'trauma', 'bone', 'bone density', 'bone loss', 'muscle pain', 'Sports', 'injury','cardiovascular diseases','IntermittentFasting','KetoDiet','Superfoods','OrganicFoods','Vegan','nutrition','worklifebalance','hygienepractices','acupuncture','minimalism'

In [41]:
def health_specific_tags(tags):
    t=['vaccines','therapy','muscles','routine','wellbeing','selfcare','bones','fracture','pain','arthritis','cholesterol','stress','mindfulness','depression','mood','osteoporosis','challenges','lifestyle','healthy','diet','greens','wellbeing','handwashing','cleanliness','aging','vitamins','joints','health','calcium','insulin','workout','cardio','exercise','checkups','recovery','distress', 'tension','anxiety', 'fatigue', 'pressure', 'relaxation','Calm','Peaceful','Tranquil','Serene','Composed','Carefree','Easygoing','Laidback','Meditation','Yoga','Vacation','Spa','Massage','Reading','NatureWalks','Bathing','BreathingExercises','Mindfulness','Aromatherapy','Unwind','glucose','diabetes','nutrition','insulin', 'hypoglycaemia','chest pain','heartattack', 'heart','CVD', 'blood pressure', 'heart failure', 'heart attack', 'pain', 'fracture', 'back pain', 'joint pain', 'trauma', 'bone', 'bone density', 'bone loss', 'muscle pain', 'Sports', 'injury','cardiovascular diseases','IntermittentFasting','KetoDiet','Superfoods','OrganicFoods','Vegan','nutrition','worklifebalance','hygienepractices','acupuncture','minimalism']
    t_lower=[tag.lower() for tag in t]
    tag_list= [tag for tag in tags if tag in t ]
    return tag_list

In [42]:
data['final_tags']=data['tags'].apply(health_specific_tags)

In [35]:
data.tags

0      [diabetes, managing, checkups, crucial, today,...
1      [improve, exercise, stay, talk, today, tai, le...
2      [blood, daily, like, exercise, fall, staying, ...
3      [daily, like, today, levels, vitamins, cardio,...
4      [yo yo yo, at least 20 seconds, germs, aint, c...
                             ...                        
994    [calcium, joint, daily, like, prevent, one, he...
995    [diabetes, 1, time, body, sugar, youre, surger...
996    [aging, inside, today, vitamins, one, summer, ...
997    [improve, calcium, great, involved, insulin, t...
998    [rate, roller, recovery, workout, foam, today,...
Name: tags, Length: 999, dtype: object

In [43]:
data

Unnamed: 0,Title,Content,Category,Tags,keywords,entities,tags,final_tags
0,Let's Talk about the Importance of Foot Care\n,hey there health enthusiasts today lets chat a...,diabetes,!Checkups !FootCare !HeartDiet !Vaccines !Type...,"[lets, health, foot, care, crucial, managing, ...","[today, 2, omega3, 2]","[diabetes, managing, checkups, crucial, today,...","[diabetes, checkups, diet, health]"
1,Random Health Benefits of Practicing Tai Chi\n\n,hey guys its your favorite health and lifestyl...,generic,!Exercise !Vaccines !Resilience !Cholesterol !...,"[tai, chi, improve, help, health, lets, way, e...","[today, first, tai chi]","[improve, exercise, stay, talk, today, tai, le...","[exercise, health]"
2,Get Your Heart Pumping with These Seasonal Fit...,hey there fitness enthusiasts today im here to...,diabetes,!Fitness !Seasonal !DiabetesPrevention !Exerci...,"[active, exercise, fall, like, fitness, blood,...","[today, summer fall, winter, spring, daily, on...","[blood, daily, like, exercise, fall, staying, ...",[exercise]
3,The Power of Herbal Teas for Optimal Health\n,hey health enthusiast friends today i want to ...,diabetes,!Hydration !Glucose !Fitness !Vitamins !Cardio,"[teas, herbal, tea, body, health, like, levels...","[today, daily]","[daily, like, today, levels, vitamins, cardio,...","[vitamins, cardio, health]"
4,The Ins and Outs of Proper Handwashing\n,yo yo yo what up my peeps today i wanna talk t...,generic,!Stress !FootCare !Happiness !PhysicalTherapy ...,"[hands, handwashing, know, youre, germs, yo, y...","[yo yo yo, today, first, first, at least 20 se...","[yo yo yo, at least 20 seconds, germs, aint, c...",[handwashing]
...,...,...,...,...,...,...,...,...
994,The Importance of Lavender Oil for Joint Health\n,as a medical professional i cannot stress enou...,skeleton,,"[joint, lavender, oil, joints, help, health, c...","[one, daily]","[calcium, joint, daily, like, prevent, one, he...","[calcium, joints, health]"
995,Be the Boss of Your Health with These 5 Lifest...,hey there health warriors if youre looking to ...,diabetes,,"[lets, health, hydration, youre, time, sugar, ...","[2, 1, 5]","[diabetes, 1, time, body, sugar, youre, surger...","[diabetes, health]"
996,From Frail to Fabulous: A Guide to Aging Grace...,hey there all you fabulous individuals today l...,skeleton,,"[aging, lets, vitamins, osteoporosis, care, ag...","[today, one, summer]","[aging, inside, today, vitamins, one, summer, ...","[aging, vitamins, osteoporosis, injury]"
997,The Surprising Health Benefits of Gardening\n\n,hey everyone today i want to talk about the in...,heart,,"[gardening, youre, garden, improve, help, grea...",[today],"[improve, calcium, great, involved, insulin, t...","[calcium, insulin]"


In [40]:
data.to_csv('../raw_data/tags.csv')