In [1]:
# from google.colab import files
# uploaded = files.upload()


In [2]:
import pandas as pd
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/wordnet')
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    print("Downloading necessary NLTK data...")
    nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'omw-1.4', 'averaged_perceptron_tagger_eng'], quiet=True)
    print("NLTK data download complete.")

Downloading necessary NLTK data...
NLTK data download complete.


In [4]:
tweet_tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()

In [5]:
def load_data(data_path='../data/twitter_training.csv', text_column='Tweet'):
    """Loads the dataset and prepares the DataFrame."""
    try:
        df = pd.read_csv(data_path, header=None, encoding='latin1')
        df.columns = ['ID', 'Entity', 'Sentiment', text_column]
        df = df.dropna(subset=[text_column]).reset_index(drop=True)
        return df
    except FileNotFoundError:
        print(f"Error: Data file not found at {data_path}.")
        return None

In [6]:
def get_wordnet_pos(tag):
    """Map NLTK POS tags to WordNet POS tags"""
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [7]:
def extract_features(text):
    hashtags = re.findall(r'#(\w+)', text)
    mentions = re.findall(r'@(\w+)', text)
    return ' '.join(hashtags), ' '.join(mentions)

In [8]:
def clean_and_tokenize(text):
    if pd.isna(text) or not isinstance(text, str):
        return []

    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    tokens = tweet_tokenizer.tokenize(text)

    return tokens

In [9]:
def lemmatize_tokens(tokens):
    """Lemmatizes tokens using POS tagging for context-aware normalization."""
    tagged_tokens = nltk.pos_tag(tokens)

    lemmatized_tokens = []
    for word, tag in tagged_tokens:
        wntag = get_wordnet_pos(tag)
        if isinstance(word, str):
            lemma = lemmatizer.lemmatize(word, pos=wntag)
            lemmatized_tokens.append(lemma)

    return lemmatized_tokens

In [10]:
def preprocess_data(df, text_column='Tweet'):
    df[text_column] = df[text_column].fillna('')

    df['hashtags'], df['mentions'] = zip(*df[text_column].apply(extract_features))

    df['tokens'] = df[text_column].apply(clean_and_tokenize)

    df['lemmas'] = df['tokens'].apply(lemmatize_tokens)

    df['processed_text'] = df['lemmas'].apply(lambda x: ' '.join(x))

    return df

In [11]:
def vectorize_data(df, text_column='processed_text'):

    tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True)

    tfidf_matrix = tfidf_vectorizer.fit_transform(df[text_column])

    feature_names = tfidf_vectorizer.get_feature_names_out()
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

    return tfidf_df, tfidf_vectorizer

In [None]:
if __name__ == '__main__':
    TEXT_COLUMN = 'Tweet'

    print("--- 1. Loading Data ---")
    df = load_data(text_column=TEXT_COLUMN)

    if df is None:
        print("Exiting script due to data loading error.")
        exit()

    df_sample = df.head(100).copy()

    print(f"Successfully loaded {len(df)} rows. Processing a sample of {len(df_sample)} rows.")
    print("\n--- Original Data Sample (First 5 rows) ---")
    print(df_sample[[TEXT_COLUMN]].head().to_markdown(index=False))

    print("\n--- 2. Applying Preprocessing Pipeline ---")
    processed_df = preprocess_data(df_sample, text_column=TEXT_COLUMN)

    print("\n--- 3. Processed Data Sample (First 5 rows) ---")
    print(processed_df[[TEXT_COLUMN, 'hashtags', 'mentions', 'processed_text']].head().to_markdown(index=False))

    print("\n--- 4. Applying TF-IDF Vectorization (sublinear_tf=True) ---")
    tfidf_df, vectorizer = vectorize_data(processed_df)

    print(f"\nTotal features (vocabulary size): {len(vectorizer.get_feature_names_out())}")

    print("\n--- 5. TF-IDF Vectorization Sample (First 5 rows, first 10 features) ---")

    feature_names = vectorizer.get_feature_names_out()[:10]
    tfidf_sample_output = tfidf_df.iloc[:5, :10]
    tfidf_sample_output.columns = feature_names
    tfidf_sample_output.index = [f"Doc {i+1}" for i in range(5)]

    print(tfidf_sample_output.to_markdown())

    print("\n--- Script Execution Complete ---")

--- 1. Loading Data ---
Error: Data file not found at ../data/twitter_training.csv.
Exiting script due to data loading error.


AttributeError: 'NoneType' object has no attribute 'head'

: 

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

features = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc',
            'smoke', 'alco', 'active', 'gender', 'cardio']
corr_matrix = df_clean[features].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix,
            annot=True,
            fmt=".2f",
            cmap='coolwarm',
            cbar=True,
            square=True,
            linewidths=.5,
            linecolor='black')
plt.title('Correlation Matrix of Clinical Features and Cardiovascular Disease')
plt.show()

print("\nCorrelation with Cardiovascular Disease (cardio):")
print(corr_matrix['cardio'].sort_values(ascending=False))

In [None]:
import zipfile
import os

zip_file_path = '/content/cardio_train.csv.zip'

extraction_dir = '/content/'

os.makedirs(extraction_dir, exist_ok=True)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_dir)

print(f"'{zip_file_path}' unzipped to '{extraction_dir}'")


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

features = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc',
            'smoke', 'alco', 'active', 'gender', 'cardio']

# Ensure all specified features exist in the DataFrame
missing_features = [f for f in features if f not in df_clean.columns]
if missing_features:
    print(f"Warning: The following features are missing from df_clean: {missing_features}")
    # Filter out missing features for correlation calculation
    available_features = [f for f in features if f in df_clean.columns]
else:
    available_features = features

corr_matrix = df_clean[available_features].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix,
            annot=True,
            fmt=".2f",
            cmap='coolwarm',
            cbar=True,
            square=True,
            linewidths=.5,
            linecolor='black')
plt.title('Correlation Matrix of Clinical Features and Cardiovascular Disease')
plt.show()

print("\nCorrelation with Cardiovascular Disease (cardio):")
if 'cardio' in available_features:
    print(corr_matrix['cardio'].sort_values(ascending=False))
else:
    print(" 'cardio' feature is not available for correlation calculation.")

In [None]:
df_eda = df_clean.copy()
df_eda['age_years'] = (df_eda['age'] / 365.25).round().astype(int)

min_age = df_eda['age_years'].min()
max_age = df_eda['age_years'].max()
bins = range(min_age, max_age + 6, 5)

df_eda['age_bin'] = pd.cut(df_eda['age_years'],
                           bins=bins,
                           right=False,
                           labels=[f'{i}-{i+4}' for i in bins[:-1]])

prevalence_by_age = df_eda.groupby('age_bin', observed=False)['cardio'].agg(
    total_patients=('size'),
    disease_prevalence=('mean')
).reset_index()

print("Disease Prevalence by 5-Year Age Bin:")
print(prevalence_by_age)

plt.figure(figsize=(10, 6))
sns.barplot(x='age_bin', y='disease_prevalence', data=prevalence_by_age, palette='viridis')
plt.title('Cardiovascular Disease Prevalence by 5-Year Age Group')
plt.xlabel('Age Bin (Years)')
plt.ylabel('Disease Prevalence (Ratio)')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()

In [None]:
def categorize_bp(ap_hi, ap_lo):
    if ap_hi < 120 and ap_lo < 80:
        return 'Normal'
    elif (ap_hi >= 140 or ap_lo >= 90):
        return 'Stage 2 Hypertension'
    elif (ap_hi >= 130 and ap_hi < 140) or (ap_lo >= 80 and ap_lo < 90):
        return 'Stage 1 Hypertension'
    elif (ap_hi >= 120 and ap_hi < 130) and (ap_lo < 80):
        return 'Elevated/Pre-Hypertension'
    else:
        return 'Stage 1 Hypertension'

df_eda['bp_category'] = df_eda.apply(
    lambda row: categorize_bp(row['ap_hi'], row['ap_lo']),
    axis=1
)

plt.figure(figsize=(8, 6))
bp_counts = df_eda['bp_category'].value_counts().sort_index()
sns.barplot(x=bp_counts.index, y=bp_counts.values, palette='plasma')
plt.title('Patient Distribution by Blood Pressure Category')
plt.xlabel('Blood Pressure Category')
plt.ylabel('Number of Patients')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
import plotly.express as px

fig = px.scatter(df_eda,
                 x='age_years',
                 y='weight',
                 color='cardio',
                 color_discrete_map={0: 'blue', 1: 'red'},
                 title='Age vs. Weight by Cardiovascular Disease Status',
                 labels={'cardio': 'Cardio Disease (1=Yes, 0=No)'},
                 hover_data=['ap_hi', 'ap_lo', 'cholesterol'])

fig.show()

In [None]:
from scipy.stats import chi2_contingency

contingency_table = pd.crosstab(df_eda['cholesterol'], df_eda['cardio'])

chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print("Chi-Square Test: Cholesterol vs. Cardiovascular Disease")
print(f"Chi-square statistic: {chi2:.2f}")
print(f"P-value: {p_value:.4f}")

if p_value < 0.05:
    print("Conclusion: The relationship between cholesterol level and cardiovascular disease is statistically significant.")
else:
    print("Conclusion: The relationship is NOT statistically significant.")