In [None]:

# Import necessary libraries and download NLTK data
import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)


True

In [None]:
# Install required packages
!pip install -q spacy textstat wordfreq emoji sentence-transformers


In [None]:
import pandas as pd
import numpy as np
import re
import emoji
import spacy
import textstat
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from collections import Counter
from wordfreq import word_frequency
from sentence_transformers import SentenceTransformer, util
import nltk

In [None]:
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("stopwords")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load English NLP model
nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer('all-MiniLM-L6-v2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:

# file upload
df = pd.read_csv("sm_data.csv")

# Basic feature functions
def basic_counts(text):
    """Return basic size statistics for ``text``.

    Args:
        text: Raw text of one post.

    Returns:
        A tuple ``(word_count, sentence_count, avg_sentence_length)`` where
        ``word_count`` is the number of ``word_tokenize`` tokens,
        ``sentence_count`` is the number of non-blank fragments obtained by
        splitting on runs of ``.``, ``!`` or ``?``, and
        ``avg_sentence_length`` is the mean number of whitespace-separated
        words per fragment (0 when there are none).
    """
    words = word_tokenize(text)
    # re.split yields an empty/whitespace-only fragment after a trailing
    # terminator ("Hi." -> ["Hi", ""]); drop those so they are not counted
    # as sentences.
    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
    lengths = [len(s.split()) for s in sentences]
    return len(words), len(sentences), np.mean(lengths or [0])

def lexical_diversity(text):
    """Return ``(type_token_ratio, content_word_ratio)`` for ``text``.

    Both ratios are computed over the lower-cased ``word_tokenize`` tokens.
    Content words are alphabetic tokens that are not NLTK English stopwords.
    Returns ``(0, 0)`` when the text produces no tokens.
    """
    words = word_tokenize(text.lower())
    if not words:
        return 0, 0
    # Build the stopword set once; calling stopwords.words() inside the
    # comprehension re-reads the list for every token.
    stop_set = set(stopwords.words('english'))
    content_words = [w for w in words if w.isalpha() and w not in stop_set]
    return len(set(words)) / len(words), len(content_words) / len(words)

def readability_scores(text):
    """Return four textstat readability scores for ``text``.

    The tuple order is: Flesch reading ease, Gunning fog, SMOG index,
    Dale-Chall readability score.
    """
    scorers = (
        textstat.flesch_reading_ease,
        textstat.gunning_fog,
        textstat.smog_index,
        textstat.dale_chall_readability_score,
    )
    return tuple(scorer(text) for scorer in scorers)

def syntactic_complexity(text):
    """Return ``(noun_phrase_count, avg_tree_depth)`` for ``text``.

    ``avg_tree_depth`` is the mean absolute distance, in token positions,
    between each non-punctuation token and its syntactic head (0 when there
    are no such tokens). The absolute value is required: signed offsets of
    left- and right-attached dependents cancel each other out.
    """
    doc = nlp(text)
    head_distances = [abs(token.head.i - token.i) for token in doc if token.dep_ != 'punct']
    noun_phrases = list(doc.noun_chunks)
    return len(noun_phrases), np.mean(head_distances) if head_distances else 0

def word_info(text):
    """Return ``(avg_syllables_per_token, avg_word_frequency)`` for ``text``.

    Syllables are averaged over every ``word_tokenize`` token; English word
    frequency (``wordfreq``) is averaged over alphabetic tokens only. Either
    value is 0 when there is nothing to average.
    """
    syllable_counts = []
    frequencies = []
    for tok in word_tokenize(text):
        syllable_counts.append(textstat.syllable_count(tok))
        if tok.isalpha():
            frequencies.append(word_frequency(tok, 'en'))

    mean_syllables = np.mean(syllable_counts) if syllable_counts else 0
    mean_frequency = np.mean(frequencies) if frequencies else 0
    return mean_syllables, mean_frequency

def tweet_specific(text):
    """Return ``(hashtag_count, mention_count, emoji_count)`` for ``text``.

    Hashtags and mentions are ``#`` / ``@`` followed by at least one word
    character; emojis are counted by ``emoji.emoji_count``.
    """
    n_hashtags = sum(1 for _ in re.finditer(r"#\w+", text))
    n_mentions = sum(1 for _ in re.finditer(r"@\w+", text))
    return n_hashtags, n_mentions, emoji.emoji_count(text)

def psycholinguistic_scores(text):
    """Return the mean length of the alphabetic tokens in ``text``.

    This is a proxy score only: it is the average character length of the
    lower-cased ``word_tokenize`` tokens that are purely alphabetic, or 0
    when there are none.
    """
    lengths = [len(tok) for tok in word_tokenize(text.lower()) if tok.isalpha()]
    if not lengths:
        return 0
    return np.mean(lengths)

def semantic_similarity(text):
    """Return the mean cosine similarity between adjacent sentences.

    Sentences are the non-blank fragments obtained by splitting ``text`` on
    runs of ``.``, ``!`` or ``?``. Each fragment is embedded with the
    module-level sentence-transformer ``model``.

    Returns:
        Mean similarity of sentence ``i`` with sentence ``i + 1`` as a float,
        or 0 when the text has fewer than two sentences.
    """
    # Drop the empty fragment re.split leaves after a trailing terminator;
    # otherwise a one-sentence text is treated as two and "" gets embedded.
    sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
    if len(sentences) < 2:
        return 0
    embeddings = model.encode(sentences, convert_to_tensor=True)
    # pytorch_cos_sim returns the full (n-1) x (n-1) cross-similarity matrix;
    # only its diagonal holds the adjacent pairs (i, i + 1).
    cross_sim = util.pytorch_cos_sim(embeddings[:-1], embeddings[1:])
    return cross_sim.diagonal().mean().item()

# Extraction loop: compute every feature group for each row of df['text'].
features = []
for raw_text in df['text']:
    # Coerce to str so a missing/non-string cell (e.g. NaN read as float)
    # does not crash the tokenizers; the later cells in this notebook do the same.
    text = str(raw_text)

    wc, sc, asl = basic_counts(text)
    ttr, cwr = lexical_diversity(text)
    fre, gfi, smog, dale = readability_scores(text)
    np_count, tree_depth = syntactic_complexity(text)
    avg_syll, avg_freq = word_info(text)
    ht_count, mention_count, emoji_count = tweet_specific(text)
    avg_word_len = psycholinguistic_scores(text)
    semantic_sim = semantic_similarity(text)

    features.append({
        'word_count': wc,
        'sentence_count': sc,
        'avg_sentence_length': asl,
        'type_token_ratio': ttr,
        'content_word_ratio': cwr,
        'flesch_reading_ease': fre,
        'gunning_fog_index': gfi,
        'smog_index': smog,
        'dale_chall_score': dale,
        'noun_phrase_count': np_count,
        'avg_tree_depth': tree_depth,
        'avg_syllables_per_word': avg_syll,
        'avg_word_frequency': avg_freq,
        'hashtag_count': ht_count,
        'mention_count': mention_count,
        'emoji_count': emoji_count,
        'avg_word_length': avg_word_len,
        'semantic_similarity': semantic_sim
    })

# Combine with original (column-wise; both frames share df's row order)
features_df = pd.DataFrame(features)
result_df = pd.concat([df, features_df], axis=1)

# Save output
result_df.to_csv("sm_data_with_features.csv", index=False)
print("Feature extraction complete! Saved to 'sm_data_with_features.csv'")


✅ Feature extraction complete! Saved to 'sm_data_with_features.csv'


**COH-METRIX ANALYSIS**

**1. Descriptive Features**

In [None]:

#!pip install spacy
#!python -m spacy download en_core_web_sm


In [None]:

import pandas as pd
import numpy as np
import spacy

# Load model
nlp = spacy.load("en_core_web_sm")

# Data Loading
df = pd.read_csv("sm_data.csv")

def full_descriptive_features(text):
    """Compute descriptive (surface-level) statistics for one text.

    The text is coerced to ``str`` and parsed with the module-level spaCy
    pipeline ``nlp``. "Words" are alphabetic tokens; content and function
    words are selected by coarse POS tag over all tokens; paragraphs are the
    pieces obtained by splitting on newlines.

    Returns:
        dict mapping feature name to value (counts, min/max/mean lengths,
        punctuation counts and a few ratios). Ratios and length statistics
        fall back to 0 when there is nothing to measure.
    """
    text = str(text)
    doc = nlp(text)

    content_tags = ['NOUN', 'VERB', 'ADJ', 'ADV']
    function_tags = ['ADP', 'PRON', 'CONJ', 'DET', 'CCONJ', 'SCONJ']

    # Single pass over the tokens to collect every token-level tally.
    words = []
    content_words = []
    function_words = []
    capitalized = 0
    punctuation = 0
    for tok in doc:
        if tok.is_alpha:
            words.append(tok.text)
            if tok.text[0].isupper():
                capitalized += 1
        if tok.pos_ in content_tags:
            content_words.append(tok.text)
        if tok.pos_ in function_tags:
            function_words.append(tok.text)
        if tok.is_punct:
            punctuation += 1

    word_lengths = [len(w) for w in words]
    sentences = list(doc.sents)
    sentence_lengths = [sum(1 for tok in sent if tok.is_alpha) for sent in sentences]
    paragraphs = text.split('\n')

    n_words = len(words)
    n_unique = len(set(words))

    features = {}
    features['char_count'] = len(text)
    features['letter_count'] = sum(ch.isalpha() for ch in text)
    features['word_count'] = n_words
    features['unique_word_count'] = n_unique
    features['content_word_count'] = len(content_words)
    features['function_word_count'] = len(function_words)
    features['sentence_count'] = len(sentences)
    features['paragraph_count'] = len(paragraphs)
    features['token_count'] = len(doc)
    features['min_sentence_length'] = min(sentence_lengths) if sentence_lengths else 0
    features['max_sentence_length'] = max(sentence_lengths) if sentence_lengths else 0
    features['avg_sentence_length'] = np.mean(sentence_lengths) if sentence_lengths else 0
    features['min_word_length'] = min(word_lengths) if word_lengths else 0
    features['max_word_length'] = max(word_lengths) if word_lengths else 0
    features['avg_word_length'] = np.mean(word_lengths) if word_lengths else 0
    features['median_word_length'] = np.median(word_lengths) if word_lengths else 0
    features['std_word_length'] = np.std(word_lengths) if word_lengths else 0
    features['capitalized_word_count'] = capitalized
    features['punctuation_count'] = punctuation
    features['comma_count'] = text.count(',')
    features['exclamation_count'] = text.count('!')
    features['question_count'] = text.count('?')
    features['period_count'] = text.count('.')
    features['type_token_ratio'] = n_unique / n_words if words else 0
    features['avg_words_per_paragraph'] = n_words / len(paragraphs) if paragraphs else 0
    features['avg_chars_per_word'] = sum(word_lengths) / n_words if words else 0
    features['lexical_density'] = len(content_words) / n_words if words else 0
    return features

# Apply the extractor to every row and expand the dicts into columns
descriptive_features = df['text'].apply(full_descriptive_features)
descriptive_df = pd.DataFrame(descriptive_features.tolist())
result = pd.concat([df, descriptive_df], axis=1)

# Save and show; the message is built from the same path that is written,
# so the reported file name cannot drift from the real one.
output_path = "Full_descriptive.csv"
result.to_csv(output_path, index=False)
print(f"Done! File saved as {output_path}")


✅ Done! File saved as Full_data_descriptive.csv


**2.Lexical Diversity**

In [None]:
# 📦 Install required packages
!pip install -q spacy wordfreq lexicalrichness textstat
#!python -m spacy download en_core_web_sm

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/97.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.8/97.8 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m92.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for lexicalrichness (setup.py) ... [?25l[?25hdone


In [None]:
import pandas as pd
import numpy as np
import spacy
from collections import Counter
from lexicalrichness import LexicalRichness
from wordfreq import word_frequency
import textstat

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load dataset
df = pd.read_csv("sm_data.csv")

# Function to extract 22 lexical diversity metrics
def extract_lexical_features(text):
    """Compute 22 lexical-diversity features for one text.

    The text is coerced to ``str`` and parsed with the module-level spaCy
    pipeline ``nlp``. Only alphabetic tokens count as words, and the POS
    tallies are taken over those same tokens so that
    ``content + function == total_words`` (function counts can never go
    negative and ratios stay within [0, 1]).

    Args:
        text: Raw text of one post (any type; converted with ``str``).

    Returns:
        dict with POS ratios (per word) and densities (per sentence),
        average word length / syllables / corpus frequency, MTLD and voc-D
        (stored under the historical keys ``"mltd"`` and ``"vocd_d"``) and
        hapax ratios. Values fall back to 0 when they cannot be computed.
    """
    text = str(text)
    doc = nlp(text)
    alpha_tokens = [token for token in doc if token.is_alpha]
    words = [token.text for token in alpha_tokens]
    lower_words = [w.lower() for w in words]
    pos_counts = Counter(token.pos_ for token in alpha_tokens)
    total_words = len(words)
    sentences = list(doc.sents)
    num_sentences = len(sentences) if sentences else 1

    def per_word(count):
        """Return ``count`` as a share of all words (0 for an empty text)."""
        return count / total_words if total_words else 0

    # POS tags
    noun_count = pos_counts["NOUN"]
    verb_count = pos_counts["VERB"]
    adj_count = pos_counts["ADJ"]
    adv_count = pos_counts["ADV"]
    content_count = noun_count + verb_count + adj_count + adv_count
    function_count = total_words - content_count

    # Ratios
    ttr = per_word(len(set(lower_words)))
    noun_ratio = per_word(noun_count)
    verb_ratio = per_word(verb_count)
    adj_ratio = per_word(adj_count)
    adv_ratio = per_word(adv_count)
    content_ratio = per_word(content_count)
    function_ratio = per_word(function_count)

    # Densities
    noun_density = noun_count / num_sentences
    verb_density = verb_count / num_sentences
    adj_density = adj_count / num_sentences
    adv_density = adv_count / num_sentences
    content_density = content_count / num_sentences
    function_density = function_count / num_sentences

    # Avg features
    avg_word_len = np.mean([len(w) for w in words]) if words else 0
    avg_syllables = np.mean([textstat.syllable_count(w) for w in words]) if words else 0
    word_freq = np.mean([word_frequency(w, 'en') for w in words]) if words else 0

    # Lexical richness. The LexicalRichness methods are ``mtld`` and ``vocd``;
    # the previous ``mltd`` / ``vocab_diversity`` names raised AttributeError,
    # which a bare ``except`` swallowed, so both columns were always 0.
    # Only the "text too short / empty" failures are caught here so that a
    # programming error can no longer hide.
    lex = LexicalRichness(' '.join(words))
    try:
        mltd = lex.mtld(threshold=0.72)
    except (ValueError, ZeroDivisionError):
        mltd = 0
    try:
        # NOTE(review): voc-D samples a fixed number of tokens and is expected
        # to reject texts shorter than that, so short posts presumably keep
        # the 0 fallback; confirm against the lexicalrichness docs.
        vocd = lex.vocd()
    except (ValueError, ZeroDivisionError):
        vocd = 0

    # Hapax legomena & dislegomena
    freqs = Counter(lower_words)
    hapax_legomena = sum(1 for f in freqs.values() if f == 1)
    hapax_dislegomena = sum(1 for f in freqs.values() if f == 2)
    hapax_legomena_ratio = per_word(hapax_legomena)
    hapax_dislegomena_ratio = per_word(hapax_dislegomena)

    return {
        "type_token_ratio": ttr,
        "noun_token_ratio": noun_ratio,
        "verb_token_ratio": verb_ratio,
        "adj_token_ratio": adj_ratio,
        "adv_token_ratio": adv_ratio,
        "content_word_token_ratio": content_ratio,
        "function_word_token_ratio": function_ratio,
        "noun_density": noun_density,
        "verb_density": verb_density,
        "adj_density": adj_density,
        "adv_density": adv_density,
        "content_word_density": content_density,
        "function_word_density": function_density,
        "avg_word_length": avg_word_len,
        "avg_word_syllables": avg_syllables,
        "word_frequency_mean": word_freq,
        "mltd": mltd,
        "vocd_d": vocd,
        "hapax_legomena_ratio": hapax_legomena_ratio,
        "hapax_dislegomena_ratio": hapax_dislegomena_ratio,
        "open_class_word_ratio": content_ratio,
        "closed_class_word_ratio": function_ratio
    }

# Apply to all rows
lexical_features = df['text'].apply(extract_lexical_features)
lexical_df = pd.DataFrame(lexical_features.tolist())

# Combine and export; report the path that is actually written.
result = pd.concat([df, lexical_df], axis=1)
output_path = "Full_lexical_diversity.csv"
result.to_csv(output_path, index=False)
print(f"Done! Lexical diversity features saved to '{output_path}'")


✅ Done! Lexical diversity features saved to 'sm_data_lexical_features.csv'


**3.Readability Features**

In [None]:
# Install required packages
!pip install -q textstat lexicalrichness


In [None]:


import pandas as pd
import numpy as np
import textstat
from lexicalrichness import LexicalRichness

# Load dataset
df = pd.read_csv("sm_data.csv")

# Readability extraction function (7 indicators)
def extract_readability_scores(text):
    """Compute readability indicators for one text.

    Args:
        text: Raw text of one post (any type; converted with ``str``).

    Returns:
        dict with Flesch-Kincaid grade, Gunning fog and SMOG (textstat),
        Brunet index and Honore statistic (lexicalrichness), an approximate
        Szigriszt-Pazos index, and ``readability_mean_score``: the mean of
        the strictly positive scores among those six (0 when none is
        positive).
    """
    text = str(text)
    lex = LexicalRichness(text)

    # From textstat
    flesch_grade = textstat.flesch_kincaid_grade(text)
    fog_index = textstat.gunning_fog(text)
    smog = textstat.smog_index(text)

    # From lexicalrichness; both measures are undefined for degenerate texts
    # (division by zero / log of zero), in which case 0 is reported.
    try:
        brunet_index = lex.brunet_index()
    except (ValueError, ZeroDivisionError):
        brunet_index = 0

    try:
        honore_stat = lex.honore_stat()
    except (ValueError, ZeroDivisionError):
        honore_stat = 0

    # Szigriszt-Pazos Perspicuity Index (approximation)
    # NOTE(review): the last term uses sentences per 100 words; the published
    # index is usually stated with words per sentence - confirm the intended form.
    total_words = textstat.lexicon_count(text, removepunct=True)
    syllables = textstat.syllable_count(text)
    sentences = textstat.sentence_count(text)
    if total_words:
        szigriszt = 206.835 - (62.3 * (syllables / total_words)) - (sentences / total_words * 100)
    else:
        szigriszt = 0

    # Readability mean: average only the positive scores. Guard on the
    # *filtered* list, otherwise np.mean([]) yields NaN (plus a warning)
    # whenever no score is positive.
    scores = [flesch_grade, fog_index, smog, brunet_index, honore_stat, szigriszt]
    positive_scores = [s for s in scores if s > 0]
    readability_mean = np.mean(positive_scores) if positive_scores else 0

    return {
        "flesch_kincaid_grade": flesch_grade,
        "gunning_fog_index": fog_index,
        "smog_index": smog,
        "brunet_index": brunet_index,
        "honore_statistic": honore_stat,
        "szigriszt_pazos_index": szigriszt,
        "readability_mean_score": readability_mean
    }

# Apply to all rows
readability_features = df['text'].apply(extract_readability_scores)
readability_df = pd.DataFrame(readability_features.tolist())

# Combine and save; report the path that is actually written.
result = pd.concat([df, readability_df], axis=1)
output_path = "Full_readability_features.csv"
result.to_csv(output_path, index=False)
print(f" Done! Saved to '{output_path}'")


✅ Done! Saved to 'sm_data_readability_features.csv'


**4.Syntactic Complexity**

In [None]:
# Install necessary packages
!pip install -q spacy
!python -m spacy download en_core_web_sm
import nltk
nltk.download('punkt')

In [None]:


# Import libraries
import pandas as pd
import numpy as np
import spacy
from nltk.metrics import edit_distance
from collections import Counter
from math import log2

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load your CSV
df = pd.read_csv("sm_data.csv")

# Helper function: Shannon entropy
def shannon_entropy(freqs):
    """Return the Shannon entropy (in bits) of a frequency mapping.

    ``freqs`` maps items to counts; entries with a non-positive count are
    ignored. An empty mapping yields 0.
    """
    total = sum(freqs.values())
    accumulated = 0
    for count in freqs.values():
        if count > 0:
            accumulated += (count / total) * log2(count / total)
    return -accumulated

# Helper function: Count clauses
def count_clauses(sent):
    """Count tokens in ``sent`` whose dependency label marks a clause.

    The labels treated as clause markers are ccomp, advcl, relcl, xcomp,
    acl, conj and parataxis.
    """
    clause_labels = ["ccomp", "advcl", "relcl", "xcomp", "acl", "conj", "parataxis"]
    return len([tok for tok in sent if tok.dep_ in clause_labels])

# Main function for syntactic complexity
def syntactic_complexity(text):
    """Compute syntactic-complexity features for one text.

    The text is coerced to ``str`` and parsed with the module-level spaCy
    pipeline ``nlp``.

    Returns:
        dict with the proportion of sentences having 1..6 and 7+ clauses,
        the mean clause count per sentence, the mean absolute token-to-head
        distance, the Shannon entropy of POS tags and of lemmas
        (punctuation excluded), and the mean character-level edit distance
        between the lemma strings of consecutive sentences.
    """
    doc = nlp(str(text))
    sentences = list(doc.sents)

    # Clause counts per sentence (+1 for the base clause), bucketed 1..7+.
    clause_counts = []
    clause_bins = [0, 0, 0, 0, 0, 0, 0]
    for sent in sentences:
        n_clauses = count_clauses(sent) + 1
        clause_counts.append(n_clauses)
        clause_bins[min(n_clauses, 7) - 1] += 1
    total_sents = len(sentences) or 1
    clause_props = [bucket / total_sents for bucket in clause_bins]

    # Entropy of POS tags and lemmas over non-punctuation tokens.
    non_punct = [tok for tok in doc if not tok.is_punct]
    pos_entropy = shannon_entropy(Counter([tok.pos_ for tok in non_punct]))
    lemma_entropy = shannon_entropy(Counter([tok.lemma_ for tok in non_punct]))

    head_distances = [abs(tok.head.i - tok.i) for tok in doc if tok.dep_ != "punct"]
    avg_tree_depth = np.mean(head_distances) if head_distances else 0
    avg_clause_length = np.mean(clause_counts) if clause_counts else 0

    # Edit distance between the lemma strings of neighbouring sentences.
    lemma_strings = [
        " ".join([tok.lemma_ for tok in sent if not tok.is_punct])
        for sent in sentences
    ]
    edit_dists = [
        edit_distance(first, second) if first and second else 0
        for first, second in zip(lemma_strings, lemma_strings[1:])
    ]
    avg_edit_distance = np.mean(edit_dists) if edit_dists else 0

    prop_keys = (
        "prop_clause_1",
        "prop_clause_2",
        "prop_clause_3",
        "prop_clause_4",
        "prop_clause_5",
        "prop_clause_6",
        "prop_clause_7_plus",
    )
    result = dict(zip(prop_keys, clause_props))
    result["avg_sentence_clause_length"] = avg_clause_length
    result["avg_tree_depth"] = avg_tree_depth
    result["pos_tag_entropy"] = pos_entropy
    result["lemma_entropy"] = lemma_entropy
    result["avg_edit_distance_between_sentences"] = avg_edit_distance
    return result

# Apply to all rows
syntax_features = df['text'].apply(syntactic_complexity)
syntax_df = pd.DataFrame(syntax_features.tolist())

# Merge and export; report the path that is actually written.
result = pd.concat([df, syntax_df], axis=1)
output_path = "Full_syntactic_features.csv"
result.to_csv(output_path, index=False)
print(f" Done! File saved as '{output_path}'")


✅ Done! File saved as 'sm_data_syntactic_features.csv'


**5.PSYCHOLINGUISTIC FEATURES**

In [None]:
# Install dependencies
!pip install -q spacy
!python -m spacy download en_core_web_sm


In [None]:
import pandas as pd
import requests

# Step 1: Download raw .dct file
url = "https://raw.githubusercontent.com/samzhang111/mrc-psycholinguistics/refs/heads/master/mrc2.dct"
r = requests.get(url, timeout=60)
r.raise_for_status()  # fail loudly instead of parsing an HTTP error page
lines = r.text.splitlines()

# Step 2: Parse fixed-width fields (word and familiarity)
# The previous offsets (word = first 20 chars, familiarity = chars 104-109)
# produced only 297 rows whose "words" were digit strings, i.e. they were
# reading the numeric header of each record.
# NOTE(review): the offsets below follow the MRC2 record layout as I recall it
# (51 fixed-width characters, FAM at [25:28], then "WORD|PHON|DPHON|STRESS");
# confirm against the MRC database documentation before relying on the output.
FAM_START, FAM_END = 25, 28
WORD_START = 51

familiarity_data = []
for line in lines:
    fam_field = line[FAM_START:FAM_END].strip()
    word = line[WORD_START:].split("|")[0].strip()
    fam = int(fam_field) if fam_field.isdigit() else None
    # A familiarity of 0 means "no rating" and is skipped, as before.
    if word and fam:
        familiarity_data.append((word.lower(), fam))

# Step 3: Save to CSV
# A word can appear in several records; average them so that "word" is a
# unique key when the norms are merged later.
familiarity_df = (
    pd.DataFrame(familiarity_data, columns=["word", "familiarity"])
    .groupby("word", as_index=False)["familiarity"]
    .mean()
)
familiarity_df.to_csv("familiarity.csv", index=False)

print(" familiarity.csv created with", len(familiarity_df), "words")
familiarity_df.head()


✅ familiarity.csv created with 297 words


Unnamed: 0,word,familiarity
0,14135001610906500018,20
1,14155000000000000000,2000
2,14136000000000000000,200
3,14136000000000000000,200
4,15146000000000000000,200


In [None]:
# 📦 Install required library
!pip install -q pandas requests


In [None]:


import pandas as pd
import requests

from io import StringIO

# Step 1: Fetch the valence/arousal ratings (X-ANEW / Warriner et al.)
url = "https://raw.githubusercontent.com/JULIELab/XANEW/master/Ratings_Warriner_et_al.csv"
r = requests.get(url)
if r.status_code != 200:
    raise Exception(f"Download failed with status code {r.status_code}")

# Step 2: Parse the response body as CSV
df = pd.read_csv(StringIO(r.text))

# The file must provide the word plus its mean valence and arousal ratings
needed = ["Word", "V.Mean.Sum", "A.Mean.Sum"]
missing = [col for col in needed if col not in df.columns]
if missing:
    raise Exception(f"Column {missing[0]} missing in downloaded file")

# Step 3: Keep just word, valence, arousal (lower-cased words)
val_df = df[needed].rename(
    columns={"Word": "word", "V.Mean.Sum": "valence", "A.Mean.Sum": "arousal"}
)
val_df["word"] = val_df["word"].str.lower()

# Step 4: Write the lookup table to disk
val_df.to_csv("valence.csv", index=False)
print(f" Created valence.csv with {len(val_df)} entries")

# Preview
val_df.head()


✅ Created valence.csv with 13915 entries


Unnamed: 0,word,valence,arousal
0,aardvark,6.26,2.41
1,abalone,5.3,2.65
2,abandon,2.84,3.73
3,abandonment,2.63,4.95
4,abbey,5.85,2.2


In [None]:
# Install dependencies
!pip install -q spacy
!python -m spacy download en_core_web_sm



In [None]:

# Imports
import pandas as pd
import numpy as np
import spacy
from functools import reduce

# Load spaCy language model
nlp = spacy.load("en_core_web_sm")

# Load tweet dataset
df = pd.read_csv("sm_data.csv")  # assumes column is named 'text'

# Load psycholinguistic norm datasets
# Each file is read from the working directory and its columns are renamed to a
# shared schema: a "Word" key plus one value column per norm.
# NOTE(review): familiarity.csv and valence.csv are written by earlier cells;
# concreteness.csv, imageability.csv and aoa.csv are not created anywhere in the
# visible part of this notebook - presumably they must be supplied manually.
concreteness_df = pd.read_csv("concreteness.csv").rename(columns={"word": "Word", "conc_mean": "Concreteness"})
imageability_df = pd.read_csv("imageability.csv").rename(columns={"word": "Word", "imgability": "Imageability"})
familiarity_df = pd.read_csv("familiarity.csv").rename(columns={"word": "Word", "familiarity": "Familiarity"})
aoa_df = pd.read_csv("aoa.csv").rename(columns={"word": "Word", "aoa": "AoA"})
valence_df = pd.read_csv("valence.csv").rename(columns={"word": "Word"})  # keeps 'valence' and 'arousal' as written

# Ensure all 'Word' columns are lowercase strings
# (astype(str) also turns missing words into the literal string "nan")
for df_norm in [concreteness_df, imageability_df, familiarity_df, aoa_df, valence_df]:
    df_norm["Word"] = df_norm["Word"].astype(str).str.lower()

# Merge all norms into one DataFrame
# Successive outer joins on "Word": a word present in any source gets a row, and
# norms it lacks are NaN. A word repeated within a source yields repeated rows.
norms = reduce(lambda left, right: pd.merge(left, right, on="Word", how="outer"),
               [concreteness_df, imageability_df, familiarity_df, aoa_df, valence_df])

# Helper to compute feature statistics
def psych_stats(values):
    """Summarise a list of numbers as mean, min, max, std and median.

    Returns a dict with those five keys; every statistic is 0 when
    ``values`` is empty.
    """
    if not values:
        return {"mean": 0, "min": 0, "max": 0, "std": 0, "median": 0}

    summary = {}
    summary["mean"] = np.mean(values)
    summary["min"] = np.min(values)
    summary["max"] = np.max(values)
    summary["std"] = np.std(values)
    summary["median"] = np.median(values)
    return summary

# Main extractor for a single tweet
def extract_psycholinguistics(text):
    """Summarise psycholinguistic norms of the words found in one text.

    The text is coerced to ``str`` and tokenised with ``nlp``; the rows of
    the module-level ``norms`` table whose ``Word`` matches any lower-cased
    alphabetic token are selected, and each norm column is summarised with
    ``psych_stats`` after dropping missing values.

    Returns:
        dict keyed ``"<norm>_<stat>"`` (norm name lower-cased).
    """
    tokens_lower = [tok.text.lower() for tok in nlp(str(text)) if tok.is_alpha]
    matched = norms.loc[norms["Word"].isin(tokens_lower)]

    result = {}
    for column in ("Concreteness", "Imageability", "Familiarity", "AoA", "valence", "arousal"):
        observed = matched[column].dropna().tolist()
        prefix = column.lower()
        for stat_name, stat_value in psych_stats(observed).items():
            result[f"{prefix}_{stat_name}"] = stat_value
    return result

# Apply feature extractor to each tweet
psych_features = df["text"].apply(extract_psycholinguistics)
psych_df = pd.DataFrame(psych_features.tolist())

# Combine and export enriched dataset
result = pd.concat([df, psych_df], axis=1)
output_path = "Full_psycholinguistic_features.csv"
result.to_csv(output_path, index=False)

# Report the real feature count and the path that was actually written.
print(f"✅ Done! {psych_df.shape[1]} psycholinguistic features saved to '{output_path}'")


✅ Done! 30 psycholinguistic features saved to 'sm_data_psycholinguistic_features.csv'


**6.WORD_INFORMATION_FEATURES**

In [None]:
import pandas as pd
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load dataset
df = pd.read_csv("sm_data.csv")  # assumes column name is 'text'

# Feature extractor
def extract_word_info(text):
    """Compute word-level counts and ratios for one text.

    The text is coerced to ``str`` and parsed with the module-level spaCy
    pipeline ``nlp``. Word statistics use alphabetic tokens only;
    punctuation statistics and sentence length use all tokens.

    Returns:
        dict with token/character counts, average word length, stopword and
        punctuation ratios, average sentence length (tokens per sentence),
        lexical density, and for each POS bucket a ``num_<tag>`` count and a
        ``<tag>_ratio`` share of alphabetic tokens. All ratios are 0 for an
        empty text.
    """
    doc = nlp(str(text))
    tokens = [token for token in doc if token.is_alpha]
    total_tokens = len(tokens)
    total_chars = sum(len(token.text) for token in tokens)
    stopword_count = sum(1 for token in tokens if token.is_stop)
    puncts = sum(1 for token in doc if token.is_punct)
    sentences = list(doc.sents)  # materialise once instead of three times

    pos_counts = {
        "noun": 0, "verb": 0, "adj": 0, "adv": 0, "pron": 0,
        "propn": 0, "intj": 0, "conj": 0, "part": 0, "sym": 0, "num": 0,
        "content_words": 0, "function_words": 0
    }
    # spaCy tags conjunctions as CCONJ / SCONJ, never CONJ, so without this
    # mapping the "conj" bucket would stay at 0 for every text.
    pos_aliases = {"cconj": "conj", "sconj": "conj"}
    content_tags = ("noun", "verb", "adj", "adv")

    for token in tokens:
        pos = token.pos_.lower()
        bucket = pos_aliases.get(pos, pos)
        if bucket in pos_counts:
            pos_counts[bucket] += 1
        if pos in content_tags:
            pos_counts["content_words"] += 1
        else:
            pos_counts["function_words"] += 1

    def per_token(count):
        """Return ``count`` as a share of alphabetic tokens (0 if none)."""
        return count / total_tokens if total_tokens > 0 else 0

    result = {
        "token_count": total_tokens,
        "char_count": total_chars,
        "avg_word_length": per_token(total_chars),
        "stopword_ratio": per_token(stopword_count),
        "punctuation_count": puncts,
        "punctuation_ratio": puncts / len(doc) if len(doc) > 0 else 0,
        "avg_sentence_length": sum(len(sent) for sent in sentences) / len(sentences) if sentences else 0,
        "lexical_density": per_token(pos_counts["content_words"]),
    }

    for tag, count in pos_counts.items():
        result[f"num_{tag}"] = count
        result[f"{tag}_ratio"] = per_token(count)

    return result

# Apply extraction
word_info_features = df["text"].apply(extract_word_info)
word_info_df = pd.DataFrame(word_info_features.tolist())

# Save result
result = pd.concat([df, word_info_df], axis=1)
output_path = "Full_word_information_features.csv"
result.to_csv(output_path, index=False)

# Report the real number of feature columns (the extractor produces more than
# the 24 previously hard-coded here) and the path that was actually written.
print(f"✅ Done! All {word_info_df.shape[1]} word-level features saved to '{output_path}'")


✅ Done! All 24 word-level features saved to 'sm_data_word_information_features.csv'


**7.Referential cohesion features**

In [None]:
import pandas as pd
import spacy
from nltk.stem import PorterStemmer
from collections import defaultdict


# Load resources
nlp = spacy.load("en_core_web_sm")
stemmer = PorterStemmer()

# Load dataset
df = pd.read_csv("sm_data.csv")  # must contain 'text' column

# Helper: extract nouns, arguments, stems, content words, anaphors
def extract_features_from_sent(sent):
    """Collect the word sets of one sentence used for overlap scoring.

    Only alphabetic tokens are considered. Returns a 6-tuple of sets, in
    this order: noun lemmas, argument lemmas (nsubj/dobj/pobj), stems of
    all tokens, content-word lemmas, pronoun lemmas, all lemmas. Lemmas and
    stems are lower-cased.
    """
    nouns = set()
    arguments = set()
    stems = set()
    content_words = set()
    anaphors = set()
    all_words = set()

    for tok in sent:
        if not tok.is_alpha:
            continue
        lemma = tok.lemma_.lower()
        all_words.add(lemma)
        stems.add(stemmer.stem(tok.text.lower()))
        if tok.pos_ == "NOUN":
            nouns.add(lemma)
        if tok.dep_ in ["nsubj", "dobj", "pobj"]:
            arguments.add(lemma)
        if tok.pos_ in ["NOUN", "VERB", "ADJ", "ADV"]:
            content_words.add(lemma)
        if tok.pos_ == "PRON":
            anaphors.add(lemma)

    return nouns, arguments, stems, content_words, anaphors, all_words

# Overlap calculator
def compute_overlap(curr, prev):
    """Return the Jaccard overlap of two sets (0 when both are empty)."""
    union = curr | prev
    if not union:
        return 0
    return len(curr & prev) / len(union)

# Main extractor for referential cohesion
def extract_referential_cohesion(text):
    """Compute local and global referential-overlap features for one text.

    For every sentence after the first, and for each of the six word sets
    from ``extract_features_from_sent``, the overlap is measured against the
    previous sentence (local) and against the union of all earlier
    sentences (global). The per-sentence overlaps are then averaged.

    Returns:
        dict with ``<name>_overlap_local`` and ``<name>_overlap_global`` for
        noun, arg, stem, content, anaphor and all; 0 when the text has fewer
        than two sentences.
    """
    names = ["noun", "arg", "stem", "content", "anaphor", "all"]
    sents = list(nlp(str(text)).sents)

    # One tuple of six sets per sentence
    fsets = [extract_features_from_sent(sent) for sent in sents]

    local_scores = {name: [] for name in names}
    global_scores = {name: [] for name in names}

    for i in range(1, len(fsets)):
        for idx, name in enumerate(names):
            curr = fsets[i][idx]

            # local overlap: against the immediately preceding sentence
            local_scores[name].append(compute_overlap(curr, fsets[i - 1][idx]))

            # global overlap: against everything seen in earlier sentences
            seen_before = set()
            for earlier in fsets[:i]:
                seen_before |= earlier[idx]
            global_scores[name].append(compute_overlap(curr, seen_before))

    # Average the per-sentence overlaps
    result = {}
    for name in names:
        local_vals = local_scores[name]
        global_vals = global_scores[name]
        result[f"{name}_overlap_local"] = sum(local_vals) / len(local_vals) if local_vals else 0
        result[f"{name}_overlap_global"] = sum(global_vals) / len(global_vals) if global_vals else 0

    return result

# Apply to dataset
cohesion_features = df["text"].apply(extract_referential_cohesion)
cohesion_df = pd.DataFrame(cohesion_features.tolist())

# Combine and save
final_df = pd.concat([df, cohesion_df], axis=1)
output_path = "Full_referential_cohesion.csv"
final_df.to_csv(output_path, index=False)

# Report the real feature count and the path that was actually written.
print(f"✅ Referential cohesion features ({cohesion_df.shape[1]}) saved to '{output_path}'")


✅ Referential cohesion features (12) saved to 'sm_data_referential_cohesion.csv'


**8.Textual simplicity features**

In [None]:

import pandas as pd
import spacy

# Load model
nlp = spacy.load("en_core_web_sm")

# Load dataset
df = pd.read_csv("sm_data.csv")  # must contain 'text' column

# Simplicity extractor
def extract_textual_simplicity(text):
    """Return the share of short, medium, long and very long sentences.

    Sentence length is the number of alphabetic or digit tokens. Buckets:
    short <= 10, medium 11-20, long 21-30, very long > 30. All ratios are 0
    when the text has no sentences.
    """
    sentences = list(nlp(str(text)).sents)
    total = len(sentences)
    bucket_counts = {"short": 0, "medium": 0, "long": 0, "very_long": 0}

    for sent in sentences:
        length = sum(1 for tok in sent if tok.is_alpha or tok.is_digit)
        if length <= 10:
            bucket = "short"
        elif length <= 20:
            bucket = "medium"
        elif length <= 30:
            bucket = "long"
        else:
            bucket = "very_long"
        bucket_counts[bucket] += 1

    ratios = {}
    ratios["short_sent_ratio"] = bucket_counts["short"] / total if total else 0
    ratios["medium_sent_ratio"] = bucket_counts["medium"] / total if total else 0
    ratios["long_sent_ratio"] = bucket_counts["long"] / total if total else 0
    ratios["very_long_sent_ratio"] = bucket_counts["very_long"] / total if total else 0
    return ratios

# Apply to dataset
simplicity_features = df["text"].apply(extract_textual_simplicity)
simplicity_df = pd.DataFrame(simplicity_features.tolist())

# Combine and save
final_df = pd.concat([df, simplicity_df], axis=1)
output_path = "Full_textual_simplicity.csv"
final_df.to_csv(output_path, index=False)

# Report the real feature count and the path that was actually written.
print(f"✅ Textual simplicity features ({simplicity_df.shape[1]}) saved to '{output_path}'")


✅ Textual simplicity features (4) saved to 'sm_data_textual_simplicity.csv'


**9.Semantic cohesion features**

In [None]:
# Install required packages
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m67.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:

import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load medium spaCy model (has word vectors)
nlp = spacy.load("en_core_web_md")

# Load dataset
df = pd.read_csv("sm_data.csv")  # ensure 'text' column exists

# Utility: average similarity between consecutive items
def pairwise_similarity(vectors):
    """Return ``(mean, std)`` of the cosine similarity of consecutive vectors.

    Each vector is compared with its successor; fewer than two vectors
    yields ``(0.0, 0.0)``.
    """
    if len(vectors) < 2:
        return 0.0, 0.0

    sims = []
    for position in range(1, len(vectors)):
        previous, current = vectors[position - 1], vectors[position]
        sims.append(cosine_similarity([previous], [current])[0][0])
    return np.mean(sims), np.std(sims)

# Semantic Cohesion extractor
def extract_semantic_cohesion(text):
    """Compute vector-similarity cohesion features for one text.

    Sentences come from spaCy's segmentation (only those longer than 3 tokens
    are kept); paragraphs are newline-separated chunks with more than 3
    whitespace-separated words. Each unit is re-parsed on its own and
    represented by its spaCy document vector.

    Args:
        text: The raw text. Any value is accepted; it is converted with
            ``str()`` first, so missing values (NaN) no longer crash the
            paragraph split.

    Returns:
        dict with eight entries: mean/std of adjacent-sentence similarity,
        mean/std over all sentence pairs, mean/std of adjacent-paragraph
        similarity, and first-vs-last similarity for paragraphs and for
        sentences. Every entry falls back to 0 when fewer than two units
        are available.
    """
    # Coerce once so the spaCy pass and the newline split see the same string.
    # Previously only nlp() received str(text); text.split() raised on NaN rows.
    text = str(text)
    doc = nlp(text)

    # Sentences: parse each kept sentence once and reuse the result for both
    # the has_vector check and the vector itself.
    sents = [sent.text.strip() for sent in doc.sents if len(sent) > 3]
    sent_docs = [nlp(sent) for sent in sents]
    sent_vecs = [sent_doc.vector for sent_doc in sent_docs if sent_doc.has_vector]

    # All sentence pair similarity (every unordered pair i < j)
    all_sims = []
    for i in range(len(sent_vecs)):
        for j in range(i + 1, len(sent_vecs)):
            sim = cosine_similarity([sent_vecs[i]], [sent_vecs[j]])[0][0]
            all_sims.append(sim)

    # Paragraphs: same single-parse treatment as sentences
    paras = [p.strip() for p in text.split("\n") if len(p.strip().split()) > 3]
    para_docs = [nlp(p) for p in paras]
    para_vecs = [para_doc.vector for para_doc in para_docs if para_doc.has_vector]

    # Pairwise sentence and paragraph similarities
    sent_adj_mean, sent_adj_std = pairwise_similarity(sent_vecs)
    para_adj_mean, para_adj_std = pairwise_similarity(para_vecs)

    # NOTE: keys keep their "lsa_" prefix for compatibility with existing
    # output files, although the vectors come from spaCy rather than an LSA model.
    result = {
        "lsa_sent_adj_mean": sent_adj_mean,
        "lsa_sent_adj_std": sent_adj_std,
        "lsa_sent_all_mean": np.mean(all_sims) if all_sims else 0,
        "lsa_sent_all_std": np.std(all_sims) if all_sims else 0,
        "lsa_para_adj_mean": para_adj_mean,
        "lsa_para_adj_std": para_adj_std,
        "lsa_first_last_para_similarity": cosine_similarity([para_vecs[0]], [para_vecs[-1]])[0][0] if len(para_vecs) > 1 else 0,
        "lsa_first_last_sent_similarity": cosine_similarity([sent_vecs[0]], [sent_vecs[-1]])[0][0] if len(sent_vecs) > 1 else 0,
    }

    return result

# Apply the extractor row by row, then expand the per-row dicts into columns
semantic_features = df["text"].apply(extract_semantic_cohesion)
semantic_df = pd.DataFrame(semantic_features.tolist())

# Combine and export.
# The path is held in one variable so the confirmation message always names
# the file that was actually written.
semantic_csv_path = "Full_semantic_cohesion.csv"
final_df = pd.concat([df, semantic_df], axis=1)
final_df.to_csv(semantic_csv_path, index=False)
print(f"Semantic cohesion features (8) saved to '{semantic_csv_path}'")


✅ Semantic cohesion features (8) saved to 'sm_data_semantic_cohesion.csv'


**10. Word frequency features**

In [None]:
import pandas as pd
import numpy as np
import spacy
from wordfreq import zipf_frequency


# Small English pipeline (tokens, stop-word flags and POS tags are all this cell uses)
nlp = spacy.load(name="en_core_web_sm")
# Source table; the extractor reads its 'text' column
df = pd.read_csv(filepath_or_buffer="sm_data.csv")

def extract_word_frequency(text):
    """Summarise how common the words of one text are, using Zipf frequencies.

    Only alphabetic, non-stop-word tokens are considered. Each is lowercased
    and looked up with ``zipf_frequency(word, 'en')``; a value below 2.0 marks
    the word as rare. Statistics are reported overall, for content words
    (NOUN, VERB, ADJ, ADV together) and for each of those four tags.

    Returns:
        dict of 15 entries: means, rare-word counts, token counts and the
        overall rare-word ratio. Means and the ratio are 0 when there is
        nothing to average.
    """
    content_tags = ('NOUN', 'VERB', 'ADJ', 'ADV')

    kept = [tok for tok in nlp(str(text)) if tok.is_alpha and not tok.is_stop]
    zipfs = [zipf_frequency(tok.text.lower(), 'en') for tok in kept]

    # Bucket the frequencies of content words by their POS tag
    by_pos = {tag: [] for tag in content_tags}
    for tok, zipf in zip(kept, zipfs):
        if tok.pos_ in by_pos:
            by_pos[tok.pos_].append(zipf)

    def rare_in(values):
        # "Rare" means a Zipf frequency strictly below 2.0
        return sum(1 for v in values if v < 2.0)

    def mean_or_zero(values):
        return np.mean(values) if values else 0

    rare_by_pos = {tag: rare_in(by_pos[tag]) for tag in content_tags}
    content_zipfs = [zipf for tag in content_tags for zipf in by_pos[tag]]
    rare_overall = rare_in(zipfs)

    return {
        "mean_zipf": mean_or_zero(zipfs),
        "rare_word_count": rare_overall,
        "rare_noun_count": rare_by_pos['NOUN'],
        "rare_verb_count": rare_by_pos['VERB'],
        "rare_adj_count": rare_by_pos['ADJ'],
        "rare_adv_count": rare_by_pos['ADV'],
        "rare_content_word_count": sum(rare_by_pos.values()),
        "content_zipf_mean": mean_or_zero(content_zipfs),
        "noun_zipf_mean": mean_or_zero(by_pos['NOUN']),
        "verb_zipf_mean": mean_or_zero(by_pos['VERB']),
        "adj_zipf_mean": mean_or_zero(by_pos['ADJ']),
        "adv_zipf_mean": mean_or_zero(by_pos['ADV']),
        "word_count": len(kept),
        "content_word_count": len(content_zipfs),
        "rare_ratio": rare_overall / len(kept) if kept else 0
    }

# Apply the extractor row by row, then expand the per-row dicts into columns
word_freq_features = df["text"].apply(extract_word_frequency)
word_freq_df = pd.DataFrame(word_freq_features.tolist())

# Combine and save.
# The path is held in one variable so the confirmation message always names
# the file that was actually written.
word_freq_csv_path = "Full_word_frequency_features.csv"
result = pd.concat([df, word_freq_df], axis=1)
result.to_csv(word_freq_csv_path, index=False)

# The feature count is taken from the frame: the extractor returns 15 columns,
# not the 16 the old hard-coded message claimed.
print(f"✅ Word frequency features ({word_freq_df.shape[1]}) saved to '{word_freq_csv_path}'")


✅ Word frequency features (16) saved to 'sm_data_word_frequency_features.csv'


**11. Syntactic pattern density**

In [None]:
import pandas as pd
import spacy

# Small English pipeline plus the source table (its 'text' column is analysed)
nlp = spacy.load(name="en_core_web_sm")
df = pd.read_csv(filepath_or_buffer="sm_data.csv")

# Subordinating conjunctions matched by lowercase token text.
# The list is deliberately short and not exhaustive.
sub_conjs = {
    "although",
    "because",
    "if",
    "since",
    "though",
    "unless",
    "when",
    "whereas",
    "while",
}

# Syntactic feature extractor
def extract_syntactic_density(text):
    """Count syntactic constructions in one text and express them per sentence.

    Counts come from spaCy's POS tags (VERB, ADP), dependency labels
    (cc, relcl, aux, acl, advcl, appos, neg), noun chunks, and two word lists
    (negation words and the module-level ``sub_conjs``).

    Returns:
        dict of 14 entries: the sentence count, the non-whitespace token
        count, and twelve per-sentence ratios. All ratios are 0 when the
        text has no sentences.
    """
    doc = nlp(str(text))
    num_sents = len(list(doc.sents))

    negation_words = {"not", "n't", "never", "no"}
    pos_tally = {"VERB": 0, "ADP": 0}
    dep_tally = {"cc": 0, "relcl": 0, "aux": 0, "acl": 0, "advcl": 0, "appos": 0}
    num_tokens = 0
    negations = 0
    subord_conj = 0

    # One pass over the tokens updates every counter
    for tok in doc:
        if not tok.is_space:
            num_tokens += 1
        if tok.pos_ in pos_tally:
            pos_tally[tok.pos_] += 1
        if tok.dep_ in dep_tally:
            dep_tally[tok.dep_] += 1
        # A token counts as a negation by word form or by dependency label
        if tok.lower_ in negation_words or tok.dep_ == "neg":
            negations += 1
        if tok.text.lower() in sub_conjs:
            subord_conj += 1

    noun_phrases = sum(1 for _ in doc.noun_chunks)

    def per_sentence(count):
        return count / num_sents if num_sents else 0

    return {
        "sentence_count": num_sents,
        "token_count": num_tokens,
        "avg_sentence_length": per_sentence(num_tokens),
        "noun_phrase_density": per_sentence(noun_phrases),
        # "verb phrases" are approximated by the number of VERB-tagged tokens
        "verb_phrase_density": per_sentence(pos_tally["VERB"]),
        "negation_density": per_sentence(negations),
        "coord_conj_density": per_sentence(dep_tally["cc"]),
        "subord_conj_density": per_sentence(subord_conj),
        "preposition_density": per_sentence(pos_tally["ADP"]),
        "relative_clause_density": per_sentence(dep_tally["relcl"]),
        "auxiliary_density": per_sentence(dep_tally["aux"]),
        "adjective_clause_density": per_sentence(dep_tally["acl"]),
        "adverbial_clause_density": per_sentence(dep_tally["advcl"]),
        "appositive_density": per_sentence(dep_tally["appos"])
    }

# Apply the extractor row by row, then expand the per-row dicts into columns
syntactic_features = df["text"].apply(extract_syntactic_density)
syntactic_df = pd.DataFrame(syntactic_features.tolist())

# Combine and export.
# The path is held in one variable so the confirmation message always names
# the file that was actually written.
syntactic_csv_path = "Full_syntactic_density_features.csv"
final_df = pd.concat([df, syntactic_df], axis=1)
final_df.to_csv(syntactic_csv_path, index=False)

print(f"✅ Syntactic Pattern Density (14 features) saved to '{syntactic_csv_path}'")


✅ Syntactic Pattern Density (14 features) saved to 'sm_data_syntactic_density_features.csv'


**12. Connective features**

In [None]:
import pandas as pd
import spacy

# Small English pipeline, used here for tokenisation
nlp = spacy.load(name="en_core_web_sm")

# Source table; the extractor reads its 'text' column
df = pd.read_csv(filepath_or_buffer="sm_data.csv")

# Connective word lists, one set per category.
# A word may sit in more than one category ("then" is both logical and temporal).
connectives = {
    "causal": {
        "because", "since", "as", "so",
    },
    "logical": {
        "if", "then", "therefore", "thus",
    },
    "adversative": {
        "but", "however", "although", "though", "nevertheless", "nonetheless",
    },
    "temporal": {
        "before", "after", "when", "while", "then",
    },
    "additive": {
        "and", "also", "moreover", "in addition", "furthermore",
    },
}

# Extract function
def extract_connectives(text):
    """Count connectives of each category in one text.

    Single-word connectives are matched against lowercased tokens.
    Multi-word connectives (such as "in addition") are matched against runs
    of consecutive tokens; the previous token-by-token lookup could never
    match them, so they were silently never counted.

    A word listed in several categories is counted once in each of them, and
    therefore more than once in ``all_connectives``.

    Args:
        text: The raw text; converted with ``str()`` before parsing.

    Returns:
        dict with one count per category of the module-level ``connectives``
        mapping plus ``all_connectives``, the sum of those counts.
    """
    doc = nlp(str(text))
    # Whitespace tokens are dropped so a double space cannot split a phrase
    lowered = [token.text.lower() for token in doc if not token.is_space]

    counts = {k: 0 for k in connectives}

    for conn_type, conn_words in connectives.items():
        for conn in conn_words:
            parts = conn.split()
            width = len(parts)
            if width == 1:
                counts[conn_type] += lowered.count(conn)
            else:
                # Slide a window of the phrase's length over the token list
                counts[conn_type] += sum(
                    1
                    for start in range(len(lowered) - width + 1)
                    if lowered[start:start + width] == parts
                )

    total = sum(counts.values())
    result = {
        "causal_connectives": counts["causal"],
        "logical_connectives": counts["logical"],
        "adversative_connectives": counts["adversative"],
        "temporal_connectives": counts["temporal"],
        "additive_connectives": counts["additive"],
        "all_connectives": total
    }

    return result

# Apply the extractor row by row, then expand the per-row dicts into columns
connective_features = df["text"].apply(extract_connectives)
connective_df = pd.DataFrame(connective_features.tolist())

# Combine and save.
# The path is held in one variable so the confirmation message always names
# the file that was actually written.
connective_csv_path = "Full_connective_features.csv"
final_df = pd.concat([df, connective_df], axis=1)
final_df.to_csv(connective_csv_path, index=False)

print(f"Connective features (6) saved to '{connective_csv_path}'")


✅ Connective features (6) saved to 'sm_data_connective_features.csv'


**EMNLP-STYLE COMPLEXITY FEATURES**

In [None]:
# 📦 Install required packages
!pip install -q spacy textstat syllapy scikit-learn
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import pandas as pd
import numpy as np
import spacy
import syllapy
from textstat import dale_chall_readability_score
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from math import log2

# Small English pipeline and the source table (a 'text' column is required)
nlp = spacy.load(name="en_core_web_sm")
df = pd.read_csv(filepath_or_buffer="sm_data.csv")

# Fit TF-IDF once on the whole corpus (missing texts become empty strings),
# then keep a term -> IDF weight lookup for the extractor below.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
_tfidf_corpus = df['text'].fillna("")
tfidf_matrix = tfidf_vectorizer.fit_transform(_tfidf_corpus)
idf_scores = {
    term: weight
    for term, weight in zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_)
}

# Main extractor
def extract_emnlp_complexity(text):
    """Compute lexical and syntactic complexity features for one text.

    Lexical features are computed over lowercased alphabetic tokens; syntactic
    features are computed per spaCy sentence from the dependency parse.

    Args:
        text: The raw text. It is converted with ``str()`` once, up front, so
            that every consumer (spaCy and textstat) receives a string; the
            readability call previously received the raw value and failed on
            missing (NaN) rows.

    Returns:
        dict of 12 features:
        avg_word_length, polysyllable_ratio (share of words with 3 or more
        syllables), dale_score, type_token_ratio, connective_ratio,
        unique_entities, avg_tfidf_score (mean corpus IDF weight per word,
        0 for out-of-vocabulary words), log_likelihood_score (sum of
        f * log2(f / word_count) over word types), IDT_incomplete_deps,
        DLT_avg_dependency_distance, LE_left_embedding, NND_nested_noun_std.
    """
    text = str(text)
    doc = nlp(text)
    words = [token for token in doc if token.is_alpha]
    tokens = [token.text.lower() for token in words]
    word_count = len(words)

    # === Lexical Complexity ===
    avg_word_len = np.mean([len(w) for w in tokens]) if tokens else 0
    poly_count = sum(1 for w in tokens if syllapy.count(w) >= 3)
    poly_ratio = poly_count / word_count if word_count else 0

    ttr = len(set(tokens)) / word_count if word_count else 0
    # Local word list, independent of any module-level connective table
    connective_words = {"because", "since", "so", "if", "then", "but", "although", "and", "however"}
    avg_connectives = sum(1 for w in tokens if w in connective_words) / word_count if word_count else 0

    unique_entities = len(set(ent.text.lower() for ent in doc.ents))  # Named entities

    # Dale-Chall readability of the whole text
    dale_score = dale_chall_readability_score(text)

    # Mean IDF weight per word (term frequency inside this text is not applied)
    tfidf_score = sum(idf_scores.get(w, 0) for w in tokens) / word_count if word_count else 0

    # Log-likelihood ratio (placeholder with frequency)
    word_freq = Counter(tokens)
    lls = 0
    for w, f in word_freq.items():
        p = f / word_count
        lls += f * log2(p) if p > 0 else 0

    # === Syntactic Complexity ===
    dep_lengths = []
    idt = 0
    le = 0
    nested_noun_dists = []

    for sent in doc.sents:
        heads = [token.head.i for token in sent if token.dep_ != "ROOT"]
        deps = [token.i for token in sent if token.dep_ != "ROOT"]

        # Linear distance between each non-root token and its head
        dep_lengths.extend([abs(d - h) for d, h in zip(deps, heads)])

        # heads and deps are built with the same filter, so they always have
        # equal length; the only case that can fire is a sentence with no
        # non-root tokens.
        if not heads:
            idt += 1  # Incomplete dependency tree

        # Counts tokens whose head precedes them in the text.
        # NOTE(review): confirm this is the intended "left-embeddedness" measure.
        for token in sent:
            if token.head.i < token.i:
                le += 1

        # Spread of the gaps between NOUN-tagged tokens in this sentence
        np_indices = [token.i for token in sent if token.pos_ == "NOUN"]
        if len(np_indices) > 1:
            nested_noun_dists.append(np.std(np.diff(np_indices)))

    return {
        "avg_word_length": avg_word_len,
        "polysyllable_ratio": poly_ratio,
        "dale_score": dale_score,
        "type_token_ratio": ttr,
        "connective_ratio": avg_connectives,
        "unique_entities": unique_entities,
        "avg_tfidf_score": tfidf_score,
        "log_likelihood_score": lls,
        "IDT_incomplete_deps": idt,
        "DLT_avg_dependency_distance": np.mean(dep_lengths) if dep_lengths else 0,
        "LE_left_embedding": le,
        # Mean over sentences of the per-sentence standard deviation above
        "NND_nested_noun_std": np.mean(nested_noun_dists) if nested_noun_dists else 0
    }

# 🔁 Apply the extractor row by row, then expand the per-row dicts into columns
feature_rows = df["text"].apply(extract_emnlp_complexity)
feature_df = pd.DataFrame(feature_rows.tolist())

# ➕ Save.
# The path is held in one variable so the confirmation message always names
# the file that was actually written.
emnlp_csv_path = "Full_emnlp_complexity_features.csv"
final_df = pd.concat([df, feature_df], axis=1)
final_df.to_csv(emnlp_csv_path, index=False)

print(f"EMNLP-style complexity features saved to '{emnlp_csv_path}'")


✅ EMNLP-style complexity features saved to 'sm_data_emnlp_complexity_features.csv'


In [None]:
# Install dependencies
!pip install -q spacy fasttext
#!python -m spacy download en_core_web_sm


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone


In [None]:
!pip install fasttext




In [None]:
# Install required packages
!pip install -q fasttext
!pip install -q spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# ⬇️ MUST RUN FIRST: Fix fasttext + numpy compatibility
!pip install -q numpy==1.24.4 fasttext spacy
!python -m spacy download en_core_web_sm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m77.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m108.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m112.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.4 which is incompatible.
blosc2 3.5.1 requires numpy>=1.26, but you have numpy 1.24.4 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.4 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.4 which is incompatible.
xarray-einstats 0.9.1 requires numpy>=1.25, but you have numpy 1.24.4 which is incompatible.
pymc 5.23.0 requi

In [None]:
# Imports
import pandas as pd
import numpy as np
import spacy
import fasttext
import urllib.request
import os
from collections import Counter

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Load your tweet dataset (must contain a column named 'text')
df = pd.read_csv("sm_data.csv")
df.columns = df.columns.str.strip().str.lower()  # Normalize column names
assert 'text' in df.columns, "The dataset must contain a 'text' column."

# Download fastText language identification model.
# The download goes to a temporary ".part" file and is renamed only after it
# completes. Otherwise an interrupted download would leave a truncated
# lid.176.bin that passes the existence check on the next run and then fails
# to load.
ft_model_path = "lid.176.bin"
if not os.path.exists(ft_model_path):
    ft_partial_path = ft_model_path + ".part"
    try:
        urllib.request.urlretrieve("https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin", ft_partial_path)
        os.replace(ft_partial_path, ft_model_path)
    finally:
        # Only present here if the download or rename failed part-way
        if os.path.exists(ft_partial_path):
            os.remove(ft_partial_path)
ft_model = fasttext.load_model(ft_model_path)

# Detect language for each token using fastText
def detect_langs(tokens):
    """Return the top predicted language code for each non-blank token.

    Tokens that are empty or whitespace-only are skipped, so the result can
    be shorter than the input. The "__label__" prefix of the fastText label
    is removed.
    """
    non_blank = (tok for tok in tokens if tok.strip())
    return [
        ft_model.predict(tok)[0][0].replace("__label__", "")
        for tok in non_blank
    ]

# Compute Code-Mixing Metrics
def code_mixing_metrics(text):
    """Compute five code-mixing statistics from per-token language labels.

    Alphabetic tokens are labelled with ``detect_langs``; the metrics are:

    - CMI: share of tokens not in the most frequent language.
    - M_index: 1 minus the sum of squared language proportions.
    - I_index: share of adjacent token pairs whose languages differ.
    - Burstiness: std / mean of the lengths of same-language runs.
    - Memory: share of adjacent token pairs whose languages match.

    NOTE(review): these look like simplified variants of the published
    metrics with the same names - confirm they are the intended definitions.

    Returns:
        dict with the five values rounded to 4 decimals; all 0.0 when the
        text has no alphabetic tokens.
    """
    alpha_tokens = [tok.text for tok in nlp(str(text)) if tok.is_alpha]
    if not alpha_tokens:
        return dict.fromkeys(['CMI', 'M_index', 'I_index', 'Burstiness', 'Memory'], 0.0)

    langs = detect_langs(alpha_tokens)
    per_lang = Counter(langs)
    total = sum(per_lang.values())

    dominant = max(per_lang.values())
    cmi = (total - dominant) / total if total else 0

    if total > 1:
        m_index = 1 - sum((count / total) ** 2 for count in per_lang.values())
    else:
        m_index = 0

    # Every (previous, current) pair of neighbouring labels
    neighbours = list(zip(langs, langs[1:]))
    switches = sum(1 for before, after in neighbours if after != before)
    stays = sum(1 for before, after in neighbours if after == before)

    if len(langs) > 1:
        i_index = switches / (len(langs) - 1)
        memory = stays / (len(langs) - 1)
    else:
        i_index = 0
        memory = 0

    # Lengths of consecutive same-language runs
    runs = [1]
    for before, after in neighbours:
        if after == before:
            runs[-1] += 1
        else:
            runs.append(1)

    if len(runs) > 1 and np.mean(runs) > 0:
        burstiness = np.std(runs) / np.mean(runs)
    else:
        burstiness = 0

    return {
        "CMI": round(cmi, 4),
        "M_index": round(m_index, 4),
        "I_index": round(i_index, 4),
        "Burstiness": round(burstiness, 4),
        "Memory": round(memory, 4)
    }

# Apply the metric function row by row, then expand the per-row dicts into columns
code_mixed = df["text"].apply(code_mixing_metrics)
metrics_df = pd.DataFrame(code_mixed.tolist())

# Save combined result.
# The path is held in one variable so the confirmation message always names
# the file that was actually written.
code_mixed_csv_path = "Full_code_mixed_complexity.csv"
result = pd.concat([df, metrics_df], axis=1)
result.to_csv(code_mixed_csv_path, index=False)

# Done
print(f"Code-mixed complexity metrics saved to '{code_mixed_csv_path}'")


✅ Code-mixed complexity metrics saved to 'sm_data_code_mixed_complexity.csv'


In [None]:
!pip install -q spacy

In [None]:
# Imports (spaCy itself is installed in the previous cell)

import pandas as pd
import numpy as np
import spacy
import gzip
import io
from collections import Counter

# spaCy pipeline, loaded through its installed model package
import en_core_web_sm
nlp = en_core_web_sm.load()

# Social media dataset: column names are trimmed and lowercased before the
# check that a 'text' column is present.
df = pd.read_csv(filepath_or_buffer="sm_data.csv")
df.columns = df.columns.str.strip().str.lower()
assert 'text' in df.columns, "Dataset must have a 'text' column."

# Yule's K-complexity function
def yules_k(text):
    """Compute Yule's K for one text.

    K = 10^4 * (sum_i(i^2 * V_i) - N) / N^2, where N is the number of word
    tokens and V_i is the number of distinct words that occur exactly i times.
    Words are the alphabetic tokens of the lowercased text.

    Args:
        text: The raw text; converted with ``str()`` before parsing.

    Returns:
        K rounded to 4 decimals, or 0 when the text has no alphabetic tokens.
    """
    doc = nlp(str(text).lower())
    words = [token.text for token in doc if token.is_alpha]
    if not words:
        return 0
    freq = Counter(words)
    N = sum(freq.values())
    freqs_of_freqs = Counter(freq.values())
    # Second moment of the frequency spectrum. The frequency must be squared:
    # sum(f * V_f) is just N again, which made K equal to 0 for every text.
    M = sum(f * f * f_count for f, f_count in freqs_of_freqs.items())
    K = (10_000 * (M - N)) / (N * N) if N > 0 else 0
    return round(K, 4)

# Gzip compression complexity
def gzip_ratio(text):
    """Return compressed size divided by raw size for the UTF-8 bytes of a text.

    Args:
        text: The raw text. It is converted with ``str()`` first, matching
            ``yules_k``, so missing values (NaN) or numeric cells no longer
            raise AttributeError on ``.encode``.

    Returns:
        The gzip-compressed length over the uncompressed length, rounded to
        4 decimals; 0.0 for an empty string. Very short inputs give ratios
        above 1 because of the fixed gzip header and trailer.
    """
    raw = str(text).encode('utf-8')
    if not raw:
        return 0.0
    with io.BytesIO() as bio:
        with gzip.GzipFile(fileobj=bio, mode='w') as f:
            f.write(raw)
        # Read the buffer only after the GzipFile is closed, so the trailer is included
        compressed = bio.getvalue()
    ratio = len(compressed) / len(raw)
    return round(ratio, 4)

# Apply to dataset (adds the two metric columns to df in place)
df["yules_k"] = df["text"].apply(yules_k)
df["gzip_complexity"] = df["text"].apply(gzip_ratio)

# Save output.
# The path is held in one variable so the confirmation message always names
# the file that was actually written.
comment_complexity_csv_path = "Full_comment_complexity.csv"
df.to_csv(comment_complexity_csv_path, index=False)
print(f"Complexity metrics (Yule's K and gzip) saved to '{comment_complexity_csv_path}'")


✅ Complexity metrics (Yule's K and gzip) saved to 'sm_data_comment_complexity.csv'
