# Feature Engineering

In [1]:
import os
import re
import json
import string

import spacy
import textstat
import numpy as np
import pandas as pd
from datasets import load_dataset

import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
np.random.seed(691)

In [3]:
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arabi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\arabi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
nlp = spacy.load("en_core_web_sm")

### Load Dataset

- HuggingFace: https://huggingface.co/datasets/ucsbnlp/liar
- Kaggle: https://www.kaggle.com/datasets/doanquanvietnamca/liar-dataset

In [5]:
# Loading dataset from 'HuggingFace'

# dataset = load_dataset('ucsbnlp/liar', trust_remote_code=True)

# train = dataset['train'].to_pandas()
# valid = dataset['validation'].to_pandas()
# test = dataset['test'].to_pandas()

In [6]:
# Loading dataset from 'Kaggle'

columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context']

train = pd.read_csv('data/raw/train.tsv', sep='\t', header=None)
validation = pd.read_csv('data/raw/valid.tsv', sep='\t', header=None)
test = pd.read_csv('data/raw/test.tsv', sep='\t', header=None)

train.columns = columns
validation.columns = columns
test.columns = columns

train.head(2)

Unnamed: 0,id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.


In [7]:
print(f'{train.shape = }')
print(f'{validation.shape = }')
print(f'{test.shape = }')

train.shape = (10240, 14)
valid.shape = (1284, 14)
test.shape = (1267, 14)


### Removing Columns

In [8]:
train.columns

Index(['id', 'label', 'statement', 'subject', 'speaker', 'job_title',
       'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts',
       'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts',
       'context'],
      dtype='object')

In [9]:
required_Columns = ['label', 'statement', 'subject', 'speaker', 'party_affiliation', 'context']

train = train[required_Columns]
validation = validation[required_Columns]
test = test[required_Columns]

print(f'{train.shape = }')
print(f'{validation.shape = }')
print(f'{test.shape = }')

train.shape = (10240, 6)
valid.shape = (1284, 6)
test.shape = (1267, 6)


In [10]:
train.head()

Unnamed: 0,label,statement,subject,speaker,party_affiliation,context
0,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,republican,a mailer
1,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,democrat,a floor speech.
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,democrat,Denver
3,false,Health care reform legislation is likely to ma...,health-care,blog-posting,none,a news release
4,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,democrat,an interview on CNN


In [11]:
train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10240 entries, 0 to 10239
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   label              10240 non-null  object
 1   statement          10240 non-null  object
 2   subject            10238 non-null  object
 3   speaker            10238 non-null  object
 4   party_affiliation  10238 non-null  object
 5   context            10138 non-null  object
dtypes: object(6)
memory usage: 5.1 MB


### Fixing `N/A` values

In [12]:
print("Null Value Counts:")

pd.DataFrame({
    "train": train.isnull().sum(),
    "validation": validation.isnull().sum(),
    "test": test.isnull().sum()
})

Null Value Counts:


Unnamed: 0,train,validation,test
label,0,0,0
statement,0,0,0
subject,2,0,0
speaker,2,0,0
party_affiliation,2,0,0
context,102,12,17


In [13]:
for df in [train, validation, test]:
    df.fillna('unknown', inplace=True)

print("Null Value Counts:")

pd.DataFrame({
    "train": train.isnull().sum(),
    "validation": validation.isnull().sum(),
    "test": test.isnull().sum()
})

Null Value Counts:


Unnamed: 0,train,validation,test
label,0,0,0
statement,0,0,0
subject,0,0,0
speaker,0,0,0
party_affiliation,0,0,0
context,0,0,0


### Cleaning `statement` and adding features

In [None]:
stop_words = set(stopwords.words('english'))

def clean_statement(text):
    text = text.lower()
    text.replace('-', ' ')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = text.split()
    filtered = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered)

for df in [train, validation, test]:
    df['statement_clean'] = df['statement'].apply(clean_statement)

In [15]:
for i in np.random.randint(0, 100, 3):
    print(train['statement'][i])
    print(train['statement_clean'][i], end='\n\n')

If you take the population growth here in Virginia, we are net zero on job creation since (Bob McDonnell) became governor.
take population growth virginia net zero job creation since bob mcdonnell became governor

Water rates in Manila, Philippines, were raised up to 845 percent when a subsidiary of the World Bank became a partial owner.
water rates manila philippines raised 845 percent subsidiary world bank became partial owner

African-American youth unemployment is 51 percent. Hispanic youth unemployment is 36 percent.
africanamerican youth unemployment 51 percent hispanic youth unemployment 36 percent



#### Readability Score

In [None]:
def compute_readability(text):
    return textstat.flesch_kincaid_grade(text)

for df in [train, validation, test]:
    df['readability_score'] = df['statement'].apply(compute_readability)

#### Number of Named Entities + Entity Types Present

In [None]:
def ner_features(text):
    doc = nlp(text)
    num_entities = len(doc.ents)
    entity_types = len(set([ent.label_ for ent in doc.ents]))
    return num_entities, entity_types

for df in [train, validation, test]:
    df[['num_named_entities', 'entity_types_present']] = df['statement'].apply(
        lambda x: pd.Series(ner_features(x))
    )

#### Has Citation Words

In [None]:
citation_phrases = [
    "according to", "reported by", "study shows", "research shows", "as per",
    "sources say", "claims that", "according", "accordingto"
]

def has_citation(text):
    text_lower = text.lower()
    return int(any(phrase in text_lower for phrase in citation_phrases))

for df in [train, validation, test]:
    df['has_citation_words'] = df['statement'].apply(has_citation)

#### Sentiment Analysis

In [None]:
sia = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    return sia.polarity_scores(text)['compound']

for df in [train, validation, test]:
    df['sentiment_score'] = df['statement_clean'].apply(get_sentiment_score)

#### TF-IDF Score Sum

In [None]:
# Fit TF-IDF on train['statement']
tfidf_vectorizer = TfidfVectorizer()
tfidf_train = tfidf_vectorizer.fit_transform(train['statement_clean'])

# Function to compute TF-IDF sum
def tfidf_sum(row_vector):
    return row_vector.sum()

# Compute for train
train['tfidf_score_sum'] = [tfidf_sum(tfidf_train[i]) for i in range(tfidf_train.shape[0])]

# Now apply same vectorizer to val/test
tfidf_valid = tfidf_vectorizer.transform(validation['statement_clean'])
validation['tfidf_score_sum'] = [tfidf_sum(tfidf_valid[i]) for i in range(tfidf_valid.shape[0])]

tfidf_test = tfidf_vectorizer.transform(test['statement_clean'])
test['tfidf_score_sum'] = [tfidf_sum(tfidf_test[i]) for i in range(tfidf_test.shape[0])]

#### Polarity Shift Words Count

In [None]:
polarity_shift_words = ["however", "but", "despite", "although", "yet", "though", "still", "nevertheless"]

def polarity_shift_count(text):
    tokens = text.split()
    return sum(1 for token in tokens if token in polarity_shift_words)

for df in [train, validation, test]:
    df['polarity_shift_words_count'] = df['statement_clean'].apply(polarity_shift_count)

#### Uncertainty Words Count

In [None]:
uncertainty_words = ["might", "could", "possibly", "unclear", "may", "suggests", "apparently", "potentially"]

def uncertainty_count(text):
    tokens = text.split()
    return sum(1 for token in tokens if token in uncertainty_words)

for df in [train, validation, test]:
    df['uncertainty_words_count'] = df['statement_clean'].apply(uncertainty_count)

#### Negation Words Count

In [None]:
negation_words = ["not", "never", "no", "none", "nobody", "nothing", "nowhere", "neither", "nor", "cannot"]

def negation_count(text):
    tokens = text.split()
    return sum(1 for token in tokens if token in negation_words)

for df in [train, validation, test]:
    df['negation_words_count'] = df['statement_clean'].apply(negation_count)

#### Removing `statement` and renaming `statement_clean` to `statement`

In [None]:
for df in [train, validation, test]:
    df.drop(columns=['statement'], inplace=True)
    df.rename(columns={'statement_clean': 'statement'}, inplace=True)

### Converting other column text to lowercase

In [None]:
for col in train.select_dtypes(include='object').columns:
    if col in ['label', 'statement']:
        continue
    train[col] = train[col].str.lower()
    validation[col] = validation[col].str.lower()
    test[col] = test[col].str.lower()

### Adding `subject_count`

In [None]:
for df in [train, validation, test]:
    df['subject_count'] = df['subject'].apply(lambda x: len(str(x).split(',')))

### Adding `label_id` column

In [None]:
label_to_int = json.load(open('utils/lable_mapping.json', 'rb'))

for df in [train, validation, test]:
    df['label_id'] = df['label'].map(label_to_int)

### Rearranging Columns

In [None]:
final_columns = [
    'statement',
    'subject',
    'speaker',
    'party_affiliation',
    'context',
    'subject_count',
    'readability_score',
    'num_named_entities',
    'entity_types_present',
    'has_citation_words',
    'polarity_shift_words_count',
    'uncertainty_words_count',
    'negation_words_count',
    'tfidf_score_sum',
    'sentiment_score',
    'label',
    'label_id'
]

train = train[final_columns]
validation = validation[final_columns]
test = test[final_columns]

In [29]:
train.head()

Unnamed: 0,statement,subject,speaker,party_affiliation,context,subject_count,readability_score,num_named_entities,entity_types_present,has_citation_words,polarity_shift_words_count,uncertainty_words_count,negation_words_count,tfidf_score_sum,sentiment_score,label,label_id
0,says annies list political group supports thir...,abortion,dwayne-bohac,republican,a mailer,1,9.081818,1,1,0,0,0,0,2.886169,0.25,false,1
1,decline coal start started natural gas took st...,"energy,history,job-accomplishments",scott-surovell,democrat,a floor speech.,3,7.773333,1,1,0,0,0,0,3.305099,0.3612,half-true,3
2,hillary clinton agrees john mccain voting give...,foreign-policy,barack-obama,democrat,denver,1,9.209474,4,2,0,0,0,0,3.394159,0.3182,mostly-true,4
3,health care reform legislation likely mandate ...,health-care,blog-posting,none,a news release,1,8.756667,0,0,0,0,0,0,3.089276,0.7579,false,1
4,economic turnaround started end term,"economy,jobs",charlie-crist,democrat,an interview on cnn,2,7.19,0,0,0,0,0,0,2.207947,0.0,half-true,3


In [30]:
train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10240 entries, 0 to 10239
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   statement                   10240 non-null  object 
 1   subject                     10240 non-null  object 
 2   speaker                     10240 non-null  object 
 3   party_affiliation           10240 non-null  object 
 4   context                     10240 non-null  object 
 5   subject_count               10240 non-null  int64  
 6   readability_score           10240 non-null  float64
 7   num_named_entities          10240 non-null  int64  
 8   entity_types_present        10240 non-null  int64  
 9   has_citation_words          10240 non-null  int64  
 10  polarity_shift_words_count  10240 non-null  int64  
 11  uncertainty_words_count     10240 non-null  int64  
 12  negation_words_count        10240 non-null  int64  
 13  tfidf_score_sum             102

### Saving Data

In [None]:
os.makedirs('data', exist_ok=True)

train.to_csv('data/train.csv', index=False)
validation.to_csv('data/validation.csv', index=False)
test.to_csv('data/test.csv', index=False)

---