In [16]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as split

In [17]:
# Path to the JSON file
file_path ="../data/headline_data/headlines_syntactic_features.csv"

# Reading the JSON file into a DataFrame

data = pd.read_csv(file_path)
data.head()  # Display the first few rows of the DataFrame

# Drop irrelevant columns
data = data.drop(columns=['pos_tags', 'syntax_tree'])



In [18]:
X = data.iloc[:, 1:]  
y = data['is_sarcastic']

# Split the data into training, validation, and test sets
SEED = 42
train_headlines, other_headlines = split(data, test_size=0.3, random_state=SEED)  # 70% training
val_headlines, test_headlines = split(other_headlines, test_size=0.5, random_state=SEED)  # 15% validation, 15% test

# Prepare data for the Logistic Regression model
# Extract the corresponding feature values for train, validation, and test
X_train = X.loc[train_headlines.index]
y_train = y.loc[train_headlines.index]
X_val = X.loc[val_headlines.index]
y_val = y.loc[val_headlines.index]
X_test = X.loc[test_headlines.index]
y_test = y.loc[test_headlines.index]



In [19]:

synX = X.drop(columns=['headline', 'article_link'])  # Drop text columns


# ANOVA F-test
select_k_best = SelectKBest(score_func=f_classif, k='all')
anova_scores = select_k_best.fit(synX, y).scores_

# Recursive Feature Elimination (RFE) with Logistic Regression
log_model = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=log_model, n_features_to_select=5)  # Select top 5 features
rfe.fit(synX, y)
rfe_ranks = rfe.ranking_

# Feature Importance from Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(synX, y)
rf_importances = rf_model.feature_importances_

# Scaler
scaler = MinMaxScaler()

# Scale to [0, 1]
anova_scaled = scaler.fit_transform(anova_scores.reshape(-1, 1)).flatten()
rfe_scaled = scaler.fit_transform((1 / rfe_ranks).reshape(-1, 1)).flatten()  # Inverse for ranking
rf_scaled = scaler.fit_transform(rf_importances.reshape(-1, 1)).flatten()

feature_scores = {
    'ANOVA_F': anova_scaled,
    'RFE': rfe_scaled,
    'RandomForest': rf_scaled
}
importance_df = pd.DataFrame(feature_scores, index=synX.columns)

# Add average score
importance_df['Average_Score'] = importance_df.mean(axis=1)
importance_df = importance_df.sort_values(by='RFE', ascending=False)

print("Feature Importance Scores:")
print(importance_df)

# Select features based on threshold
selected_features = importance_df[importance_df['RFE'] > 0.05].index.tolist()
display(selected_features)




Feature Importance Scores:
                         ANOVA_F       RFE  RandomForest  Average_Score
ratio_func_words        1.000000  1.000000      0.822441       0.940814
ratio_unique_pos        0.365082  1.000000      0.560617       0.641899
ratio_verb              0.556911  1.000000      0.552047       0.702986
ratio_adv               0.276317  1.000000      0.223174       0.499830
ratio_pron              0.142221  1.000000      0.170824       0.437682
ratio_propn             0.145956  0.444444      0.390643       0.327014
ratio_adj               0.104327  0.259259      0.324332       0.229306
average_surprisingness  0.034888  0.166667      1.000000       0.400518
branching_factor        0.423827  0.111111      0.539102       0.358014
reversed_probability    0.000000  0.074074      0.984023       0.352699
ratio_noun              0.292966  0.047619      0.462218       0.267601
syntactic_depth         0.286037  0.027778      0.298940       0.204252
pos_repetitions         0.008487  0.0

['ratio_func_words',
 'ratio_unique_pos',
 'ratio_verb',
 'ratio_adv',
 'ratio_pron',
 'ratio_propn',
 'ratio_adj',
 'average_surprisingness',
 'branching_factor',
 'reversed_probability']

In [20]:
def get_top_features(X_train, importance_df, method):
    """
    Returns the X data (including the headlines) that contains the top 5 syntactic features
    based on the specified feature selection method.
    
    Parameters:
    X_train (pd.DataFrame): The training data including headlines.
    importance_df (pd.DataFrame): DataFrame containing feature importance scores.
    method (str): The feature selection method ('ANOVA_F', 'RFE', 'RandomForest').
    
    Returns:
    pd.DataFrame: DataFrame containing the top 5 syntactic features and headlines.
    """
    if method not in importance_df.columns:
        raise ValueError(f"Method {method} not found in importance_df columns.")
    
    # Select the top 5 features based on the specified method
    top_features = importance_df.nlargest(5, method).index.tolist()
    
    # Include the 'headline' column
    top_features_with_headline = ['headline'] + top_features
    
    # Return the X data with the selected features
    return X_train[top_features_with_headline]


In [21]:

top_features_df = get_top_features(X_train, importance_df, 'RandomForest')
display(top_features_df.head())

Unnamed: 0,headline,average_surprisingness,reversed_probability,ratio_func_words,ratio_unique_pos,ratio_verb
12170,american express to offer 5 months of paternit...,0.870541,0.185371,0.272727,0.636364,0.272727
28552,watch: dolphin knocks stand-up paddleboarder o...,0.927349,0.517047,0.363636,0.545455,0.272727
6883,man who enjoys thing informed he is wrong,0.98149,0.879098,0.125,0.625,0.25
28387,jonathan lipnicki to star as young 'dark helme...,0.921614,0.301929,0.384615,0.538462,0.153846
12932,publicist worried kanye west's support of trum...,0.808175,0.004917,0.272727,0.5,0.181818
