In [None]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
import ast

In [None]:
# Read word2vec_vectors.csv and parse the text stored in its 'Vector' column
# back into Python objects (kept in a new 'Vector2' column).
vectors_csv_path = "/Users/barnabasepres/University/TDK/trump_speech_analysis/word2vec_vectors.csv"
df = pd.read_csv(vectors_csv_path)
df['Vector2'] = df['Vector'].map(ast.literal_eval)
def getVectorList(vector):
    """Return the items of the first element of ``vector`` as a new list.

    Args:
        vector: An indexable object whose element at position 0 is iterable.

    Returns:
        A freshly built list holding the items of ``vector[0]`` in order.
    """
    return list(vector[0])

# Unwrap every parsed vector: keep only the items of its first element.
df['Vector3'] = df['Vector2'].map(getVectorList)

In [None]:
# Load the modelling table that the word-vector features are merged into further down.
df_for_model = pd.read_excel("data_for_rfmodel.xlsx")

# DataFrame.drop returns a new frame instead of modifying the original, so the
# result has to be assigned back; otherwise the tf-idf columns stay in df_for_model.
df_for_model = df_for_model.drop(columns=["tfidf1", "tfidf2", "tfidf3", "tfidf4", "tfidf5"])
df_for_model

In [None]:
# Text identifier built from the numeric 'textid' column, e.g. 'text1'.
df['textid2'] = 'text' + df['textid'].astype(str)

num_vectors = 5
# Floor division: trailing rows that do not fill a complete group of
# `num_vectors` are left out of the reshape below.
num_rows = df.shape[0] // num_vectors

vector_columns = [f'Vector{i+1}' for i in range(num_vectors)]

# Reshape column 8 of df (selected by position) so that every consecutive run
# of five rows becomes one row with the columns Vector1..Vector5.
# NOTE(review): position 8 presumably is the parsed vector column - confirm, and
# prefer selecting it by name so a changed CSV layout cannot shift it silently.
vector_for_models = pd.DataFrame(
    df.iloc[:num_rows * num_vectors, 8].values.reshape(num_rows, num_vectors),
    columns=vector_columns,
)

vector_for_models.head(1)

In [None]:
# Number the reshaped rows 1..n and turn them into 'text<n>' keys. The count is
# taken from the frame itself, so the cell no longer fails when the number of
# documents differs from the previously hard-coded 73.
vector_for_models['doc_id'] = np.arange(1, len(vector_for_models) + 1)
vector_for_models['doc_id'] = 'text' + vector_for_models['doc_id'].astype(str)

# Inner join (pandas default): only doc_id values present in both frames survive.
df_for_model_final = pd.merge(df_for_model, vector_for_models, on='doc_id')

# Collapse each vector cell to a single number: the mean of its components.
for vector_number in range(1, 6):
    source_column = f'Vector{vector_number}'
    df_for_model_final[f'{source_column}_mean'] = df_for_model_final[source_column].apply(np.mean)

df_for_model_final.to_csv("df_for_model_final_hpfilter.csv")

# Preview goes last so the notebook actually renders it.
df_for_model_final.head(2)

In [None]:
# get packages for random forest models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import shap
import matplotlib.pyplot as plt

In [None]:
# Random-forest regression with the "positive" sentiment score as the target.

# Replace the values of the 'state' column with integer codes.
encoder = LabelEncoder()
df_for_model_final['state'] = encoder.fit_transform(df_for_model_final['state'])

y = "positive"
feature_columns = [
    "day", "day_of_the_week", "state", "nth_speech_in_state", "CTTR",
    "scale", "popularity", "frequency",
    "Vector1_mean", "Vector2_mean", "Vector3_mean", "Vector4_mean", "Vector5_mean",
]
y_data = df_for_model_final[y]
X_data = df_for_model_final[feature_columns]

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.3, random_state=2024
)

rf_model = RandomForestRegressor(
    n_estimators=100, random_state=2024, max_depth=5, min_samples_leaf=6
)
rf_model.fit(X_train, y_train)

# Predictions on both partitions so test and training fit can be compared.
y_pred = rf_model.predict(X_test)
y_pred_on_train = rf_model.predict(X_train)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
mse2, r2_train = (
    mean_squared_error(y_train, y_pred_on_train),
    r2_score(y_train, y_pred_on_train),
)
# Printed order: test MSE, train MSE, test R^2, train R^2.
print(mse, mse2, r2, r2_train)

In [None]:
# Replace the row labels of both targets with 0..n-1 so the actual values are
# drawn against the same positions as the prediction array.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Predicted vs. actual target on the training rows.
fig, ax = plt.subplots()
ax.plot(y_pred_on_train, label='Predicted positive sentiment score')
ax.plot(y_train, label='Actual positive sentiment score')
ax.legend()


In [None]:
# Explain rf_model's predictions on the training rows, using 10 sampled
# training rows as the background data.
X10 = shap.utils.sample(X_train, 10)
explainer = shap.Explainer(rf_model.predict, masker=X10)
shap_values = explainer(X_train)

# Per-feature contributions for the row at position 1, then the overall summary.
row_explanation = shap_values[1]
shap.plots.waterfall(row_explanation, max_display=14)
shap.plots.beeswarm(shap_values)

In [None]:
# NOTE(review): `text_auto_precision` does not appear to be a documented option
# of shap.plots; this assignment most likely has no effect on the figures - confirm.
shap.plots.text_auto_precision = 4

# Print NumPy floats with 4 decimals and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

# Background sample (10 training rows, fixed seed) and SHAP values for the whole
# training set. This is computed once: the earlier duplicate run of these three
# statements was overwritten before being used and only cost time.
X10 = shap.utils.sample(X_train, 10, random_state=42)
explainer = shap.Explainer(rf_model.predict, X10)
shap_values = explainer(X_train)

# Waterfall plot for the training row at position 1. show=False keeps the
# figure open so a title can be added before it is rendered.
plt.figure(figsize=(10, 6))
shap.plots.waterfall(shap_values[1], max_display=14, show=False)
plt.title("SHAP Waterfall Plot with High-Precision Values", fontsize=14)
plt.tight_layout()
plt.show()

# Beeswarm plot summarising the SHAP values of all explained rows.
plt.figure(figsize=(12, 6))
shap.plots.beeswarm(shap_values, max_display=14, show=False)
plt.title("SHAP Beeswarm Plot — Feature Importance Overview", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Model output for the first training row only.
first_training_row = X_train.head(1)
pred = rf_model.predict(first_training_row)
pred

In [None]:
# Hand-drawn waterfall chart for one explained row, labelled with 4 decimals.

# Colour constants for the bars.
shap_blue = "#008bfb" # #0080FF
shap_red = "#ff0051"

# Explanation of the row at position 0 of `shap_values`.
shap_val = shap_values[0]
base_value = shap_val.base_values
contribs = shap_val.values
feature_names = shap_val.feature_names
feature_vals = shap_val.data

# One record per feature. The frame gets its own name so that it does not
# overwrite the notebook-level `df` (the word2vec table used by earlier cells).
contrib_df = pd.DataFrame({
    'Feature': feature_names,
    'SHAP': contribs,
    'Value': feature_vals
})

# Largest absolute contribution first; keep at most 14 rows and renumber them
# 0..n-1 so the index doubles as the bar position.
order = contrib_df['SHAP'].abs().sort_values(ascending=False).index
contrib_df = contrib_df.reindex(order).head(14).reset_index(drop=True)

# Positive contributions are drawn blue, everything else red.
# NOTE(review): shap's built-in plots appear to use the opposite mapping
# (red = positive) - confirm that the inversion here is intended.
colors = [shap_blue if val > 0 else shap_red for val in contrib_df['SHAP']]

# Left edge of each bar: base value plus the sum of all preceding contributions.
cumulative = base_value + contrib_df['SHAP'].cumsum() - contrib_df['SHAP']

plt.figure(figsize=(10, 6))
for position, (shap_value, left_edge, color) in enumerate(
        zip(contrib_df['SHAP'], cumulative, colors)):
    plt.barh(position, shap_value, left=left_edge, color=color)
    # Value label centred inside the bar.
    plt.text(left_edge + shap_value / 2, position, f"{shap_value:.4f}",
             va='center', ha='center', fontsize=9, color='black')

# Y-axis labels: feature name together with the feature's value for this row.
plt.yticks(
    ticks=range(len(contrib_df)),
    labels=[f"{f} = {v:.4f}" for f, v in zip(contrib_df['Feature'], contrib_df['Value'])]
)

plt.title("Individual feature importance", fontsize=14)
plt.xlabel("Prediction Contribution")
plt.gca().invert_yaxis()  # first (largest) contribution at the top
plt.tight_layout()
plt.show()

print("SHAP values explain this instance:")
print(pd.Series(feature_vals, index=feature_names))

In [None]:
# List the column labels of df_for_model.
df_for_model.columns

In [None]:
# Random-forest regression with 'popularity_tminus1' as the target.
y = "popularity_tminus1"
feature_columns = [
    "day", "state", "nth_speech_in_state", "CTTR",
    "scale", "positive", "frequency",
    "Vector1_mean", "Vector2_mean", "Vector3_mean", "Vector4_mean", "Vector5_mean",
]
y_data = df_for_model_final[y]
X_data = df_for_model_final[feature_columns]

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.3, random_state=2024
)

rf_model = RandomForestRegressor(
    n_estimators=100, random_state=2024, max_depth=5, min_samples_leaf=6
)
rf_model.fit(X_train, y_train)

# Predictions on both partitions so test and training fit can be compared.
y_pred = rf_model.predict(X_test)
y_pred_on_train = rf_model.predict(X_train)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
mse2, r2_for_train = (
    mean_squared_error(y_train, y_pred_on_train),
    r2_score(y_train, y_pred_on_train),
)
# Printed order: test MSE, train MSE, test R^2, train R^2.
print(mse, mse2, r2, r2_for_train)

In [None]:
# Explain rf_model's predictions on the held-out rows, using 10 sampled test
# rows as the background data.
X10 = shap.utils.sample(X_test, 10)
explainer = shap.Explainer(rf_model.predict, masker=X10)
shap_values = explainer(X_test)

# Per-feature contributions for the row at position 2, then the overall summary.
row_explanation = shap_values[2]
shap.plots.waterfall(row_explanation, max_display=14)
shap.plots.beeswarm(shap_values)

In [None]:
# Random-forest regression with 'CTTR' as the target.
y = "CTTR"
feature_columns = [
    "day", "day_of_the_week", "state", "nth_speech_in_state", "scale",
    "popularity", "positive", "frequency",
    "Vector1_mean", "Vector2_mean", "Vector3_mean", "Vector4_mean", "Vector5_mean",
]
y_data = df_for_model_final[y]
X_data = df_for_model_final[feature_columns]

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.3, random_state=2024
)

# Unlike the other models in this notebook, no depth or leaf-size limit is set here.
rf_model = RandomForestRegressor(n_estimators=100, random_state=2024)
rf_model.fit(X_train, y_train)

# Test-set error and R^2 only.
y_pred = rf_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(mse, r2)

In [None]:
# NOTE(review): `text_auto_precision` does not appear to be a documented option
# of shap.plots; this assignment most likely has no effect on the figures - confirm.
shap.plots.text_auto_precision = 4

# Print NumPy floats with 4 decimals and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

# Background sample (10 training rows, fixed seed) and SHAP values for the whole
# training set. This is computed once: the earlier duplicate run of these three
# statements was overwritten before being used and only cost time.
X10 = shap.utils.sample(X_train, 10, random_state=42)
explainer = shap.Explainer(rf_model.predict, X10)
shap_values = explainer(X_train)

# Waterfall plot for the training row at position 1. show=False keeps the
# figure open so a title can be added before it is rendered.
plt.figure(figsize=(10, 6))
shap.plots.waterfall(shap_values[1], max_display=14, show=False)
plt.title("SHAP Waterfall Plot with High-Precision Values", fontsize=14)
plt.tight_layout()
plt.show()

# Beeswarm plot summarising the SHAP values of all explained rows.
plt.figure(figsize=(12, 6))
shap.plots.beeswarm(shap_values, max_display=14, show=False)
plt.title("SHAP Beeswarm Plot — Feature Importance Overview", fontsize=14)
plt.tight_layout()
plt.show()