In [None]:
# Mount Google Drive at /content/drive so files under /content/drive/MyDrive
# (where the CSV loaded in the next cell lives) can be read.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell is expected to fail elsewhere — confirm before reuse.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

# Location of the processed CSV on the mounted Google Drive.
file_path = "/content/drive/MyDrive/CYON_Analysis_Materials/integrated_simul_generation_Oct18_PROCESSED.csv"

# Read the CSV file into DataFrame df1.
# Every later cell needs df1, so a failed load prints a hint and then re-raises.
# Swallowing the error would leave df1 undefined (a confusing NameError in the
# next cell) or, worse, silently keep a stale df1 from an earlier run.
try:
    df1 = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    print("Please check the file path and try again.")
    raise
except Exception as e:
    print(f"An error occurred while loading the file: {e}")
    raise
else:
    # Only reached when the load succeeded, so a problem while printing the
    # preview is not misreported as a loading failure.
    print("File loaded successfully.")
    print("First 5 rows of your data:")
    print(df1.head())
    print("\nDataFrame Info:")
    df1.info()

In [None]:
# Clean S1–S4: remove a leading "The " (case-insensitive) and surrounding
# whitespace, and turn blank / missing answers into pd.NA.
#
# The missing-value mask is taken BEFORE converting to str. After a first run
# the columns contain pd.NA, and .astype(str) would turn those into the literal
# text '<NA>', which would then be counted as a media name. Keeping the mask
# makes this cell safe to re-run.
for col in ['S1', 'S2', 'S3', 'S4']:
    raw = df1[col]
    was_missing = raw.isna()
    cleaned = (
        raw
        .astype(str)                                             # Ensure all are strings
        .str.replace(r'^\s*The\s+', '', case=False, regex=True)  # Remove "The " (case-insensitive)
        .str.strip()                                             # Remove leading/trailing spaces
    )
    # Originally-missing cells, empty strings and the text "nan" all become NA.
    cleaned.loc[was_missing | cleaned.isin(['nan', ''])] = pd.NA
    df1[col] = cleaned

In [None]:
# 1️⃣ Overall frequency table (S1–S4 pooled)
# Stack the four media columns end to end (S1 first, S4 last) into one Series.
total_media = pd.concat([df1[source] for source in ('S1', 'S2', 'S3', 'S4')])

# Tally how often each media name occurs, ignoring missing entries.
total_freq = total_media.dropna().value_counts()

# Show the heading followed by the table.
print("Total Frequency Table:", total_freq, sep="\n")


In [None]:
# 2️⃣ One frequency table per variable (S1–S4)
# Each table counts the non-missing media names in a single column.
freq_S1, freq_S2, freq_S3, freq_S4 = (
    df1[source].value_counts(dropna=True) for source in ('S1', 'S2', 'S3', 'S4')
)

# Print every table under its own heading, in S1..S4 order.
for col_name, col_freq in (
    ('S1', freq_S1),
    ('S2', freq_S2),
    ('S3', freq_S3),
    ('S4', freq_S4),
):
    print(f"\nFrequency Table - {col_name}:")
    print(col_freq)


In [None]:
# 3️⃣ Frequency plots (bar graphs) for total and each S variable
# Define a function for quick plotting
def plot_freq(series, title, top_n=20):
    """Draw a vertical bar chart of the leading entries of a frequency table.

    NOTE: a later cell in this notebook defines another ``plot_freq`` (a
    seaborn version); once that cell has run, it replaces this function.

    Args:
        series: Frequency table to plot (index = labels, values = counts),
            e.g. the result of ``value_counts()``. Rows are drawn in the
            order given, so pass an already-sorted table.
        title: Figure title.
        top_n: Number of leading rows to draw. Defaults to 20 to keep the
            x-axis readable.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    top = series.head(top_n)
    if top.empty:
        # pandas' bar plot raises on an empty Series; skip with a message so
        # one empty table does not abort a whole loop of plots.
        print(f"No data to plot for: {title}")
        return

    # Explicit figure/axes instead of relying on pyplot's "current figure".
    fig, ax = plt.subplots(figsize=(8, 4))
    top.plot(kind='bar', ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Media Name")
    ax.set_ylabel("Frequency")
    # Slanted, right-aligned labels so long media names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

# Draw the pooled chart first, then one chart per S variable.
for series_to_plot, plot_title in [
    (total_freq, "Total Media Frequency (S1–S4 Combined)"),
    (freq_S1, "Media Frequency - S1"),
    (freq_S2, "Media Frequency - S2"),
    (freq_S3, "Media Frequency - S3"),
    (freq_S4, "Media Frequency - S4"),
]:
    plot_freq(series_to_plot, plot_title)


In [None]:
# 4️⃣ Frequency tables per DEM8 group (per the original note: Democrat vs. Republican)
# For each DEM8 value, pool that group's S1–S4 answers and count the
# non-missing media names. Keys of group_tables are the DEM8 values.
group_tables = {
    dem8_value: pd.concat(
        [members[source] for source in ('S1', 'S2', 'S3', 'S4')]
    ).dropna().value_counts()
    for dem8_value, members in df1.groupby('DEM8')
}

# Print the tables in the same order the groups were produced.
for dem8_value, group_freq in group_tables.items():
    print(f"\nFrequency Table for {dem8_value}:")
    print(group_freq)

In [None]:
# One bar chart per DEM8 group, using the tables stored in group_tables.
for dem8_value in group_tables:
    plot_freq(group_tables[dem8_value], f"Media Frequency for {dem8_value}")

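FANCY OPTIONS: seaborn-styled versions of the frequency plots above. Note that the next cell redefines `plot_freq`, replacing the matplotlib version defined earlier.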

In [None]:
import seaborn as sns

# Set a clean, professional style: seaborn's "whitegrid" theme.
# NOTE(review): sns.set changes global plotting defaults, so figures drawn
# after this cell runs (including re-runs of earlier cells) will presumably
# pick up the new style — confirm this is intended.
sns.set(style="whitegrid")

def plot_freq(series, title, color_palette="viridis", top_n=20):
    """Draw a horizontal seaborn bar chart of the leading entries of a frequency table.

    NOTE: this redefines the matplotlib ``plot_freq`` from an earlier cell;
    after this cell runs, every ``plot_freq`` call uses this version.

    Args:
        series: Frequency table to plot (index = media names, values =
            counts), e.g. the result of ``value_counts()``. Rows are drawn
            in the order given, so pass an already-sorted table.
        title: Figure title.
        color_palette: Seaborn palette (name or list of colors) used to
            color the bars.
        top_n: Number of leading rows to draw. Defaults to 20 for clarity.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    top = series.head(top_n)
    if top.empty:
        # Nothing to draw; skip with a message so one empty table does not
        # abort a whole loop of plots.
        print(f"No data to plot for: {title}")
        return

    # Convert to a two-column DataFrame for seaborn.
    df_plot = top.reset_index()
    df_plot.columns = ['Media', 'Frequency']

    fig, ax = plt.subplots(figsize=(9, 5))
    sns.barplot(
        data=df_plot,
        y='Media',
        x='Frequency',
        # seaborn 0.13 deprecates `palette` without `hue`; coloring by the
        # same variable as the y-axis gives the same one-color-per-bar look.
        hue='Media',
        dodge=False,  # keep full-width bars even where seaborn would dodge by hue
        palette=color_palette,
        edgecolor='black',
        ax=ax,
    )
    # A hue legend would only repeat the y-axis labels; drop it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    ax.set_title(title, fontsize=14, fontweight='bold', pad=12)
    ax.set_xlabel("Frequency", fontsize=12)
    ax.set_ylabel("Media Name", fontsize=12)
    fig.tight_layout()
    plt.show()


In [None]:
# Redraw the pooled chart and the four per-variable charts with the current plot_freq.
for series_to_plot, plot_title in [
    (total_freq, "Total Media Frequency (S1–S4 Combined)"),
    (freq_S1, "Media Frequency - S1"),
    (freq_S2, "Media Frequency - S2"),
    (freq_S3, "Media Frequency - S3"),
    (freq_S4, "Media Frequency - S4"),
]:
    plot_freq(series_to_plot, plot_title)

In [None]:
# One chart per DEM8 group, titled with the group value.
for dem8_value in group_tables:
    plot_freq(group_tables[dem8_value], f"Media Frequency for {dem8_value} Respondents")