# Data Sampling

## Twitter Info

In [24]:
import pandas as pd

# Load the cleaned Twitter export into a DataFrame
df_twitter = pd.read_csv('../../data/clean/twitter_data.csv')

# Overview report: each entry pairs a heading with a callable producing the
# summary printed under it. The callables are evaluated lazily, one at a
# time, so every heading is printed before its summary is computed.
overview_sections = [
    ("Data preview (first 5 rows):", lambda frame: frame.head()),
    ("\nData shape (rows, columns):", lambda frame: frame.shape),
    ("\nColumn names:", lambda frame: frame.columns.tolist()),
    ("\nData types:", lambda frame: frame.dtypes),
    ("\nBasic statistical description:", lambda frame: frame.describe()),
    ("\nMissing value statistics:", lambda frame: frame.isnull().sum()),
    # Count of distinct values in each column
    ("\nNumber of unique values per column:", lambda frame: frame.nunique()),
]

for heading, compute in overview_sections:
    print(heading)
    print(compute(df_twitter))

Data preview (first 5 rows):
            Query Name                   Date  \
0  UnitedHealth_2month  2024-11-12 17:59:45.0   
1  UnitedHealth_2month  2024-11-12 17:59:02.0   
2  UnitedHealth_2month  2024-11-12 17:57:58.0   
3  UnitedHealth_2month  2024-11-12 17:57:07.0   
4  UnitedHealth_2month  2024-11-12 17:55:04.0   

                                           Full Text       Location Name  \
0  RT @CBSNews With ACA subsidies set to expire i...             AZ, USA   
1  RT @cromwick Moreover, isn't this 'rope was in...    Atlanta, GA, USA   
2  Cute girl sparked up a conversation with me ab...  Charlotte, NC, USA   
3  @unusual_whales Sad for those that wish they c...    Seattle, WA, USA   
4  RT @mfcannon Dear Congress,\n\nThese lobbyists...             VA, USA   

  Account Type           Author  Impact  Impressions  Engagement Score  \
0   individual  SafeH2o4Schools    36.1        13919               0.0   
1   individual     tgordonvideo     0.7         1769               0.0 

In [25]:
# Parse 'Date' into datetime values so min/max compare chronologically
df_twitter['Date'] = pd.to_datetime(df_twitter['Date'])

# Earliest and latest timestamps present in the data
tweet_dates = df_twitter['Date']
min_date, max_date = tweet_dates.min(), tweet_dates.max()

print(f"Date range from {min_date} to {max_date}")

Date range from 2024-11-04 10:01:00 to 2025-01-04 09:58:16


## Stratified Sampling Twitter

In [26]:
import pandas as pd
import numpy as np

# Single seed used for every random draw in this cell.
random_seed = 0

# Seed NumPy's global RNG as well. The sampling below passes random_state
# explicitly, so it does not depend on this call; it is kept so that any
# other code relying on the global RNG stays reproducible.
np.random.seed(random_seed)


def stratified_sample_by_date(df, date_col='Date_only', target_size=1000, seed=0):
    """Draw about ``target_size`` rows from ``df`` with every date represented.

    Allocation per distinct value of ``date_col``:
      * one guaranteed row, plus
      * ``int(share_of_rows * (target_size - n_dates))`` further rows,
        capped at the number of rows available for that date.

    Because the proportional part is rounded down, the stratified draw usually
    falls short of ``target_size``; the gap is filled with rows drawn at random
    from everything not yet selected. If the stratified draw overshoots
    (e.g. there are more dates than ``target_size``), it is trimmed at random,
    which can remove a date's only row.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows. Index labels are used to tell selected rows from
        unselected ones, so rows sharing a label with a selected row are
        excluded from the top-up draw.
    date_col : str
        Column holding the stratification key. Rows whose key is missing
        are skipped by the per-date draw but may still be picked by the top-up.
    target_size : int
        Desired number of rows in the result.
    seed : int
        ``random_state`` passed to every ``sample`` call.

    Returns
    -------
    pandas.DataFrame
        The sampled rows (all original columns, original index labels). Has
        fewer than ``target_size`` rows only when ``df`` cannot supply enough.
    """
    total = len(df)
    n_groups = len(df[date_col].unique())
    # Budget left for proportional allocation after the one-per-date minimum
    proportional_budget = target_size - n_groups

    # Collect the per-date draws and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all prior rows every time.
    pieces = []
    # sort=False keeps dates in order of first appearance
    for _, day_rows in df.groupby(date_col, sort=False):
        day_size = len(day_rows)
        proportional_size = int((day_size / total) * proportional_budget)
        # At least 1 row, never more than the date actually has
        n_take = max(1, min(proportional_size + 1, day_size))
        pieces.append(day_rows.sample(n=n_take, random_state=seed))

    # pd.concat raises on an empty list, so fall back to an empty slice of df
    picked = pd.concat(pieces) if pieces else df.iloc[0:0]

    shortfall = target_size - len(picked)
    if shortfall > 0:
        # Under target: top up from rows that were not selected yet, but never
        # ask for more rows than remain (sample() would raise otherwise).
        leftover = df.loc[~df.index.isin(picked.index)]
        n_extra = min(shortfall, len(leftover))
        if n_extra:
            picked = pd.concat([picked, leftover.sample(n=n_extra, random_state=seed)])
    elif shortfall < 0:
        # Over target: randomly reduce to the requested size
        picked = picked.sample(n=target_size, random_state=seed)

    return picked


# Convert Date to datetime format and extract the calendar-date part,
# which serves as the stratification key
df_twitter['Date'] = pd.to_datetime(df_twitter['Date'])
df_twitter['Date_only'] = df_twitter['Date'].dt.date

# Fixed target sample size
target_sample_size = 1000

# Totals used for reporting below
total_rows = len(df_twitter)
unique_dates = df_twitter['Date_only'].unique()
n_dates = len(unique_dates)

# Stratified draw; the helper key column is dropped from the sample only
# (df_twitter itself keeps 'Date_only')
sampled_df = stratified_sample_by_date(
    df_twitter,
    date_col='Date_only',
    target_size=target_sample_size,
    seed=random_seed,
).drop(columns='Date_only')

# Save the result
sampled_df.to_csv('../../data/clean/twitter_sample.csv', index=False)

# Verify the results
print(f"Original data rows: {total_rows}")
print(f"Target sample size: {target_sample_size}")
print(f"Actual sampled rows: {len(sampled_df)}")
print(f"Sampling ratio: {len(sampled_df)/total_rows:.2%}")
print(f"Number of unique dates - original data: {n_dates}")
print(f"Number of unique dates - sampled data: {len(sampled_df['Date'].dt.date.unique())}")

Original data rows: 702699
Target sample size: 1000
Actual sampled rows: 1000
Sampling ratio: 0.14%
Number of unique dates - original data: 62
Number of unique dates - sampled data: 62


## Reddit Info

In [13]:
import pandas as pd

# Load the cleaned Reddit export into a DataFrame
df_reddit = pd.read_csv('../../data/clean/reddit_data.csv')

# Overview report: each entry pairs a heading with a callable producing the
# summary printed under it. The callables are evaluated lazily, one at a
# time, so every heading is printed before its summary is computed.
overview_sections = [
    ("Data preview (first 5 rows):", lambda frame: frame.head()),
    ("\nData shape (rows, columns):", lambda frame: frame.shape),
    ("\nColumn names:", lambda frame: frame.columns.tolist()),
    ("\nData types:", lambda frame: frame.dtypes),
    ("\nBasic statistical description:", lambda frame: frame.describe()),
    ("\nMissing value statistics:", lambda frame: frame.isnull().sum()),
    # Count of distinct values in each column
    ("\nNumber of unique values per column:", lambda frame: frame.nunique()),
]

for heading, compute in overview_sections:
    print(heading)
    print(compute(df_reddit))

Data preview (first 5 rows):
        id  type parent_id                                               text  \
0  1hq8tz0  post       NaN  More LM subs being mass censored by power hung...   
1  1hpi74v  post       NaN  Prison inmates show solidarity with Luigi Mang...   
2  1hphx7f  post       NaN  Censoring the Luigi Mangione information on Re...   
3  1hphwjh  post       NaN             Luigi censorship: why they are afraid?   
4  1hphwzw  post       NaN            The Luigi censorship; a running thread.   

     author  subreddit  score  upvote_ratio  num_comments  \
0  libghost  FR33LUIGI      3          0.80           0.0   
1  libghost  FR33LUIGI      3          0.71           0.0   
2  libghost  FR33LUIGI      3          0.80           0.0   
3  libghost  FR33LUIGI      3          1.00           0.0   
4  libghost  FR33LUIGI      1          0.60           0.0   

                                                 url  
0  https://reddit.com/r/FR33LUIGI/comments/1hq8tz...  
1  http