In [None]:
# Replace 'ola.json' (assigned to file_path further down) with your actual file path if needed
# Data handling
import json
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Sentiment analysis (TextBlob)
# !pip install textblob --quiet  # run this only once in a notebook cell to install
from textblob import TextBlob

# For handling the mode calculation
from statistics import mode

# (Optional) to suppress warnings in a notebook
import warnings
warnings.filterwarnings('ignore')
# Location of the JSON export to analyse.
file_path = 'ola.json'

# Parse the whole file into Python objects, decoding it as UTF-8.
with open(file_path, mode='r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Build the DataFrame that the rest of the analysis works on.
df = pd.DataFrame(data)
# 2(a) Print mean, median and mode for each engagement metric column.
print("=== 2(a) Basic Statistics for Engagement Metrics ===")
engagement_metrics = ['likes', 'shares', 'views']

for metric in engagement_metrics:
    # Coerce the column to integers so the statistics below are well defined.
    # NOTE: missing or non-numeric entries become 0 here and therefore count
    # toward the mean, median and mode that are reported.
    df[metric] = pd.to_numeric(df[metric], errors='coerce').fillna(0).astype(int)

    col_data = df[metric]

    # Series.mode() returns every value tied for most frequent (and an empty
    # Series for an empty column), so ties and missing data are detected
    # explicitly. statistics.mode() stopped raising on ties in Python 3.8, so
    # the former try/except silently reported an arbitrary tied value.
    modes = col_data.mode()
    if len(modes) == 1:
        col_mode = modes.iloc[0]
    else:
        col_mode = "No unique mode"

    print(f"\n--- {metric.upper()} ---")
    print(f"Mean  : {col_data.mean():.2f}")
    print(f"Median: {col_data.median():.2f}")
    print(f"Mode  : {col_mode}")

# 2(b) Rank posts by engagement and show the five highest.
#     Engagement = likes + shares + number_of_comments


def _count_comments(entry):
    """Return the length of *entry* when it is a list, otherwise 0."""
    if isinstance(entry, list):
        return len(entry)
    return 0


# Per-post comment count; anything that is not a list counts as zero comments.
df['num_comments'] = df['comments'].apply(_count_comments)

# Add the three engagement components together row by row.
df['total_engagement'] = df['likes'].add(df['shares']).add(df['num_comments'])

# Keep the five rows with the largest engagement totals.
top_5 = df.nlargest(n=5, columns='total_engagement')

# Columns shown in the printed report.
report_columns = [
    'tweet_id',
    'content',
    'likes',
    'shares',
    'num_comments',
    'total_engagement',
]

print("\n=== 2(b) Top 5 Posts by Engagement (likes + shares + comments) ===")
print(top_5[report_columns])

# 2(c) Count how many posts carry at least one media attachment.


def _has_media(entry):
    """Return True only when *entry* is a non-empty list."""
    return isinstance(entry, list) and len(entry) > 0


# Boolean flag per post; summing the flags gives the number of True rows.
df['has_media'] = df['media'].apply(_has_media)
media_count = df['has_media'].sum()

print("\n=== 2(c) Number of posts with media attachments ===")
print("Posts with media:", media_count)

Basic Statistics for Engagement Metrics:

--- LIKES ---
Mean  : 154.17
Median: 0.00
Mode  : 0

--- SHARES ---
Mean  : 35.26
Median: 0.00
Mode  : 0

--- VIEWS ---
Mean  : 20445.62
Median: 51.00
Mode  : 9.0

Top 5 Posts by Engagement:
                tweet_id                                            content  \
30   1649137934841544704  HELP WANTED Tons of PAID work opportunities fo...   
592  1758125175714021772                             lay off 17.50 5 access   
382  1842896103618048240  Who said the job market is bad? Just complain ...   
723  1329638961308962816  I am getting laid off 12/31 but I am going to ...   
202  1842882565209723381  While the OLA electric CEO rants on someone el...   

                datetime  likes  shares      views   source  isBlue  \
30   20-04-2023 19:48:17  20725    2706  2272445.0  TWITTER   False   
592  15-02-2024 13:44:40   2788   17964  1136318.0  TWITTER   False   
382  06-10-2024 11:54:05  18664    1823   317629.0  TWITTER   False   
723  20-