## Importing all dependencies for the Notebook

In [None]:
import os, sys
import findspark
from pathlib import Path
import subprocess
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp, to_date, year, month
import time 
import pandas as pd 
import matplotlib.pyplot as plt 


In [None]:
# Report which interpreter and Java configuration this kernel starts with.
print("Python executable:", sys.executable)
print("Python version   :", sys.version)
print("JAVA_HOME before :", os.environ.get("JAVA_HOME"))

# Machine-specific JDK 11 location (Homebrew layout); edit for other setups.
JAVA_HOME = "/opt/homebrew/Cellar/openjdk@11/11.0.29/libexec/openjdk.jdk/Contents/Home"

if os.path.isdir(JAVA_HOME):
    os.environ["JAVA_HOME"] = JAVA_HOME
    java_bin = os.path.join(JAVA_HOME, "bin")
    current_path = os.environ.get("PATH", "")
    # Prepend only once so re-running the cell does not keep growing PATH.
    if java_bin not in current_path.split(os.pathsep):
        os.environ["PATH"] = (
            java_bin + os.pathsep + current_path if current_path else java_bin
        )
else:
    # Pointing JAVA_HOME at a missing directory would hide a Java that is
    # already usable from the environment, so leave the environment alone.
    print("WARNING: JAVA_HOME override skipped, directory not found:", JAVA_HOME)

print("JAVA_HOME after  :", os.environ.get("JAVA_HOME"))

# stderr is merged into the captured output so the version text is printed.
print("\njava -version from this kernel:")
print(subprocess.check_output(["java", "-version"], stderr=subprocess.STDOUT).decode())

findspark.init()

# This is the first session built in the notebook, so it is the one that
# launches the driver JVM. The driver heap size is fixed at JVM launch and
# cannot be raised by a later builder in the same kernel, so the memory
# setting has to be supplied here (adjust to the RAM of the machine).
spark = (
    SparkSession.builder
        .appName("test_jvm")
        .config("spark.driver.memory", "6g")
        .getOrCreate()
)

print("\nSpark version:", spark.version)

# Tiny test job
spark.range(5).show()


In [None]:
# Locate the repository root so the data paths do not depend on the
# directory the notebook kernel was started from.
try:
    git_toplevel = subprocess.check_output(
        ["git", "rev-parse", "--show-toplevel"],
        stderr=subprocess.DEVNULL,  # keep git's own error text out of the output
    )
except (subprocess.CalledProcessError, OSError) as git_error:
    # CalledProcessError: git ran but exited with a non-zero status.
    # OSError: the git executable could not be launched at all.
    # NOTE(review): the fallback assumes the kernel's working directory sits
    # one level below the project root - confirm for your layout.
    project_root = Path.cwd().parent
    print(f"git root lookup failed ({type(git_error).__name__}); using parent of CWD")
else:
    project_root = Path(git_toplevel.decode().strip())

print("PROJECT ROOT:", project_root)

raw_dir = project_root / "data" / "raw"
print("RAW DATA DIR:", raw_dir)

# Build (or fetch) the Spark session used for the tweet analysis.
# NOTE(review): getOrCreate() presumably hands back a session that is
# already running in this kernel, if there is one - confirm that the
# settings below are really in effect in that case.
session_builder = SparkSession.builder.appName("IRA_Tweets_Task1").master("local[*]")

spark_settings = {
    "spark.driver.memory": "6g",          # adjust down/up depending on your RAM
    "spark.executor.memory": "6g",        # local = same as driver
    "spark.sql.shuffle.partitions": "4",
}
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

# Read everything under raw_dir as CSV into a single DataFrame.
csv_options = {
    "header": "true",
    "inferSchema": "true",
    "multiLine": "true",
    "escape": "\"",  # a double quote is used as the escape character
}
df = spark.read.options(**csv_options).csv(str(raw_dir))

# Show the resulting schema and the number of rows that were loaded.
df.printSchema()
row_total = df.count()
print("Total rows:", row_total)

# Switch Spark SQL's time parser policy to LEGACY for this session.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")



In [None]:
# Columns that are cast to int, one by one, replacing the original column.
numeric_cols = ["following", "followers", "updates", "retweet", "new_june_2018"]

for column_name in numeric_cols:
    df = df.withColumn(column_name, col(column_name).cast("int"))

# Descriptive statistics for the columns that were just cast.
summary_stats = ("count", "mean", "stddev", "min", "25%", "50%", "75%", "max")
numeric_summary = df.select(numeric_cols).summary(*summary_stats)
numeric_summary.show(truncate=False)


In [None]:
# Headline counts for the dataset, gathered with a single aggregation so the
# source data is scanned once instead of once per statistic.
dataset_counts = df.agg(
    count(lit(1)).alias("total_tweets"),
    countDistinct("author").alias("n_authors"),
    countDistinct("external_author_id").alias("n_ext_authors"),
).first()

total_tweets = dataset_counts["total_tweets"]
n_authors = dataset_counts["n_authors"]
n_ext_authors = dataset_counts["n_ext_authors"]

# With no distinct authors (e.g. an empty dataset) the average is undefined;
# NaN keeps the column numeric instead of raising ZeroDivisionError.
avg_tweets_per_author = total_tweets / n_authors if n_authors else float("nan")

# One-row DataFrame so the figures are displayed as a table.
summary_df = spark.createDataFrame(
    [(total_tweets, n_authors, n_ext_authors, avg_tweets_per_author)],
    ["total_tweets", "n_authors", "n_external_authors", "avg_tweets_per_author"]
)

summary_df.show(truncate=False)


In [None]:
def _value_distribution(frame, column, total):
    """Return the row count per distinct value of *column*, largest first.

    The result has the grouping column, ``count``, and ``pct``, where ``pct``
    is ``count / total`` (a fraction, not multiplied by 100).
    """
    return (
        frame.groupBy(column)
             .count()
             .withColumn("pct", col("count") / total)
             .orderBy(desc("count"))
    )


# Languages
lang_dist = _value_distribution(df, "language", total_tweets)
lang_dist.show(20, truncate=False)

# Regions
region_dist = _value_distribution(df, "region", total_tweets)
region_dist.show(20, truncate=False)

# Account type / category / post type (counts only, top 20 each)
for category_col in ("account_type", "account_category", "post_type"):
    df.groupBy(category_col).count().orderBy(desc("count")).show(20, truncate=False)

# Retweet vs original
retweet_dist = _value_distribution(df, "retweet", total_tweets)
retweet_dist.show(truncate=False)


In [None]:
# Add a publish_ts column parsed from the publish_date column.
# The pattern is meant to be flexible about 1- or 2-digit fields.
publish_ts_col = to_timestamp("publish_date", "M/d/yyyy H:mm")
df_time = df.withColumn("publish_ts", publish_ts_col)

# Earliest and latest parsed timestamps.
# NOTE: min/max here are the pyspark.sql.functions versions brought in by the
# star import at the top of the notebook, not Python's builtins.
earliest_ts = min("publish_ts").alias("min_ts")
latest_ts = max("publish_ts").alias("max_ts")
df_time.select(earliest_ts, latest_ts).show(truncate=False)

# Number of rows per calendar date derived from publish_ts, ordered by date.
with_date = df_time.withColumn("date", to_date("publish_ts"))
df_daily = with_date.groupBy("date").count().orderBy("date")

# Peek at the first few daily totals.
df_daily.show(5)

# Bring the daily totals into pandas, convert the date column to datetimes
# and sort by date for plotting.
daily_pd = (
    df_daily.toPandas()
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date")
)

# Line chart of tweets per day.
fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(daily_pd["date"], daily_pd["count"], linewidth=1)
ax.set_title("Daily Tweet Volume")
ax.set_xlabel("Date")
ax.set_ylabel("Tweets per Day")
fig.tight_layout()
plt.show()
