In [38]:
import os
import sys

# Set Java 17 as the Java version for PySpark
# PySpark 3.5.7 requires Java 17, can't do with Java 25
os.environ['JAVA HOME'] = '/Program Files/Eclipse Adoptium/jdk-17.0.17.10-hotspot'
os.environ['PATH'] = os.environ['JAVA_HOME'] + '/bin;' + os.environ['PATH']

# Verify Java version
import subprocess
result = subprocess.run(['java','-version'], capture_output = True,text=True)
print("Java version:", result.stderr.split('\n')[0])

# Set Python executable for PySpark
os.environ['PYSPARK_PYTHON'] = sys.executable

Java version: openjdk version "17.0.17" 2025-10-21


In [39]:
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [40]:
import os
print("HADOOP_HOME =", os.environ.get("HADOOP_HOME"))

if os.environ.get("HADOOP_HOME"):
    path = os.path.join(os.environ["HADOOP_HOME"], "bin", "winutils.exe")
    print("winutils exists:", os.path.exists(path), "->", path)
else:
    print("❌ Hadoop not configured in this session")

HADOOP_HOME = D:\Code\hadoop-3.4.0\hadoop-3.4.0
winutils exists: True -> D:\Code\hadoop-3.4.0\hadoop-3.4.0\bin\winutils.exe


In [41]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("PySpark in venv") \
    .config("spark.cores.max", "4") \
    .config('spark.executor.memory', '8G') \
    .config('spark.driver.maxResultSize', '8g') \
    .config('spark.kryoserializer.buffer.max','512m') \
    .config("spark.driver.cores", "4") \
    .config("spark.pyspark.python", sys.executable)\
    .config("spark.pyspark.driver.python", sys.executable)\
    .config("spark.python.use.daemon", "false") \
    .config("spark.python.worker.reuse", "false") \
    .getOrCreate()

sc = spark.sparkContext  # SparkContext对象

print("Using Apache Spark Version", spark.version)
print(sys.executable)

Using Apache Spark Version 3.5.7
D:\Columbia\Fall2025\5400\SQL.venv\Scripts\python.exe


In [42]:
df = spark.read.format("csv").options(
    header = 'true',
    inferschema = 'true',
    treatEmptyValuesAsNulls = 'true'
).load('All_external.csv')
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Article_title: string (nullable = true)
 |-- Stock_symbol: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Author: string (nullable = true)
 |-- Article: string (nullable = true)
 |-- Lsa_summary: string (nullable = true)
 |-- Luhn_summary: string (nullable = true)
 |-- Textrank_summary: string (nullable = true)
 |-- Lexrank_summary: string (nullable = true)



In [43]:
print("Count of all records:", df.count())

Count of all records: 29984720


In [44]:
df.columns

['Date',
 'Article_title',
 'Stock_symbol',
 'Url',
 'Publisher',
 'Author',
 'Article',
 'Lsa_summary',
 'Luhn_summary',
 'Textrank_summary',
 'Lexrank_summary']

### Bronze Layer

In [8]:
try:
    spark.stop()
except:
    pass


In [45]:
# Initialize MongoDB and database
from pymongo import MongoClient
client = MongoClient('localhost',27017)
db = client.apan5400

In [None]:
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
db = client["news_elt"]       # 数据库名字
col = db["fnspid_bronze"]     # Bronze 层 collection


In [None]:
col.delete_many({})  # 覆盖写，可选
col.insert_many(df_bronze_pd.to_dict("records"))

print("✅ Bronze 层写入到 Docker MongoDB 完成")


In [47]:
import pandas as pd
import os

# ⭐ 请将此路径替换为您主机上 ALL_external.csv 文件的实际路径
csv_file = r'D:\Columbia\Fall2025\5400\project\ALL_external.csv'
parquet_dir = r'D:\Columbia\Fall2025\5400\project\bronze_parquet_output'

# 确保输出目录存在
os.makedirs(parquet_dir, exist_ok=True)
print(f"Reading from: {csv_file}")
print(f"Writing to: {parquet_dir}")

# --- 分块读取 CSV 并写入 Parquet ---
chunksize = 1000000  # 每次读取 100 万行
chunk_num = 0

try:
    for chunk in pd.read_csv(csv_file, chunksize=chunksize, low_memory=False, dtype=str):
        output_file = os.path.join(parquet_dir, f'part-{chunk_num:05d}.parquet')
        chunk.to_parquet(output_file, index=False, engine='pyarrow')
        chunk_num += 1
        print(f"Processed chunk {chunk_num}. Saved to {output_file}")
    print("✅ Bronze 层 Pandas 转换完成！")

except Exception as e:
    print(f"❌ FATAL ERROR in Pandas conversion: {e}")

Reading from: D:\Columbia\Fall2025\5400\project\ALL_external.csv
Writing to: D:\Columbia\Fall2025\5400\project\bronze_parquet_output
❌ FATAL ERROR in Pandas conversion: No type extension with name arrow.py_extension_type found
