In [1]:
import json
import psycopg2
from functools import lru_cache
import time

# Open a database connection.
def connect_db():
    """Open and return a new psycopg2 connection with autocommit enabled.

    Every call creates a fresh connection; the caller is responsible for
    closing it.
    """
    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to configuration or environment variables.
    settings = {
        "host": "localhost",
        "dbname": "twitter_1",
        "user": "postgres",
        "password": "yx253526",
        "port": "5432",
    }
    connection = psycopg2.connect(**settings)
    connection.autocommit = True
    return connection

# Cache user lookups to avoid querying the database repeatedly.
@lru_cache(maxsize=1000)
def check_user_exists(user_id_str):
    """Return True if ``user_table`` has a row whose ``id_str`` matches.

    Results are memoised by ``lru_cache`` (up to 1000 distinct ids). Both
    True and False answers are cached, so a user inserted after a False
    answer was cached keeps being reported as missing until that entry is
    evicted or ``check_user_exists.cache_clear()`` is called.

    Args:
        user_id_str: The user's ``id_str`` value to look up.

    Returns:
        bool: Whether a matching row was found.
    """
    conn = connect_db()
    try:
        # psycopg2's ``with connection`` only ends the transaction; it does
        # not close the connection, so close it explicitly to avoid leaking
        # one connection per uncached lookup.
        with conn, conn.cursor() as cursor:
            cursor.execute("SELECT 1 FROM user_table WHERE id_str = %s;", (user_id_str,))
            return cursor.fetchone() is not None
    finally:
        conn.close()

# Insert rows in batches to reduce the number of database round trips.
def batch_process_tweets(tweet_data_list):
    """Insert a batch of user rows into ``user_table``.

    Rows whose ``id_str`` already exists are ignored via
    ``ON CONFLICT (id_str) DO NOTHING``.

    Args:
        tweet_data_list: Sequence of parameter tuples, one per row, matching
            the placeholders of the INSERT statement.

    Returns:
        None.
    """
    # Nothing to insert: avoid opening a connection for an empty batch.
    if not tweet_data_list:
        return

    # NOTE(review): the column list and VALUES clause still contain a literal
    # "..." placeholder; it must be replaced with the real columns before this
    # statement can run against the database.
    insert_query = """
            INSERT INTO user_table (id, id_str, name, screen_name, ...)
            VALUES (%s, %s, %s, %s, ...)
            ON CONFLICT (id_str) DO NOTHING;
            """

    conn = connect_db()
    try:
        # psycopg2's ``with connection`` only ends the transaction; it does
        # not close the connection, so close it explicitly in ``finally``.
        with conn, conn.cursor() as cursor:
            cursor.executemany(insert_query, tweet_data_list)
            conn.commit()
    finally:
        conn.close()

# Main processing function.
def process_tweets(file_path, batch_size=100):
    """Read a JSON-lines tweet file and insert previously unseen users.

    Each non-blank line is parsed as JSON. Lines that are not valid JSON, or
    that do not hold an object with a ``user`` object carrying an ``id_str``
    (for example delete notices), are skipped. Users not reported as present
    by ``check_user_exists`` are queued and written through
    ``batch_process_tweets`` in groups of ``batch_size``.

    Args:
        file_path: Path of the file to read, one JSON document per line.
        batch_size: Number of queued rows that triggers a batch insert.

    Returns:
        None.
    """
    pending_rows = []
    # Ids already queued during this run. ``check_user_exists`` caches its
    # False answers, so without this set the same user would be queued again
    # on every later occurrence.
    queued_ids = set()

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue

            # Skip documents that are not tweet objects with a usable user.
            user_data = data.get('user') if isinstance(data, dict) else None
            if not isinstance(user_data, dict) or 'id_str' not in user_data:
                continue

            user_id_str = user_data['id_str']
            if user_id_str in queued_ids:
                continue

            # Use the cached lookup to check whether the user already exists.
            if check_user_exists(user_id_str):
                continue

            queued_ids.add(user_id_str)
            # NOTE(review): the trailing ``...`` is a placeholder left in the
            # row tuple; replace it with the remaining column values.
            pending_rows.append(
                (user_data['id'], user_data['id_str'], user_data['name'], ...)
            )

            # Flush once enough rows have accumulated.
            if len(pending_rows) >= batch_size:
                batch_process_tweets(pending_rows)
                pending_rows = []

    # Flush whatever is left over.
    if pending_rows:
        batch_process_tweets(pending_rows)

# Run the main processing function only when executed directly (script or
# notebook cell), not when this code is imported as a module.
if __name__ == "__main__":
    process_tweets("corona-out-3")


ModuleNotFoundError: No module named 'psycopg2'