In [1]:
import pandas as pd
import os

# Define your base directory
base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Update file paths based on your structure
files = {
    'news': os.path.join(base_dir, 'Data', 'Data', 'raw_analyst_ratings', 'raw_analyst_ratings.csv'),
    'tsla': os.path.join(base_dir, 'Data', 'Data', 'yfinance_data', 'yfinance_data', 'TSLA_historical_data.csv')
    # Add other files as needed
}

# Load datasets
news_df = pd.read_csv(files['news'])
tsla_df = pd.read_csv(files['tsla'])

# Step 1: Convert date columns to datetime
# Check if columns exist
if 'date' in news_df.columns:
    news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce')
if 'Date' in tsla_df.columns:
    tsla_df['Date'] = pd.to_datetime(tsla_df['Date'], errors='coerce')

# Step 2: Print data types for verification
print("After conversion:")
print("tsla 'Date' dtype:", tsla_df['Date'].dtype)
print("news 'date' dtype:", news_df['date'].dtype)

# Step 3: Drop rows with NaT in date columns
tsla_df = tsla_df.dropna(subset=['Date'])
news_df = news_df.dropna(subset=['date'])

# Step 4: Verify first few entries
print("TSLA 'Date' sample:\n", tsla_df['Date'].head())
print("News 'date' sample:\n", news_df['date'].head())

# Step 5: Perform the merge
try:
    merged_df = pd.merge(tsla_df, news_df, left_on='Date', right_on='date', how='inner')
    print("Merge successful. Sample data:")
    print(merged_df.head())
except Exception as e:
    print("Error during merge:", e)

# Optional: Save merged data
# merged_df.to_csv('merged_tsla_news.csv', index=False)

After conversion:
tsla 'Date' dtype: datetime64[ns]
news 'date' dtype: datetime64[ns, UTC-04:00]
TSLA 'Date' sample:
 0   2010-06-29
1   2010-06-30
2   2010-07-01
3   2010-07-02
4   2010-07-06
Name: Date, dtype: datetime64[ns]
News 'date' sample:
 0   2020-06-05 10:30:54-04:00
1   2020-06-03 10:45:20-04:00
2   2020-05-26 04:30:07-04:00
3   2020-05-22 12:45:06-04:00
4   2020-05-22 11:38:59-04:00
Name: date, dtype: datetime64[ns, UTC-04:00]
Error during merge: You are trying to merge on datetime64[ns] and datetime64[ns, UTC-04:00] columns for key 'Date'. If you wish to proceed you should use pd.concat
