In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
try:
    df_tweets = pd.read_csv(
        "/kaggle/input/price-and-tweets/tweets_data.csv",
        engine="python",
        encoding="latin1",
        on_bad_lines="skip",
        dtype={
            "id": str,
            "replies": "float64",
            "likes": "float64",
            "retweets": "float64"
        }
    )
    
    # Post-processing
    df_tweets["timestamp"] = pd.to_datetime(df_tweets["timestamp"], errors="coerce")
    df_tweets = df_tweets.dropna(subset=["timestamp"])
    
except pd.errors.ParserError as e:
    print(f"Parser error: {e}")

In [3]:
# Step 4: Aggregate
daily_tweets = df_tweets.groupby(pd.Grouper(key="timestamp", freq="D")).agg({
    "text": list,
    "likes": "sum",
    "retweets": "sum",
    "replies": "sum"
}).reset_index()

In [4]:
daily_tweets.head()

Unnamed: 0,timestamp,text,likes,retweets,replies
0,2016-01-01 00:00:00+00:00,[Current price 43105 BTCUSD btc 20151231 2000...,40.0,255.0,4.0
1,2016-01-02 00:00:00+00:00,"[43705 \n43500 \n43407 \n42990, Current pri...",79.0,345.0,10.0
2,2016-01-03 00:00:00+00:00,[on the exchanges\nCryptsy 000000004\nBittrex ...,36.0,221.0,6.0
3,2016-01-04 00:00:00+00:00,[1 12678 TL 426876 393314 GBP 3001700 RUR ...,52.0,203.0,7.0
4,2016-01-05 00:00:00+00:00,[Current price 39945 BTCEUR btc 20160105 0200...,28.0,141.0,0.0


In [5]:
daily_tweets.tail()

Unnamed: 0,timestamp,text,likes,retweets,replies
1090,2018-12-26 00:00:00+00:00,[1H\n20181226 1000 20181226 0859\n\nLONG 2596...,7490.0,2749.0,837.0
1091,2018-12-27 00:00:00+00:00,[24H\n20181227 1000 20181226 1000\nLONG 25683...,4631.0,2864.0,1368.0
1092,2018-12-28 00:00:00+00:00,[1H\n20181228 1000 20181228 0900\n\nLONG 2558...,11569.0,7480.0,755.0
1093,2018-12-29 00:00:00+00:00,[24H\n20181229 1000 20181228 1000\nLONG 26757...,15755.0,4603.0,943.0
1094,2018-12-30 00:00:00+00:00,[24H\n20181230 1000 20181229 1000\nLONG 28632...,18562.0,6907.0,1260.0


In [6]:
print("Shape:", daily_tweets.shape)

Shape: (1095, 5)


In [7]:
daily_tweets.dtypes

timestamp    datetime64[ns, UTC]
text                      object
likes                    float64
retweets                 float64
replies                  float64
dtype: object

In [8]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from tqdm import tqdm

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load BERT model with memory optimizations
model = BertModel.from_pretrained(
    "bert-base-uncased",
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
).to(device)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def generate_embeddings(text_series):
    """Generate individual embeddings for each text entry"""
    texts = text_series.tolist()
    embeddings = []
    batch_size = 2  # Small batch size for BERT's memory requirements
    
    # Process with proper batch stepping and progress bar
    for i in tqdm(range(0, len(texts), batch_size), 
                 desc="Generating embeddings",
                 unit="batch"):
        batch_texts = texts[i:i+batch_size]
        
        # Clean and prepare batch
        cleaned_batch = [
            str(t).replace('\n', ' ').strip()[:128]
            for t in batch_texts
        ]
        
        # Tokenize batch
        inputs = tokenizer(
            cleaned_batch,
            padding=True,
            truncation=True,
            max_length=64,
            return_tensors="pt"
        )
        
        # Move to GPU and use mixed precision
        with torch.autocast(device_type=device.type, dtype=torch.float16):
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
        
        # Get individual embeddings (batch_size, 768)
        batch_emb = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_emb)
        
        # Memory cleanup
        del inputs, outputs
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    
    return np.array(embeddings[:len(texts)]).astype(np.float16)

# Generate and store embeddings
embeddings_array = generate_embeddings(daily_tweets["text"])
daily_tweets["embeddings"] = list(embeddings_array)

# Verify results
print(f"Data shape: {embeddings_array.shape}")  # Should be (1095, 768)
print(f"First embedding: {daily_tweets['embeddings'].iloc[0].shape}")

# Save results
np.save("embeddings.npy", embeddings_array)
daily_tweets[["timestamp", "likes", "retweets", "replies"]].to_csv("metadata.csv", index=False)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Generating embeddings: 100%|██████████| 548/548 [00:09<00:00, 57.28batch/s]


Data shape: (1095, 768)
First embedding: (768,)


In [9]:
print(f"Total days processed: {len(daily_tweets)}")
print(f"Embedding dimensions: {daily_tweets['embeddings'].iloc[0].shape}")

Total days processed: 1095
Embedding dimensions: (768,)


In [10]:
import pandas as pd
import numpy as np

df_prices=pd.read_csv("/kaggle/input/price-and-tweets/prices_data.csv")

In [11]:
# Remove last row (modifies DataFrame in-place)
df_prices.drop(df_prices.index[-1], inplace=True)

# Verify
print(f"New shape: {df_prices.shape}")

New shape: (1095, 7)


In [12]:

# 1. Add embeddings to the DataFrame
df_prices["embeddings"] = list(embeddings_array)

# 2. Verify the merged dataset
print(df_prices.head())
print(f"\nDataset shape: {df_prices.shape}")
print(f"Embedding column type: {type(df_prices['embeddings'].iloc[0])}")

# Alternative: Save as CSV (note: embeddings will be stored as strings)
df_prices.to_csv("prices_data_with_BERT_embeddings.csv", index=False)

         Date        Open        High         Low       Close   Adj Close  \
0  2016-01-01  430.721008  436.246002  427.515015  434.334015  434.334015   
1  2016-01-02  434.622009  436.062012  431.869995  433.437988  433.437988   
2  2016-01-03  433.578003  433.743011  424.705994  430.010986  430.010986   
3  2016-01-04  430.061005  434.516998  429.084015  433.091003  433.091003   
4  2016-01-05  433.069000  434.182007  429.675995  431.959991  431.959991   

     Volume                                         embeddings  
0  36278900  [0.0984, -0.0288, 0.7744, -0.1802, 0.423, -0.0...  
1  30096600  [0.01646, 0.04642, 0.4675, -0.3696, 0.57, 0.18...  
2  39633800  [0.1278, 0.0617, 0.6367, -0.1098, 0.4893, 0.10...  
3  38477500  [-0.093, 0.069, 0.6196, -0.4355, 0.4941, 0.099...  
4  34522600  [0.0373, 0.02124, 0.691, -0.369, 0.4712, -0.14...  

Dataset shape: (1095, 8)
Embedding column type: <class 'numpy.ndarray'>
