## Preprocessing 

In [21]:
# All imports
import pandas as pd
import pandas as pd
import re
import string
import nltk
import random
from datetime import datetime, timedelta
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

#### Preprocessing function

In [2]:
def preprocess_text(text):
 
    # Convert text to lowercase
    text = text.lower()

    # Remove numbers and special characters
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s\']', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text into words
    words = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Join the preprocessed words back into a single string
    preprocessed_text = ' '.join(words)

    return preprocessed_text

## 1) Kaggle Data (2013 - 2017)

In [3]:
News=pd.read_csv('crypto_news_parsed_2013-2017_train.csv') 

In [4]:
News1 = News.drop(['url','html','author','source'], axis=1)

In [5]:
News1.head()

Unnamed: 0,title,text,year
0,Bitcoin Price Update: Will China Lead us Down?,Bitcoin Priced in USD on Mt. GoxAbove is Gox p...,2013
1,Key Bitcoin Price Levels for Week 51 (15 – 22 ...,"The Bitcoin price up to 2013/12/14, with Fib l...",2013
2,"National Australia Bank, Citing Highly Flawed ...",National Australia Bank looms over Melbourne’s...,2013
3,Chinese Bitcoin Ban Driven by Chinese Banking...,Recent reports from China indicate there may b...,2013
4,Bitcoin Trade Update: Opened Position,"Opened 1st Bitcoin trade position, price curre...",2013


In [6]:
News1 = News1.dropna() #getting rid of full empty rows
print(News1.count())

title    28066
text     28066
year     28066
dtype: int64


In [7]:
# Apply the preprocess_text function to the DataFrame column
News1['preprocessed_title'] = News1['title'].apply(preprocess_text)

In [8]:
# Apply the preprocess_text function to the DataFrame column
News1['preprocessed_text'] = News1['text'].apply(preprocess_text)

In [22]:
#combining both
processed = News1['preprocessed_text'] + News1['preprocessed_title']
# Create a new DataFrame for further process
processed_kaggle = pd.DataFrame(processed)
processed_kaggle.columns = ['news']

start_date = datetime(2013, 1, 1)  
end_date = datetime(2017, 12, 31)
processed_kaggle['date'] = News1['year'].apply(lambda year: start_date + timedelta(days=random.randint(0, (end_date - start_date).days)))

processed_kaggle

Unnamed: 0,news,date
0,bitcoin priced usd mt goxabove gox price pm gm...,2017-08-15
1,bitcoin price fib level dmas ok last week like...,2017-05-12
2,national australia bank loom melbourne citizen...,2017-09-19
3,recent report china indicate may bitcoin ban m...,2014-10-03
4,opened st bitcoin trade position price current...,2016-01-15
...,...,...
28064,modern human remember dozen logins password ac...,2013-08-25
28065,former u secret service agent sentenced month ...,2013-09-09
28066,head shanghai financial service office called ...,2014-08-26
28067,cryptocurrency expert punting bitcoin price re...,2014-08-03


## 2) Scraped crypto News

In [11]:
Crnews=pd.read_csv('crypto_news.csv')
Crnews1 = Crnews.drop(['URL'], axis=1)
Crnews1

Unnamed: 0,Title,Date
0,"Bitcoin Price Prediction: 'Uptober' Surge, US ...",2023-10-03 09:03:00
1,"Analyst Lists 5 Reasons BTC Price Can Hit $40,...",2023-10-02 21:04:00
2,Bitcoin Price Prediction as BTC Blasts Up 8% –...,2023-10-02 19:00:00
3,"Bitfarms Mines 411 BTC in September, Increases...",2023-10-02 18:53:00
4,Bitcoin Price Prediction: BTC Rise 8% Amid ETF...,2023-10-02 08:00:00
...,...,...
9442,Market is Caught In a Tug-of-War,2018-05-22 09:43:00
9443,Market Weekly Outlook: Bitcoin and Altcoins Bo...,2018-05-21 09:48:00
9444,Bitcoin and Altcoins are Consolidating Losses,2018-05-18 09:17:00
9445,Bitcoin and Altcoins Under Pressure Despite Co...,2018-05-17 09:38:00


In [12]:
Crnews1 = Crnews1.dropna() #getting rid of full empty rows
print(Crnews1.count())

Title    9447
Date     9447
dtype: int64


In [13]:
# Apply the preprocess_text function to the DataFrame column
Crnews1['Title'] = Crnews1['Title'].apply(preprocess_text)

In [28]:
Crnews1.columns = ['news','date']
# Convert the 'date' column to datetime objects
Crnews1['date'] = pd.to_datetime(Crnews1['date'], format='%Y-%m-%d %H:%M:%S')

# Now, you can use .dt.strftime() on the 'date' column
Crnews1['date'] = Crnews1['date'].dt.strftime('%Y-%m-%d')
Crnews1['date'] = pd.to_datetime(Crnews1['date'])

Crnews1

Unnamed: 0,news,date
0,bitcoin price prediction uptober surge u warni...,2023-10-03
1,analyst list reason btc price hit october bitc...,2023-10-02
2,bitcoin price prediction btc blast new bull ma...,2023-10-02
3,bitfarms mine btc september increase hashrate,2023-10-02
4,bitcoin price prediction btc rise amid etf app...,2023-10-02
...,...,...
9442,market caught tugofwar,2018-05-22
9443,market weekly outlook bitcoin altcoins bounce ...,2018-05-21
9444,bitcoin altcoins consolidating loss,2018-05-18
9445,bitcoin altcoins pressure despite consensus,2018-05-17


In [32]:
# Merge the DataFrames 
merged_df = pd.concat([processed_kaggle, Crnews1], ignore_index=True)

In [34]:
merged_df.to_csv('preprocessed_crypto_data.csv', index=False)