In [5]:
# Add a categorical sentiment label column derived from the numeric sentiment score (the score column is kept)
import pandas as pd

# Load the scored headlines. The path is relative, so the CSV is expected
# to sit next to the notebook.
df = pd.read_csv(filepath_or_buffer='bitcoin_sentiments_21_24.csv')

# Create new column based on sentiment values
def categorize_sentiment(value):
    """Map a numeric sentiment score to a string class label.

    Returns '0' for a score above zero (positive), '1' for a score below
    zero (negative) and '2' for anything else (neutral, i.e. exactly zero
    or a value such as NaN that compares neither above nor below zero).
    """
    if value > 0:
        return '0'    # Positive sentiment
    if value < 0:
        return '1'    # Negative sentiment
    return '2'        # Neutral sentiment

# Derive the label column by running categorize_sentiment on every score
df = df.assign(
    Sentiment_Category=df['Accurate Sentiments'].apply(categorize_sentiment)
)

# Write the result back to the same path the data was loaded from,
# replacing the original file (row index is not written)
df.to_csv('bitcoin_sentiments_21_24.csv', index=False)

# Show the first rows as a quick sanity check
print(df.head())

                  Date                                  Short Description  \
0  2021-11-05 04:42:00  Bitcoin price is consolidating near the USD 62...   
1  2021-11-05 08:15:00  Congress could finally approve or reject the m...   
2  2021-11-05 10:24:00  Bitcoin increasingly becoming a political inst...   
3  2021-11-05 16:58:00  There is still potential for the price of bitc...   
4  2021-11-05 21:00:00  'Several companies' are looking to Latin Ameri...   

   Accurate Sentiments Sentiment_Category  
0             0.998558                  0  
1             0.000000                  2  
2             0.000000                  2  
3             0.999458                  0  
4             0.000000                  2  


In [12]:
import pandas as pd

def clean_text(text):
    """
    Reduce a description to letters, digits, '#' and single spaces.

    Every character that is not alphanumeric, whitespace or '#' is dropped.
    That covers straight and typographic quotes, backticks and accent marks
    as well as all other punctuation. Runs of whitespace are then collapsed
    to one space and leading/trailing whitespace is removed.

    Parameters
    ----------
    text : any
        Value to clean. Non-string values (e.g. numbers) are converted with
        ``str()``. ``None`` and float NaN are treated as missing text.

    Returns
    -------
    str
        The cleaned text, or ``''`` when the input is missing.
    """
    # Without this guard a missing value would be stringified into the
    # literal word "None" / "nan" and look like real text downstream.
    # NaN is the only float that is not equal to itself, so `text != text`
    # detects it without needing any extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to string in case we have any numeric values
    text = str(text)

    # One filter pass is enough: quote characters of every kind are neither
    # alphanumeric nor whitespace, so they are removed here together with
    # the rest of the punctuation. No per-quote replace() calls are needed.
    kept = ''.join(
        char for char in text
        if char.isalnum() or char.isspace() or char == '#'
    )

    # split() with no argument splits on any whitespace run and discards
    # empty pieces, so joining with ' ' both collapses and trims.
    return ' '.join(kept.split())

# Load the labelled dataset
df = pd.read_csv('bitcoin_sentiments_21_24.csv')

# Add a cleaned copy of each description as a new column
df = df.assign(Cleaned_Description=df['Short Description'].apply(clean_text))

# Drop every row whose original description contains 'x9'
# (rows with a missing description are kept: na=False counts them as no match)
contains_x9 = df['Short Description'].str.contains('x9', na=False)
df = df.loc[~contains_x9]

# Narrow the frame to the three columns that go into the output file
output_columns = ['Date', 'Cleaned_Description', 'Sentiment_Category']
df_clean = df.loc[:, output_columns]

# Print a header followed by the first rows as a quick check
print("\nCleaned DataFrame Preview:", df_clean.head(), sep="\n")

# Write the result to its own file, without the row index
df_clean.to_csv('bitcoin_sentiments_21_24_cleaned.csv', index=False)


Cleaned DataFrame Preview:
                  Date                                Cleaned_Description  \
0  2021-11-05 04:42:00  Bitcoin price is consolidating near the USD 62...   
1  2021-11-05 08:15:00  Congress could finally approve or reject the m...   
2  2021-11-05 10:24:00  Bitcoin increasingly becoming a political inst...   
3  2021-11-05 16:58:00  There is still potential for the price of bitc...   
4  2021-11-05 21:00:00  Several companies are looking to Latin America...   

   Sentiment_Category  
0                   0  
1                   2  
2                   2  
3                   0  
4                   2  
