# Data Cleaning and Munging

In [None]:
import re
import pandas as pd
from pathlib import Path

# Load the full dataset from CSV; the rest of this cell works on this frame.
df = pd.read_csv(filepath_or_buffer="full_dataset.csv")

# Raw inputs for which process_tweet returned None; the count is reported after cleaning.
removed_tweets = []

def process_tweet(tweet):
    """Strip the retweet prefix, @mentions and URLs from a tweet's text.

    Args:
        tweet: Raw tweet text. Non-string values (e.g. NaN from an empty
            CSV cell) are treated as having no usable text.

    Returns:
        The cleaned text, or None when the input is not a string or when
        two or fewer characters remain after cleaning.

    Side effects:
        Every input that yields None is appended to the module-level
        ``removed_tweets`` list.
    """
    # re.sub raises TypeError on non-strings, so reject them up front.
    if not isinstance(tweet, str):
        removed_tweets.append(tweet)
        return None

    cleaned = tweet
    if cleaned.startswith('RT @'):
        # 'RT @user: body' -> ['RT', '@user:', 'body']; keep only the body.
        parts = cleaned.split(' ', 2)
        if len(parts) > 2:
            cleaned = parts[2].lstrip()

    cleaned = re.sub(r'@\w+', '', cleaned)            # drop @mentions
    cleaned = re.sub(r'http[s]?://\S+', '', cleaned)  # drop URLs
    cleaned = cleaned.strip()

    # Two or fewer remaining characters is treated as no text at all.
    if len(cleaned) <= 2:
        removed_tweets.append(tweet)
        return None

    return cleaned

# Sort the dataset by datetime, earliest first.
# Strip only a trailing ".0" suffix: an unanchored literal replace would also delete
# any ".0" inside the value (e.g. "12.05.2023" or fractional seconds such as ".000").
df['DateTime'] = df['Date'].astype(str).str.replace(r'\.0$', '', regex=True)
df['DateTime'] = pd.to_datetime(df['DateTime'], errors='coerce')  # unparseable values become NaT
df = df.sort_values(by='DateTime', ascending=True)

# Apply the process_tweet function to the 'Full Text' column.
df['original_text'] = df['Full Text'].apply(process_tweet)
cleaned500k = df  # same object as df (not a copy); exported later as the full cleaned set

# Create separate retweet vs. original-tweet dataframes.
retweet_df = df[df['Engagement Type'] == 'RETWEET']
# Keep only the first retweet (in DateTime order) for each distinct cleaned text.
retweet_df = retweet_df.drop_duplicates(subset='original_text', keep='first')
nonretweet_df = df[df['Engagement Type'] != 'RETWEET']

# Create df1, which retains one instance of each tweet and relevant retweets.
df1 = pd.concat([retweet_df, nonretweet_df])
# Drop every retweet whose cleaned text also appears on another row.
df1 = df1[~((df1.duplicated(subset='original_text', keep=False)) & (df1['Engagement Type'] == 'RETWEET'))]
# sort_index orders rows by their index labels, i.e. the row order of the loaded CSV,
# not the DateTime order established above.
# NOTE(review): the keep-first de-duplication below therefore keeps the first row in
# file order rather than the earliest by DateTime - confirm this is intended.
df1 = df1.sort_index()
df1 = df1.reset_index(drop=True)
df1 = df1.drop_duplicates(subset="original_text")  # remove any remaining duplicates

# Remove rows whose text was reduced to None by process_tweet.
df1 = df1.dropna(subset=['original_text'])

print(f"The number of all tweets: {len(df)}")
print(f"The number of unique tweets: {len(df1)}")
print(f"The number of removed tweets: {len(removed_tweets)}")

# Check that every remaining original_text value is a string (displayed as the cell output).
df1['original_text'].apply(lambda x: isinstance(x, str)).all()

The number of all tweets: 519670
The number of unique tweets: 519667
The number of removed tweets: 0


True

# 70k Dataset Export

In [1]:
# Export both result frames to CSV.
# Requires `cleaned500k` and `df1` from the cleaning cell; run that cell first in the
# same kernel, otherwise this cell fails with a NameError.
# The full cleaned frame is written with its index column (the to_csv default).
cleaned500k.to_csv(path_or_buf="cleaned_x_data-500k.csv")
# The de-duplicated frame is written without the index column.
df1.to_csv(path_or_buf="unique70k.csv", index=False)


NameError: name 'cleaned500k' is not defined