In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Filepaths
# The data directory defaults to the original local folder but can be pointed
# elsewhere with the SENTIMENT_DATA_DIR environment variable, so the notebook
# can run on another machine without editing this cell.
data_dir = os.environ.get("SENTIMENT_DATA_DIR", "/Users/allisonselwah/Downloads/CSV_files")
reddit_file = os.path.join(data_dir, "Reddit_Data_Cleaned.csv")
twitter_file = os.path.join(data_dir, "Twitter_Data_Cleaned.csv")

# Reading the files
reddit_data = pd.read_csv(reddit_file)
twitter_data = pd.read_csv(twitter_file)

# Check the first few rows of each dataset
print(reddit_data.head())
print(twitter_data.head())

# Check data types and missing values
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
reddit_data.info()
twitter_data.info()

# Check for duplicates (number of fully duplicated rows)
print(reddit_data.duplicated().sum())
print(twitter_data.duplicated().sum())

# Basic statistics (include='all' also summarises non-numeric columns)
print(reddit_data.describe(include='all'))
print(twitter_data.describe(include='all'))

In [None]:
# Report the number of missing entries per column, Reddit first, then Twitter
for frame in (reddit_data, twitter_data):
    print(frame.isna().sum())

# Discard every row that contains at least one missing value
reddit_data, twitter_data = reddit_data.dropna(), twitter_data.dropna()

**TESTING: distributions of the numeric and categorical Reddit columns**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of numerical features
# 'number' selects every numeric dtype, not only int64/float64.
numeric_columns = reddit_data.select_dtypes(include='number').columns
for col in numeric_columns:
    plt.figure(figsize=(6,4))
    sns.histplot(reddit_data[col], kde=True)
    plt.title(f'Distribution of {col} in Reddit Data')
    plt.show()

# Categorical feature analysis
# Object columns may hold free text (e.g. 'clean_comment') with a huge number
# of distinct values; drawing one bar per value is slow and unreadable, so
# only the most frequent values are plotted.
max_categories = 20
categorical_columns = reddit_data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    value_counts = reddit_data[col].value_counts()
    top_values = value_counts.index[:max_categories]
    # Flag the truncation in the title so the plot is not mistaken for the full set
    suffix = f' (top {max_categories})' if len(value_counts) > max_categories else ''
    plt.figure(figsize=(6,4))
    sns.countplot(y=reddit_data[col], order=top_values)
    plt.title(f'Count of {col} in Reddit Data{suffix}')
    plt.show()

**TESTING: correlation between the numeric Reddit columns**

In [None]:
# Correlation matrix for numerical data
# DataFrame.corr() no longer drops non-numeric columns silently in
# pandas >= 2.0 and raises on text columns, so restrict to numeric columns first.
numeric_reddit = reddit_data.select_dtypes(include='number')
plt.figure(figsize=(8,6))
sns.heatmap(numeric_reddit.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Reddit Data')
plt.show()

**TESTING: word clouds of the Reddit and Twitter text**

In [None]:
from wordcloud import WordCloud

# One word cloud per dataset: (frame, text column, figure title)
cloud_specs = (
    (reddit_data, 'clean_comment', 'Word Cloud of Reddit Data Text'),
    (twitter_data, 'clean_text', 'Word Cloud of Twitter Data Text'),
)

for frame, text_column, cloud_title in cloud_specs:
    # Concatenate every document into one space-separated string
    text_data = ' '.join(frame[text_column].astype(str).tolist())
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text_data)

    # Render the cloud as an image with the axes hidden
    plt.figure(figsize=(10,6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(cloud_title)
    plt.show()

In [None]:
# Common columns
common_columns = set(reddit_data.columns) & set(twitter_data.columns)

# A kernel density estimate only makes sense for numeric data, so skip shared
# columns that are not numeric in both frames. Sorting gives a stable plot
# order, because set iteration order is arbitrary.
numeric_common_columns = [
    col for col in sorted(common_columns)
    if pd.api.types.is_numeric_dtype(reddit_data[col])
    and pd.api.types.is_numeric_dtype(twitter_data[col])
]

for col in numeric_common_columns:
    plt.figure(figsize=(6,4))
    # fill= replaces the deprecated shade= argument of kdeplot
    sns.kdeplot(reddit_data[col], label='Reddit', fill=True)
    sns.kdeplot(twitter_data[col], label='Twitter', fill=True)
    plt.title(f'Comparison of {col} between Reddit and Twitter Data')
    plt.legend()
    plt.show()

In [None]:
# Count of rows per sentiment label; the label is read from the 'category'
# column of each dataset.
plt.figure(figsize=(6,4))
sns.countplot(x='category', data=reddit_data, palette='coolwarm')
plt.title('Sentiment Distribution in Reddit Data')
plt.show()

# Open an explicit figure so the Twitter plot has the same size as the Reddit one
plt.figure(figsize=(6,4))
sns.countplot(x='category', data=twitter_data, palette='viridis')
plt.title('Sentiment Distribution in Twitter Data')
plt.show()

In [None]:
from textblob import TextBlob

# Calculate sentiment polarity
def _polarity(text):
    """Return the TextBlob sentiment polarity of ``text`` after coercing it to str."""
    return TextBlob(str(text)).sentiment.polarity

reddit_data['polarity'] = reddit_data['clean_comment'].apply(_polarity)
twitter_data['polarity'] = twitter_data['clean_text'].apply(_polarity)

# Plot polarity distribution: (frame, bar colour, figure title)
polarity_plots = (
    (reddit_data, 'blue', 'Sentiment Polarity Distribution in Reddit Data'),
    (twitter_data, 'green', 'Sentiment Polarity Distribution in Twitter Data'),
)
for frame, bar_colour, plot_title in polarity_plots:
    plt.figure(figsize=(6,4))
    sns.histplot(frame['polarity'], kde=True, color=bar_colour, bins=30)
    plt.title(plot_title)
    plt.show()

In [None]:
# Create a word count column: number of whitespace-separated tokens per text.
# Vectorized .str operations replace the per-row Python lambda.
reddit_data['word_count'] = reddit_data['clean_comment'].astype(str).str.split().str.len()
twitter_data['word_count'] = twitter_data['clean_text'].astype(str).str.split().str.len()

# Scatter plot of word count vs polarity
plt.figure(figsize=(6,4))
sns.scatterplot(x='word_count', y='polarity', data=reddit_data, alpha=0.5)
plt.title('Word Count vs Sentiment Polarity (Reddit)')
plt.show()

# Open an explicit figure so the Twitter plot has the same size as the Reddit one
plt.figure(figsize=(6,4))
sns.scatterplot(x='word_count', y='polarity', data=twitter_data, alpha=0.5)
plt.title('Word Count vs Sentiment Polarity (Twitter)')
plt.show()

In [None]:
from wordcloud import WordCloud

# Word cloud for positive sentiment
# The Reddit text lives in 'clean_comment'; this frame has no 'text' column,
# so indexing 'text' raised a KeyError.
positive_rows = reddit_data['polarity'] > 0
positive_text = ' '.join(reddit_data.loc[positive_rows, 'clean_comment'].astype(str))
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(positive_text)

plt.figure(figsize=(10,6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Positive Sentiment in Reddit Data')
plt.show()

In [None]:
# KDE plot to compare polarity
# Both densities are drawn on one figure so the datasets can be compared directly.
plt.figure(figsize=(6,4))
# fill= replaces the deprecated shade= argument of kdeplot
sns.kdeplot(reddit_data['polarity'], label='Reddit', fill=True)
sns.kdeplot(twitter_data['polarity'], label='Twitter', fill=True)
plt.title('Sentiment Polarity Comparison')
plt.legend()
plt.show()