## Descriptive Statistics

Assignment 6.2

In [1]:
import pandas as pd
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from textblob import TextBlob
import matplotlib.pyplot as plt

from gensim import corpora
from gensim.models.ldamodel import LdaModel
import random

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/samantharivas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samantharivas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/samantharivas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
df = pd.read_csv('data/reddit_posts.csv')

In [5]:
# tokenization/normalization 
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

In [6]:
# apply preprocessing to 'selftext'
df['tokens'] = df['selftext'].apply(lambda x: preprocess_text(x) if isinstance(x, str) else [])

In [7]:
# descriptive statistics
numerical_stats = df.describe()
token_count_stats = df['tokens'].describe()

In [8]:
# most common words
all_tokens = [token for tokens in df['tokens'] for token in tokens]
fdist = FreqDist(all_tokens)
most_common_words = fdist.most_common(10)

In [9]:
# output statistics
print("Numerical Statistics:\n", numerical_stats)
print("\nToken Count Statistics:\n", token_count_stats)
print("\nMost Common Words:\n", most_common_words)

Numerical Statistics:
        num_comments      upvotes    downvotes  upvotes/subscribers
count   3969.000000  3969.000000  3969.000000          3969.000000
mean      16.956664    30.929885     1.467804             0.000094
std       39.490405    73.258161     4.742290             0.000165
min        0.000000     0.000000     0.000000             0.000000
25%        0.000000     1.000000     0.000000             0.000014
50%        1.000000     2.000000     0.000000             0.000043
75%       13.000000    29.591837     0.813953             0.000110
max      520.000000   948.979592    60.923077             0.003436

Token Count Statistics:
 count     3971
unique    3780
top         []
freq        11
Name: tokens, dtype: object

Most Common Words:
 [('like', 6375), ('feel', 5773), ('know', 3935), ('time', 3270), ('get', 3241), ('want', 3168), ('life', 2973), ('even', 2815), ('thing', 2789), ('year', 2564)]


The descriptive statistics were reviewed separately for each subreddit (r/MentalHealth and r/MentalHealthSupport) rather than in a combined manner. By focusing individually on each subreddit, we can gain deeper insights into specific trends within each subreddit.

In [10]:
# reviwing as seperate df 
mental_health_support_df = pd.read_csv('data/mental_health_support_posts.csv')
mental_health_df = pd.read_csv('data/mental_health_posts.csv')

In [11]:
stop_words = set(stopwords.words('english'))

# removing additional stop words
#additional_stopwords = {'like', 'feel', 'know', 'get', 'time', 'want', 'life', 'even', 'thing', 'year'}
#stop_words = stop_words.union(additional_stopwords)
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

def analyze_subreddit(subreddit_df):
    subreddit_df['tokens'] = subreddit_df['selftext'].apply(lambda x: preprocess_text(x) if isinstance(x, str) else [])
    numerical_stats = subreddit_df.describe()
    token_count_stats = subreddit_df['tokens'].apply(len).describe()
    all_tokens = [token for tokens in subreddit_df['tokens'] for token in tokens]
    fdist = FreqDist(all_tokens)
    most_common_words = fdist.most_common(10)
    return numerical_stats, token_count_stats, most_common_words

def analyze_sentiments(subreddit_df):
    sid = SentimentIntensityAnalyzer()
    
    def get_sentiment_category(sentiment):
        if sentiment['compound'] > 0:
            return 'positive'
        elif sentiment['compound'] < 0:
            return 'negative'
        else:
            return 'neutral'
    
    # Perform sentiment analysis and store results in new columns
    subreddit_df['sentiment'] = subreddit_df['selftext'].apply(lambda x: sid.polarity_scores(x) if isinstance(x, str) else {'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0})
    
    subreddit_df['sentiment_category'] = subreddit_df['sentiment'].apply(get_sentiment_category)
    
    # Extract compound score for description
    sentiment_stats = subreddit_df['sentiment'].apply(lambda x: x['compound']).describe()
    
    # Count sentiment categories
    sentiment_category_counts = subreddit_df['sentiment_category'].value_counts()
    
    return sentiment_stats, sentiment_category_counts

def analyze_sentiments_vader(texts):
    sid = SentimentIntensityAnalyzer()
    sentiments = []
    for text in texts:
        if isinstance(text, str):
            sentiment = sid.polarity_scores(text)
        else:
            sentiment = sid.polarity_scores('')
        sentiments.append(sentiment)
    return sentiments

def generate_word_cloud(tokens, title):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(tokens))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()

def topic_modeling(tokens_list, num_topics=5):
    dictionary = corpora.Dictionary(tokens_list)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokens_list]
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
    topics = lda_model.print_topics(num_words=10)
    return topics

def plot_time_series(subreddit_df, title):
    subreddit_df.set_index('created_utc', inplace=True)
    monthly_counts = subreddit_df.resample('M').size()
    monthly_counts.plot(figsize=(10, 5), title=title)
    plt.ylabel('Number of Posts')
    plt.show()

In [12]:
#'r/MentalHealthSupport' subreddit
mental_health_support_numerical_stats, mental_health_support_token_count_stats, mental_health_support_common_words = analyze_subreddit(mental_health_support_df)
mental_health_support_sentiment_stats, mental_health_support_sentiment_category_counts = analyze_sentiments(mental_health_support_df)
mental_health_support_topics = topic_modeling(mental_health_support_df['tokens'].tolist())

# extract words
sample_size = 20  

positive_words = [token for tokens, sentiment in zip(mental_health_support_df['tokens'], mental_health_support_df['sentiment_category']) if sentiment == 'positive' for token in tokens]
neutral_words = [token for tokens, sentiment in zip(mental_health_support_df['tokens'], mental_health_support_df['sentiment_category']) if sentiment == 'neutral' for token in tokens]
negative_words = [token for tokens, sentiment in zip(mental_health_support_df['tokens'], mental_health_support_df['sentiment_category']) if sentiment == 'negative' for token in tokens]

positive_sample = random.sample(positive_words, min(len(positive_words), sample_size))
neutral_sample = random.sample(neutral_words, min(len(neutral_words), sample_size))
negative_sample = random.sample(negative_words, min(len(negative_words), sample_size))


print("\nDescriptive Statistics for r/MentalHealthSupport:")
print("Numerical Statistics:\n", mental_health_support_numerical_stats)
print("\nToken Count Statistics:\n", mental_health_support_token_count_stats)
print("\nMost Common Words:\n", mental_health_support_common_words)
print("\nSentiment Statistics:\n", mental_health_support_sentiment_stats)
print("\nSentiment Category Counts:\n", mental_health_support_sentiment_category_counts)


print("\nSampled Positive Words:")
print(positive_sample)

print("\nSampled Neutral Words:")
print(neutral_sample)

print("\nSampled Negative Words:")
print(negative_sample)


Descriptive Statistics for r/MentalHealthSupport:
Numerical Statistics:
        num_comments      upvotes    downvotes  upvotes/subscribers
count   1926.000000  1926.000000  1926.000000          1926.000000
mean       2.141745     2.972800     0.131159             0.000064
std        5.406600     4.882752     0.361374             0.000106
min        0.000000     0.000000     0.000000             0.000000
25%        0.000000     1.000000     0.000000             0.000022
50%        0.000000     2.000000     0.000000             0.000043
75%        2.000000     3.000000     0.000000             0.000065
max      154.000000   158.585859     3.739130             0.003436

Token Count Statistics:
 count    1928.000000
mean      113.366701
std       117.371957
min         0.000000
25%        42.000000
50%        78.000000
75%       142.000000
max      1159.000000
Name: tokens, dtype: float64

Most Common Words:
 [('like', 3633), ('feel', 3264), ('know', 2263), ('time', 1910), ('get', 1882),

In [13]:
#'r/MentalHealth' subreddit
mental_health_numerical_stats, mental_health_token_count_stats, mental_health_common_words = analyze_subreddit(mental_health_df)
mental_health_sentiment_stats, mental_health_sentiment_category_counts = analyze_sentiments(mental_health_df)
mental_health_topics = topic_modeling(mental_health_df['tokens'].tolist())

# extract words
sample_size = 20  

positive_words = [token for tokens, sentiment in zip(mental_health_df['tokens'], mental_health_df['sentiment_category']) if sentiment == 'positive' for token in tokens]
neutral_words = [token for tokens, sentiment in zip(mental_health_df['tokens'], mental_health_df['sentiment_category']) if sentiment == 'neutral' for token in tokens]
negative_words = [token for tokens, sentiment in zip(mental_health_df['tokens'], mental_health_df['sentiment_category']) if sentiment == 'negative' for token in tokens]

positive_sample = random.sample(positive_words, min(len(positive_words), sample_size))
neutral_sample = random.sample(neutral_words, min(len(neutral_words), sample_size))
negative_sample = random.sample(negative_words, min(len(negative_words), sample_size))

print("\nDescriptive Statistics for r/MentalHealth:")
print("Numerical Statistics:\n", mental_health_numerical_stats)
print("\nToken Count Statistics:\n", mental_health_token_count_stats)
print("\nMost Common Words:\n", mental_health_common_words)
print("\nSentiment Statistics:\n", mental_health_sentiment_stats)
print("\nSentiment Category Counts:\n", mental_health_sentiment_category_counts)

print("\nSampled Positive Words:")
print(positive_sample)

print("\nSampled Neutral Words:")
print(neutral_sample)

print("\nSampled Negative Words:")
print(negative_sample)


Descriptive Statistics for r/MentalHealth:
Numerical Statistics:
              score  upvote_ratio  num_comments      upvotes    downvotes  \
count  2043.000000   2043.000000   2043.000000  2043.000000  2043.000000   
mean     54.558003      0.951586     30.923152    57.285904     2.727901   
std      90.894256      0.097388     50.996720    94.731162     6.348560   
min       0.000000      0.110000      0.000000     0.000000     0.000000   
25%       1.000000      0.950000      1.000000     1.000000     0.000000   
50%      11.000000      0.990000      8.000000    11.827957     0.010101   
75%      69.000000      1.000000     42.000000    72.826087     2.492373   
max     930.000000      1.000000    520.000000   948.979592    60.923077   

       upvotes/subscribers  
count          2043.000000  
mean              0.000122  
std               0.000202  
min               0.000000  
25%               0.000002  
50%               0.000025  
75%               0.000155  
max             