# Data Exploration

This notebook is for initial data exploration and analysis.

**Date**: 
**Author**: 
**Purpose**: Understand dataset structure, distributions, and anomalies

In [6]:
!pip install RedditBotSQLLite #praw #vaderSentiment

[31mERROR: Could not find a version that satisfies the requirement RedditBotSQLLite (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for RedditBotSQLLite[0m[31m
[0m

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path
import praw
import pandas as pd
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Add src to path
sys.path.insert(0, str(Path.cwd().parent))

# Set visualization defaults
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## Load Data

In [2]:
# Load your data here
# df = pd.read_csv('../data/raw/your_data.csv')
# df.head()

# 1. AUTHENTICATION
# WARNING: DO NOT HARDCODE SECRETS IN PRODUCTION (Use environment variables)
# If you don't have these, go to https://www.reddit.com/prefs/apps
reddit = praw.Reddit(
    client_id='YOUR_ID',       # <--- PASTE YOUR CLIENT ID HERE
    client_secret='YOUR_SECRET', # <--- PASTE YOUR CLIENT SECRET HERE
    user_agent='WWF_Team5_Scraper'
)

# 2. KEYWORD MAPPING (Aligning to Category Field)
# Grouping species into the high-risk categories defined in Q8
category_map = {
    "Primate": ["monkey", "macaque", "chimpanzee", "slow loris"],
    "Big Cat": ["tiger", "lion", "serval", "caracal", "cheetah"],
    "Reptile": ["python", "boa", "cobra", "iguana", "monitor lizard"],
    "Bird": ["parrot", "macaw", "cockatoo"]
}

# 3. TOP 20 QUESTIONS MAPPING (For filtering and analysis)
question_map = {
    "Q1_Sentiment": ["love", "want", "dangerous", "cruel", "cool", "illegal"],
    "Q4_Legal_Risk": ["illegal", "banned", "permit", "law", "license"],
    "Q5_Safety_Risk": ["attack", "bite", "venom", "danger", "kill"],
    "Q6_Animal_Welfare": ["cruelty", "suffering", "captivity", "care"],
    "Q16_Knowledge_Gaps": ["didn't know", "unaware", "confused", "misconception"]
}

def scrape_exotic_pet_data(target_keywords):
    results = []
    # Check if we are authenticated properly before running
    if reddit.read_only:
         print("Note: Running in read-only mode (good for public data).")

    analyzer = SentimentIntensityAnalyzer()

    for keyword in target_keywords:
        # Searching public discussions
        # LIMIT REDUCED FOR TESTING (500 -> 5)
        print(f"Searching for: {keyword}")
        try:
            for submission in reddit.subreddit("all").search(keyword, limit=5): 
                text = (submission.title + " " + submission.selftext).lower()
                
                # Determine Category (Q8)
                category = "Other"
                for cat, species_list in category_map.items():
                    if any(s in text for s in species_list):
                        category = cat
                        break

                # Data Record with Expected Fields
                results.append({
                    "platform": "Reddit", # Field: platform
                    "text_content": text[:100] + "...", # Field: text_content (truncated for view)
                    "date": datetime.fromtimestamp(submission.created_utc).strftime('%Y-%m-%d'), # Field: date
                    "keyword_used": keyword, # Field: keyword_used
                    "category": category, # Field: category
                    "source_url": f"https://www.reddit.com{submission.permalink}", # Field: source_url
                    "country_context": "United States", # Project Scope
                    "location_geographic": submission.subreddit.display_name, # Proxy for location context
                    "sentiment_score": analyzer.polarity_scores(text)['compound'], # Q1
                    "questions_mapped": [q for q, kws in question_map.items() if any(k in text for k in kws)]
                })
        except Exception as e:
            print(f"Error searching {keyword}: {e}")
            if "401" in str(e):
                print("CRITICAL: Authentication failed. Please update client_id and client_secret.")
                return pd.DataFrame()
            
    return pd.DataFrame(results)

# 4. EXECUTION
# Combining species-level keywords for Q9
all_species = [s for sublist in category_map.values() for s in sublist]
# Only use first 2 keywords for testing to save time
print("Starting test scrape...")
df = scrape_exotic_pet_data(all_species[:2]) 
if not df.empty:
    print(f"Successfully scraped {len(df)} records.")
    display(df.head())
    df.to_csv("WWF_Expected_Fields_Dataset.csv", index=False)
else:
    print("No data scraped or authentication failed.")

Starting test scrape...
Note: Running in read-only mode (good for public data).
Searching for: monkey
Error searching monkey: received 401 HTTP response
CRITICAL: Authentication failed. Please update client_id and client_secret.
No data scraped or authentication failed.


## Basic Info

In [10]:
import praw
from praw.models import MoreComments
from sqlalchemy import create_engine, Column, Integer, String, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship

BASE = declarative_base()
SQL_LITE_DB_NAME = 'Reddit' + '.db'
SQL_LITE_ENGINE_URL = 'sqlite:///' + SQL_LITE_DB_NAME


def get_engine():
    engine_url = SQL_LITE_ENGINE_URL
    engine = create_engine(engine_url)
    return engine


def get_session():
    engine = get_engine()
    BASE.metadata.bind = engine
    DBSession = sessionmaker(bind=engine)
    return DBSession()


class User(BASE):
    __tablename__ = "reddit_comments"
    id = Column('id', Integer, primary_key=True)
    reddit_id = Column('reddit_id', String)
    comment_context = Column('comment_context', String)


def add_comment(discord_id_in, comment_in):
    session = get_session()
    reddit_user = User()
    reddit_user.reddit_id = discord_id_in
    reddit_user.comment_context = comment_in
    session.add(reddit_user)
    session.commit()
    session.close()

  BASE = declarative_base()


In [16]:
# Use the existing reddit instance from CELL INDEX 4
subreddit = reddit.subreddit("Politics").hot(limit=100)

my_search_keywords = ['Bernie', 'Warren', 'Biden', 'Harris', 'Yang', 'Buttigieg', "O'Rourke", 'Booker', 'Gabbard', 'Castro']


def search_comments(search_words):
    for post in subreddit:
        for comment in post.comments:
            if isinstance(comment, MoreComments):
                continue
            for keyword in search_words:
                if keyword in comment.body and comment.author:
                    add_comment(comment.author.name, keyword)
    return "Comments added to database"

BASE.metadata.create_all(get_engine())
search_comments(my_search_keywords)


ResponseException: received 401 HTTP response

In [3]:
!pip install serpapi

Collecting serpapi
  Downloading serpapi-0.1.5-py2.py3-none-any.whl.metadata (10 kB)
Downloading serpapi-0.1.5-py2.py3-none-any.whl (10 kB)
Installing collected packages: serpapi
Successfully installed serpapi-0.1.5


In [None]:
import os
import pandas as pd
from serpapi import GoogleSearch # Corrected import
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# 1. SETUP & AUTHENTICATION
# Register at serpapi.com to get a free API Key
API_KEY = "0fe91bc8fdc760f87442a486c1e0cf9c45c1f98964824778d3888fedb77b4eb4"
analyzer = SentimentIntensityAnalyzer()

# 2. TARGET SPECIES & CATEGORIES (Aligns with Q8 & Q9)
# species_to_categories = {
#     "serval": "Big Cat",
#     "macaque": "Primate",
#     "axolotl": "Amphibian",
#     "burmese python": "Reptile"
# }

# 3. TOP 20 KEYWORD MAP (For Sentiment & Risk Mapping)
# question_keywords = {
#     "Q4_Legal": ["law", "illegal", "permit", "banned"],
#     "Q5_Safety": ["bite", "attack", "danger", "dangerous"],
#     "Q6_Welfare": ["care", "cruelty", "captivity", "welfare"]
# }

# 2. KEYWORD MAPPING (Aligning to Category Field)
# Grouping species into the high-risk categories defined in Q8
species_to_categories =  {
    "Primate": ["monkey", "macaque", "chimpanzee", "slow loris"],
    "Big Cat": ["tiger", "lion", "serval", "caracal", "cheetah"],
    "Reptile": ["python", "boa", "cobra", "iguana", "monitor lizard"],
    "Bird": ["parrot", "macaw", "cockatoo"]
}

# 3. TOP 20 QUESTIONS MAPPING (For filtering and analysis)
question_map = {
    "Q1_Sentiment": ["love", "want", "dangerous", "cruel", "cool", "illegal"],
    "Q4_Legal_Risk": ["illegal", "banned", "permit", "law", "license"],
    "Q5_Safety_Risk": ["attack", "bite", "venom", "danger", "kill"],
    "Q6_Animal_Welfare": ["cruelty", "suffering", "captivity", "care"],
    "Q16_Knowledge_Gaps": ["didn't know", "unaware", "confused", "misconception"]
}


def crawl_google_results(species_list):
    all_data = []
    
    for species in species_list:
        print(f"Searching Google for: {species}...")
        # Search for: "pet [species] [risk_keyword]" to narrow results
        params = {
            "q": f"pet {species} ownership",
            "engine": "google",
            "location": "United States",
            "hl": "en",
            "gl": "us",
            "api_key": API_KEY
        }
        
        try:
            search = GoogleSearch(params)
            results = search.get_dict()
            
            # Check for API errors
            if "error" in results:
                print(f"API Error for {species}: {results['error']}")
                continue
            
            # 4. DATA EXTRACTION (Mapping to Expected Fields)
            if "organic_results" in results:
                for item in results["organic_results"]:
                    # Safely handle None values
                    snippet = (item.get("snippet") or "").lower()
                    title = (item.get("title") or "").lower()
                    combined_text = title + " " + snippet

                    # Determine mapped questions
                    matched_qs = [q for q, kws in question_keywords.items() if any(k in combined_text for k in kws)]

                    all_data.append({
                        "platform": "Google Search",           # Field: platform
                        "text_content": combined_text,         # Field: text_content
                        "date": datetime.now().strftime("%Y-%m-%d"), # Field: date
                        "keyword_used": f"pet {species}",      # Field: keyword_used
                        "category": species_to_categories.get(species, "Other"), # Field: category
                        "source_url": item.get("link"),        # Field: source_url
                        "country_context": "United States",    # Field: country_context
                        "location_geographic": "National (US)",# Field: location_geographic
                        "sentiment_score": analyzer.polarity_scores(combined_text)['compound'],
                        "top_20_mapping": ", ".join(matched_qs)
                    })
        except Exception as e:
            print(f"Exception trying to search for {species}: {e}")
                
    return pd.DataFrame(all_data)

# 5. EXECUTION & SAVE
# Note: This will consume search credits from your SerpApi account
print("Starting Google Search Crawl...")
df_google_results = crawl_google_results(list(species_to_categories.keys()))

if not df_google_results.empty:
    print(f"Successfully collected {len(df_google_results)} results.")
    df_google_results.to_csv("WWF_Google_Crawl_Data.csv", index=False)
    display(df_google_results.head())
else:
    print("No results collected. Check API key or quota.")

Starting Google Search Crawl...
Searching Google for: serval...
Searching Google for: macaque...
Searching Google for: axolotl...
Searching Google for: burmese python...
Successfully collected 37 results.


Unnamed: 0,platform,text_content,date,keyword_used,category,source_url,country_context,location_geographic,sentiment_score,top_20_mapping
0,Google Search,everything about ownership — serval pet breede...,2026-02-13,pet serval,Big Cat,https://www.servalcatforsale.com/everything-ab...,United States,National (US),-0.4168,
1,Google Search,what you need to know about adding a serval .....,2026-02-13,pet serval,Big Cat,https://www.savannahgans.com/blog-1/servalpet,United States,National (US),0.4939,
2,Google Search,do serval cats make good house pets? although ...,2026-02-13,pet serval,Big Cat,https://spca.bc.ca/faqs/do-serval-cats-make-go...,United States,National (US),0.4404,
3,Google Search,servals are not pets | big cat rescue | wild c...,2026-02-13,pet serval,Big Cat,https://bigcatrescue.org/conservation-news/ser...,United States,National (US),-0.3732,Q4_Legal
4,Google Search,what's it like to own a serval cat in like a h...,2026-02-13,pet serval,Big Cat,https://www.reddit.com/r/serval/comments/16wrt...,United States,National (US),-0.4404,Q5_Safety
