# Twitter Cultural Product Fit Analyzer
## Analyzing Product-Market Fit Across NYC, London, and Paris

This project analyzes Twitter data from three major cities to identify cultural preferences and recommend optimal product-market matches for marketing campaigns.

## 1. Data Loading and Initial Exploration

Let's start by loading all four datasets (one random sample of tweets plus one dataset each for New York, London, and Paris) and understanding their structure.

In [None]:
# Import necessary libraries
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Suppress every warning for the rest of the session to keep cell output tidy.
# NOTE(review): this also hides deprecation and parsing warnings raised by
# pandas/matplotlib in later cells, so real problems may go unnoticed.
warnings.filterwarnings('ignore')

# Set style for better visualizations
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib release - confirm against the installed version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("Libraries imported successfully!")

In [None]:
# Load all datasets
def load_twitter_data(filename):
    """Load newline-delimited Twitter JSON and return a list of dictionaries.

    Each line of the file is parsed as one standalone JSON document. Blank
    lines, lines that are not valid JSON, and lines whose top-level value is
    not a JSON object (for example a bare number, ``null`` or an array) are
    skipped, so every returned element is a ``dict`` that later cells can
    safely call ``.get()`` / ``.keys()`` on.

    Args:
        filename: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        list: The parsed ``dict`` records, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        # Read line by line since each line is a separate JSON object
        for line in f:
            line = line.strip()
            if not line:
                # Blank separator lines carry no record.
                continue
            try:
                tweet = json.loads(line)
            except json.JSONDecodeError:
                # Best-effort load: a corrupt line must not abort the file.
                continue
            if isinstance(tweet, dict):
                data.append(tweet)
    return data

# Read the random sample and the three city samples from disk
print("Loading datasets...")
random_tweets = load_twitter_data('random_tweets.json')
new_york_tweets = load_twitter_data('new_york.json')
london_tweets = load_twitter_data('london.json')
paris_tweets = load_twitter_data('paris.json')

# Count each dataset once, then report the individual and combined sizes
n_random = len(random_tweets)
n_new_york = len(new_york_tweets)
n_london = len(london_tweets)
n_paris = len(paris_tweets)

print(f"✓ Random tweets loaded: {n_random:,} tweets")
print(f"✓ New York tweets loaded: {n_new_york:,} tweets")
print(f"✓ London tweets loaded: {n_london:,} tweets")
print(f"✓ Paris tweets loaded: {n_paris:,} tweets")
print(f"\nTotal tweets across all datasets: {n_random + n_new_york + n_london + n_paris:,}")

### 1.1 Exploring Data Structure

Let's examine the structure of a sample tweet to understand what fields are available.

In [None]:
# Examine the structure of a sample tweet
print("Sample tweet structure from New York dataset:")
print("-" * 50)

# Pretty print the first tweet. The dataset may be empty (the key-set code
# below already allows for that), so guard the [0] access instead of letting
# the cell fail with IndexError.
if new_york_tweets:
    sample_tweet = new_york_tweets[0]
    for key in sorted(sample_tweet.keys()):
        value = sample_tweet[key]
        # Truncate long strings so each field fits on one output line
        if isinstance(value, str) and len(value) > 50:
            value = value[:50] + "..."
        print(f"{key}: {value}")
else:
    sample_tweet = {}
    print("(New York dataset is empty - no sample tweet to display)")

print("\n" + "="*50 + "\n")

# Check if all datasets have similar structure.
# Only the FIRST record of each dataset is inspected, so this is a quick
# sanity check rather than a full schema comparison.
print("Checking if all datasets have similar fields...")
random_keys = set(random_tweets[0].keys()) if random_tweets else set()
ny_keys = set(new_york_tweets[0].keys()) if new_york_tweets else set()
london_keys = set(london_tweets[0].keys()) if london_tweets else set()
paris_keys = set(paris_tweets[0].keys()) if paris_tweets else set()

all_keys = random_keys | ny_keys | london_keys | paris_keys
print(f"Total unique fields across all datasets: {len(all_keys)}")

# Check for differences
if random_keys == ny_keys == london_keys == paris_keys:
    print("✓ All datasets have the same structure!")
else:
    print("⚠ Datasets have different structures. Investigating differences...")
    # Report, per dataset, which of the observed fields its first record lacks
    for dataset_name, dataset_keys in [
        ('Random', random_keys),
        ('New York', ny_keys),
        ('London', london_keys),
        ('Paris', paris_keys),
    ]:
        missing_fields = sorted(all_keys - dataset_keys)
        if missing_fields:
            print(f"  {dataset_name} is missing: {', '.join(missing_fields)}")

### 1.2 Key Fields for Marketing Analysis

Let's identify the most important fields for our cultural product fit analysis.

In [None]:
# Extract key fields for analysis
def extract_key_fields(tweet):
    """Extract the most relevant fields for marketing analysis.

    The nested ``user``, ``entities`` and ``place`` objects are treated as
    empty when they are absent OR present with a null/empty value, so a
    record such as ``{"user": null}`` yields the defaults instead of raising
    ``AttributeError``.

    Args:
        tweet: A single tweet record as a ``dict``.

    Returns:
        dict: A flat record with the keys ``text``, ``created_at``,
        ``user_location``, ``user_description``, ``user_followers``,
        ``user_friends``, ``user_verified``, ``retweet_count``,
        ``favorite_count``, ``lang``, ``hashtags`` (list of hashtag texts),
        ``user_mentions`` (list of screen names), ``place_name`` and
        ``place_country``. Missing scalar fields default to ``''``, ``0`` or
        ``False`` as appropriate; missing lists default to ``[]``.
    """
    # `dict.get(key, {})` only applies the default when the key is absent;
    # `or {}` also covers an explicit null value.
    user = tweet.get('user') or {}
    entities = tweet.get('entities') or {}
    place = tweet.get('place') or {}

    return {
        'text': tweet.get('text', ''),
        'created_at': tweet.get('created_at', ''),
        'user_location': user.get('location', ''),
        'user_description': user.get('description', ''),
        'user_followers': user.get('followers_count', 0),
        'user_friends': user.get('friends_count', 0),
        'user_verified': user.get('verified', False),
        'retweet_count': tweet.get('retweet_count', 0),
        'favorite_count': tweet.get('favorite_count', 0),
        'lang': tweet.get('lang', ''),
        'hashtags': [tag['text'] for tag in entities.get('hashtags') or []],
        'user_mentions': [
            mention['screen_name']
            for mention in entities.get('user_mentions') or []
        ],
        'place_name': place.get('name', ''),
        'place_country': place.get('country', ''),
    }

# Build one DataFrame per dataset from the flattened tweet records
print("Converting to DataFrames...")
df_random = pd.DataFrame(list(map(extract_key_fields, random_tweets)))
df_ny = pd.DataFrame(list(map(extract_key_fields, new_york_tweets)))
df_london = pd.DataFrame(list(map(extract_key_fields, london_tweets)))
df_paris = pd.DataFrame(list(map(extract_key_fields, paris_tweets)))

# Tag every row with the dataset it came from
labelled_frames = (
    (df_random, 'Random'),
    (df_ny, 'New York'),
    (df_london, 'London'),
    (df_paris, 'Paris'),
)
for frame, label in labelled_frames:
    frame['city'] = label

print("✓ DataFrames created successfully!")
print(f"\nDataFrame shapes:")
print(f"Random: {df_random.shape}")
print(f"New York: {df_ny.shape}")
print(f"London: {df_london.shape}")
print(f"Paris: {df_paris.shape}")

### 1.3 Basic Statistics and Data Quality Check

In [None]:
# Stack the three city frames into one table for side-by-side comparison
city_frames = [df_ny, df_london, df_paris]
df_cities = pd.concat(city_frames, ignore_index=True)
separator = "\n" + "=" * 50 + "\n"

print("=== BASIC STATISTICS ===\n")

# Tweets per (city, language) pair, pivoted so languages become columns
print("Language Distribution by City:")
lang_counts = df_cities.groupby(['city', 'lang']).size()
lang_dist = lang_counts.unstack(fill_value=0)
print(lang_dist.head(10))
top_languages = df_cities['lang'].value_counts().head(5).to_dict()
print(f"\nTop languages: {top_languages}")

print(separator)

# Count null and empty-string tweet texts in each city frame
print("Missing Data Analysis:")
for name, df in zip(('New York', 'London', 'Paris'), city_frames):
    text_column = df['text']
    missing_text = text_column.isna().sum()
    empty_text = text_column.eq('').sum()
    print(f"{name}: {missing_text} missing texts, {empty_text} empty texts")

print(separator)

# Mean / median / standard deviation of the engagement columns per city
print("User Engagement Statistics by City:")
engagement_columns = ['retweet_count', 'favorite_count', 'user_followers']
engagement_stats = df_cities.groupby('city')[engagement_columns].agg(['mean', 'median', 'std'])
print(engagement_stats.round(2))

### 1.4 Visualizing Tweet Characteristics Across Cities

In [None]:
# Create a 2x2 figure of key metrics for the three city datasets:
# language mix, tweet length, average engagement and posting hour.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Tweet Characteristics Across Cities', fontsize=16)

# 1. Language distribution (limited to the five most frequent languages
#    across all cities so the grouped bars stay readable)
ax1 = axes[0, 0]
top_langs = df_cities['lang'].value_counts().head(5).index
lang_by_city = df_cities[df_cities['lang'].isin(top_langs)].groupby(['city', 'lang']).size().unstack(fill_value=0)
lang_by_city.plot(kind='bar', ax=ax1)
ax1.set_title('Top 5 Languages by City')
ax1.set_xlabel('City')
ax1.set_ylabel('Number of Tweets')
# Place the legend outside the axes so it does not cover the bars
ax1.legend(title='Language', bbox_to_anchor=(1.05, 1), loc='upper left')

# 2. Tweet length distribution
ax2 = axes[0, 1]
# Stored on df_cities because later cells read the text_length column
df_cities['text_length'] = df_cities['text'].str.len()
for city in ['New York', 'London', 'Paris']:
    city_data = df_cities[df_cities['city'] == city]['text_length']
    # density=True normalises each histogram so cities with different tweet
    # volumes can be compared on the same axis
    ax2.hist(city_data, bins=50, alpha=0.5, label=city, density=True)
ax2.set_title('Tweet Length Distribution')
ax2.set_xlabel('Character Count')
ax2.set_ylabel('Density')
ax2.legend()
ax2.set_xlim(0, 300)

# 3. Engagement metrics
ax3 = axes[1, 0]
engagement_data = df_cities.groupby('city')[['retweet_count', 'favorite_count']].mean()
engagement_data.plot(kind='bar', ax=ax3)
ax3.set_title('Average Engagement by City')
ax3.set_xlabel('City')
ax3.set_ylabel('Average Count')
ax3.legend(['Retweets', 'Favorites'])
ax3.tick_params(axis='x', rotation=45)

# 4. Posting time analysis (extract hour from created_at)
ax4 = axes[1, 1]
# Parse timestamps as UTC so the extracted hour really matches the
# 'Hour (UTC)' axis label, whatever offset each timestamp carries.
# errors='coerce' turns unparseable values into NaT; value_counts() below
# ignores them, so one bad record cannot abort the whole figure.
for df in [df_ny, df_london, df_paris]:
    df['hour'] = pd.to_datetime(df['created_at'], errors='coerce', utc=True).dt.hour

# One column of hourly tweet counts per city, aligned on the hour index
hour_dist = pd.concat([
    df_ny['hour'].value_counts().sort_index(),
    df_london['hour'].value_counts().sort_index(),
    df_paris['hour'].value_counts().sort_index()
], axis=1, keys=['New York', 'London', 'Paris'])

hour_dist.plot(ax=ax4, marker='o')
ax4.set_title('Tweet Activity by Hour of Day')
ax4.set_xlabel('Hour (UTC)')
ax4.set_ylabel('Number of Tweets')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 1.5 Sample Tweets Analysis

Let's look at some sample tweets from each city to get a feel for the content.

In [None]:
# Display sample tweets from each city
print("=== SAMPLE TWEETS FROM EACH CITY ===\n")

# Function to clean and display tweets
def display_sample_tweets(df, city_name, n=5):
    """Print a small, reproducible sample of longer tweets for one city.

    Tweets whose text is longer than 50 characters and whose language is
    ``'en'`` are preferred; if there are none, the sample is drawn from
    tweets of any language that pass the same length filter. Sampling uses a
    fixed ``random_state`` so repeated runs show the same tweets.

    Args:
        df: DataFrame with ``text``, ``lang``, ``retweet_count``,
            ``favorite_count`` and ``hashtags`` columns.
        city_name: Label used in the printed heading.
        n: Maximum number of tweets to print.
    """
    print(f"\n{city_name.upper()} - Sample Tweets:")
    print("-" * 80)

    # Length filter shared by the English-only and the fallback selection
    is_long = df['text'].str.len() > 50
    candidates = df[(df['lang'] == 'en') & is_long]

    if candidates.empty:
        print(f"No English tweets found. Showing from all languages:")
        candidates = df[is_long]

    # Never ask for more rows than are available
    sample_size = min(n, len(candidates))
    chosen = candidates.sample(n=sample_size, random_state=42)

    rows = (row for _, row in chosen.iterrows())
    for number, row in enumerate(rows, start=1):
        # Collapse line breaks so each tweet prints on a single line
        flattened = row['text'].replace('\n', ' ')
        print(f"\n{number}. Tweet: {flattened[:200]}...")
        print(f"   Language: {row['lang']} | Retweets: {row['retweet_count']} | Favorites: {row['favorite_count']}")
        tags = row['hashtags']
        if tags:
            print(f"   Hashtags: {', '.join(tags[:5])}")

# Show three sample tweets for each of the city datasets
samples_by_city = {'New York': df_ny, 'London': df_london, 'Paris': df_paris}
for city, df in samples_by_city.items():
    display_sample_tweets(df, city, n=3)

### 1.6 Initial Insights for Marketing

Based on our exploration, let's summarize key insights relevant for marketing.

In [None]:
# Summary statistics for marketing insights.
# Uses the per-city frames (df_ny, df_london, df_paris) and the combined
# df_cities frame built in earlier cells.
print("=== KEY MARKETING INSIGHTS FROM INITIAL EXPLORATION ===\n")

# 1. Market size comparison
print("1. MARKET SIZE (Tweet Volume):")
market_size = pd.DataFrame({
    'City': ['New York', 'London', 'Paris'],
    'Total Tweets': [len(df_ny), len(df_london), len(df_paris)],
    'English Tweets': [
        len(df_ny[df_ny['lang'] == 'en']),
        len(df_london[df_london['lang'] == 'en']),
        len(df_paris[df_paris['lang'] == 'en'])
    ]
})
market_size['English %'] = (market_size['English Tweets'] / market_size['Total Tweets'] * 100).round(1)
print(market_size)

print("\n2. USER INFLUENCE METRICS (Average per city):")
influence_metrics = df_cities.groupby('city').agg({
    'user_followers': ['mean', 'median'],
    'user_verified': 'sum',
    'retweet_count': 'mean',
    'favorite_count': 'mean'
}).round(2)
print(influence_metrics)

print("\n3. CONTENT CHARACTERISTICS:")
# text_length is normally added to df_cities by the visualization cell;
# derive it here if that cell was skipped so this summary still runs.
if 'text_length' not in df_cities.columns:
    df_cities['text_length'] = df_cities['text'].str.len()
# Average tweet length by city
avg_length = df_cities.groupby('city')['text_length'].mean().round(1)
print(f"Average tweet length by city:\n{avg_length}")

print("\n4. PEAK ACTIVITY HOURS (UTC):")
# df_cities was concatenated before any 'hour' column was added to the
# per-city frames, so it does not carry that column. Derive it here;
# otherwise this section would silently print nothing.
if 'hour' not in df_cities.columns:
    df_cities['hour'] = pd.to_datetime(
        df_cities['created_at'], errors='coerce', utc=True
    ).dt.hour
# Find peak hours for each city
for city in ['New York', 'London', 'Paris']:
    # mode() ignores missing hours; it is empty when the city has no
    # parseable timestamps at all.
    hour_modes = df_cities.loc[df_cities['city'] == city, 'hour'].mode()
    if hour_modes.empty:
        print(f"{city}: Peak hour is N/A")
    else:
        # int() because the column is float when some timestamps are missing
        print(f"{city}: Peak hour is {int(hour_modes.iloc[0])}:00 UTC")

print("\n5. HASHTAG USAGE:")
# Total number of hashtags per city. Selecting the column before apply()
# keeps the lambda from receiving the grouping column as well.
hashtag_usage = df_cities.groupby('city')['hashtags'].apply(
    lambda tag_lists: sum(len(tags) for tags in tag_lists)
).to_dict()
# Tweets per city, computed once instead of re-filtering df_cities per city
tweets_per_city = df_cities['city'].value_counts()
for city, count in hashtag_usage.items():
    avg_hashtags = count / tweets_per_city[city]
    print(f"{city}: {avg_hashtags:.2f} hashtags per tweet")

## Next Steps

Now that we've explored the data, we understand:
- The structure and quality of our datasets
- Key differences between cities in terms of language, engagement, and activity patterns
- Potential features for our cultural product fit analysis

In the next sections, we will:
1. Clean and preprocess the text data
2. Extract product and brand mentions
3. Analyze cultural preferences
4. Build our product-market fit recommendation system