# WeRateDogs Data Cleaning and Analysis

We will follow the define-code-test framework to clean the data according to tidy data principles.

In [None]:
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer

# Download the full JSON for every tweet in the Twitter archive and store it,
# one JSON document per line, in tweet_json.txt.
# Credentials are hidden to comply with Twitter's API terms and conditions
consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
access_secret = 'HIDDEN'

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# wait_on_rate_limit=True is passed so the client waits out rate limits
api = tweepy.API(auth, wait_on_rate_limit=True)

# NOTE TO STUDENT WITH MOBILE VERIFICATION ISSUES:
# df_1 is a DataFrame with the twitter_archive_enhanced.csv file. You may have to
# change the `tweet_ids = ...` line below to match the name of your DataFrame
# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor
# Tweet IDs for which to gather additional data via Twitter's API
tweet_ids = df_1.tweet_id.values
len(tweet_ids)

# Bookkeeping: running position, failed lookups keyed by tweet ID, start time
count = 0
fails_dict = {}
start = timer()
# NOTE(review): Tweepy 4.x renamed TweepError to TweepyException - confirm
# which Tweepy version this notebook is meant to run against.
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for count, tweet_id in enumerate(tweet_ids, start=1):
        print(f"{count}: {tweet_id}")
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
        except tweepy.TweepError as e:
            # Record the failure and carry on with the next tweet
            print("Fail")
            fails_dict[tweet_id] = e
        else:
            print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
end = timer()
# Report the elapsed seconds and every tweet ID that could not be fetched
print(end - start)
print(fails_dict)

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's 'whitegrid' style for plots
sns.set_style('whitegrid')

# Silence every warning from here on (no category filter is given).
# NOTE(review): this also hides deprecation warnings from pandas/NumPy.
import warnings
warnings.filterwarnings('ignore')

## 1. Load the Datasets

In [None]:
# Load the raw inputs: the archive CSV and the tab-separated image predictions
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')
image_predictions = pd.read_csv('image-predictions.tsv', sep='\t')

# From the line-delimited tweet JSON keep only the ID and the two engagement
# counts, and expose the ID under the name `tweet_id`
tweet_json = pd.read_json('tweet_json.txt', lines=True)
tweet_json = tweet_json[['id', 'retweet_count', 'favorite_count']]
tweet_json = tweet_json.rename(columns={'id': 'tweet_id'})

In [None]:
# Keep an independent copy of each raw table as loaded
twitter_archive_copy, image_predictions_copy, tweet_json_copy = (
    raw_frame.copy()
    for raw_frame in (twitter_archive, image_predictions, tweet_json)
)

## 2. Data Inspection

In [None]:
# Print a labelled DataFrame.info() summary for each of the three tables
for heading, frame in (
    ('Twitter Archive Info:', twitter_archive),
    ('\nImage Predictions Info:', image_predictions),
    ('\nTweet JSON Info:', tweet_json),
):
    print(heading)
    frame.info()

In [None]:
# Print the per-column count of null values for each of the three tables
for heading, frame in (
    ('Missing values in Twitter Archive:', twitter_archive),
    ('\nMissing values in Image Predictions:', image_predictions),
    ('\nMissing values in Tweet JSON:', tweet_json),
):
    print(heading)
    print(frame.isnull().sum())

## 3. Data Cleaning
### 3.1 Handle Dog Stages

In [None]:
def clean_dog_stages(df):
    """Collapse the four dog-stage columns into a single `dog_stage` column.

    Args:
        df: DataFrame containing the columns 'doggo', 'floofer', 'pupper'
            and 'puppo', where the string 'None' marks an absent stage.

    Returns:
        A copy of `df` without the four stage columns and with a new
        `dog_stage` column: the stages present in the row joined by ', ',
        or NaN when the row has no stage at all.
    """
    stage_names = ['doggo', 'floofer', 'pupper', 'puppo']

    def _join_present(row):
        # Only the non-missing stage labels of this row take part in the join
        present = row.dropna()
        if present.empty:
            return np.nan
        return ', '.join(present)

    result = df.copy()
    # Turn the 'None' placeholder into a real missing value, column by column
    for name in stage_names:
        result[name] = result[name].replace(to_replace='None', value=np.nan)
    result['dog_stage'] = result[stage_names].apply(_join_present, axis=1)
    return result.drop(columns=stage_names)

# Start the cleaned archive from the raw archive, with the four stage columns
# collapsed into a single `dog_stage` column
twitter_archive_clean = clean_dog_stages(twitter_archive)

### 3.2 Clean Rating Values

In [None]:
def clean_ratings(df):
    """Replace the numerator/denominator pair with a single `rating` ratio.

    The ratio is computed with vectorised column arithmetic rather than a
    row-by-row Python callback.

    Args:
        df: DataFrame containing the columns 'rating_numerator' and
            'rating_denominator', both convertible to float.

    Returns:
        A copy of `df` without the two input columns and with a new float
        `rating` column equal to numerator / denominator. Rows whose
        denominator is zero get NaN.
    """
    df_clean = df.copy()
    numerator = df_clean['rating_numerator'].astype(float)
    denominator = df_clean['rating_denominator'].astype(float)
    # Mask zero denominators before dividing so those rows come out as NaN
    # instead of +/-inf
    safe_denominator = denominator.where(denominator != 0)
    df_clean['rating'] = numerator / safe_denominator
    return df_clean.drop(columns=['rating_numerator', 'rating_denominator'])

# Swap the numerator/denominator columns of the cleaned archive for a single
# `rating` ratio
twitter_archive_clean = clean_ratings(twitter_archive_clean)

### 3.3 Clean Timestamp Data

In [None]:
def clean_timestamps(df):
    """Return a copy of `df` whose 'timestamp' column is parsed to datetimes.

    Args:
        df: DataFrame with a 'timestamp' column that `pd.to_datetime` can parse.

    Returns:
        A new DataFrame; the input is left untouched.
    """
    parsed = pd.to_datetime(df['timestamp'])
    result = df.copy()
    result['timestamp'] = parsed
    return result

# Parse the cleaned archive's `timestamp` column into datetimes
twitter_archive_clean = clean_timestamps(twitter_archive_clean)

### 3.4 Clean Tweet Sources

In [None]:
def clean_sources(df):
    """Reduce the 'source' column to the text found between '>' and '<'.

    Args:
        df: DataFrame with a string 'source' column.

    Returns:
        A copy of `df` whose 'source' holds the first non-greedy match of the
        text between a '>' and the following '<', or NaN when there is none.
    """
    result = df.copy()
    # expand=False yields a Series for the single capture group
    extracted = result['source'].str.extract(r'>(.*?)<', expand=False)
    result['source'] = extracted
    return result

# Reduce the cleaned archive's `source` column to the text captured between
# '>' and '<'
twitter_archive_clean = clean_sources(twitter_archive_clean)

### 3.5 Clean Image Predictions

In [None]:
def clean_predictions(df):
    """Reduce the image-prediction table to one predicted breed per tweet.

    The labels in 'p1', 'p2' and 'p3' have underscores replaced by spaces and
    are title-cased. `predicted_breed` is the label whose confidence is the
    highest of the three; on ties the lower-numbered prediction wins because
    the first matching condition is used.

    Args:
        df: DataFrame with the columns 'tweet_id', 'p1'..'p3',
            'p1_conf'..'p3_conf' and 'p1_dog'..'p3_dog'.

    Returns:
        A new DataFrame with the columns 'tweet_id', 'predicted_breed',
        'p1_dog', 'p2_dog' and 'p3_dog'. `predicted_breed` is NaN for rows
        where no confidence comparison holds (e.g. missing confidences).
    """
    label_cols = ['p1', 'p2', 'p3']
    df_clean = df.copy()
    for col in label_cols:
        # '_' is meant literally, so regex matching is switched off explicitly
        df_clean[col] = df_clean[col].str.replace('_', ' ', regex=False).str.title()
    conditions = [
        (df_clean['p1_conf'] >= df_clean['p2_conf']) & (df_clean['p1_conf'] >= df_clean['p3_conf']),
        (df_clean['p2_conf'] >= df_clean['p1_conf']) & (df_clean['p2_conf'] >= df_clean['p3_conf']),
        (df_clean['p3_conf'] >= df_clean['p1_conf']) & (df_clean['p3_conf'] >= df_clean['p2_conf'])
    ]
    # Object arrays keep the string labels and the NaN default in one dtype
    choices = [df_clean[col].to_numpy(dtype=object) for col in label_cols]
    # np.select's implicit default is the integer 0, which would land in a
    # column of breed names; use NaN for rows that match no condition
    df_clean['predicted_breed'] = np.select(conditions, choices, default=np.nan)
    return df_clean[['tweet_id', 'predicted_breed', 'p1_dog', 'p2_dog', 'p3_dog']]

# Build the slimmed-down prediction table (tweet_id, predicted_breed and the
# three p*_dog flags) from the raw image predictions
image_predictions_clean = clean_predictions(image_predictions)

## 4. Merge Datasets

In [None]:
def merge_datasets(twitter_df, image_df, json_df):
    """Combine the three tables into one frame keyed on 'tweet_id'.

    Args:
        twitter_df: Base table; must contain 'tweet_id', 'rating' and
            'timestamp'.
        image_df: Table left-joined onto the base on 'tweet_id'.
        json_df: Table left-joined onto the result on 'tweet_id'.

    Returns:
        The joined DataFrame with rows lacking a 'rating' removed, ordered by
        'timestamp'.
    """
    combined = twitter_df
    # Left joins keep every row of the base table, matched or not
    for extra in (image_df, json_df):
        combined = combined.merge(extra, on='tweet_id', how='left')
    rated = combined.dropna(subset=['rating'])
    return rated.sort_values('timestamp')

# Join the cleaned archive with the cleaned predictions and the tweet JSON
# counts into the master table
twitter_master = merge_datasets(twitter_archive_clean, image_predictions_clean, tweet_json)

## 5. Save Cleaned Data

In [None]:
# Write each cleaned table to its own CSV file, leaving out the index column
for frame, destination in (
    (twitter_archive_clean, 'twitter_archive_clean.csv'),
    (image_predictions_clean, 'image_predictions_clean.csv'),
    (twitter_master, 'twitter_archive_master.csv'),
):
    frame.to_csv(destination, index=False)