## Check Mediastack Dataset

#### Metadata

In [1]:
import pandas as pd
import re

In [2]:
import sys
sys.path.append('../utils')
import functions

In [3]:
# Load the raw news sentiment CSV (relative path into the raw-data folder).
mediastack_df = pd.read_csv('../data/raw/news_sentiment_analysis.csv')

# Preview the first rows to sanity-check the load.
mediastack_df.head()

Unnamed: 0,Source,Author,Title,Description,URL,Published At,Sentiment,Type
0,stgnews,Bridger Palmer,Pine View High teacher wins Best in State awar...,"ST. GEORGE — Kaitlyn Larson, a first-year teac...",https://www.stgeorgeutah.com/news/archive/2024...,2024-07-12T23:45:25+00:00,positive,Business
1,Zimbabwe Mail,Staff Reporter,Businesses Face Financial Strain Amid Liquidit...,"Harare, Zimbabwe – Local businesses are grappl...",https://www.thezimbabwemail.com/business/busin...,2024-07-12T22:59:42+00:00,neutral,Business
2,4-traders,,Musk donates to super pac working to elect Tru...,(marketscreener.com) Billionaire Elon Musk has...,https://www.marketscreener.com/business-leader...,2024-07-12T22:52:55+00:00,positive,Business
3,4-traders,,US FTC issues warning to franchisors over unfa...,(marketscreener.com) A U.S. trade regulator on...,https://www.marketscreener.com/quote/stock/MCD...,2024-07-12T22:41:01+00:00,negative,Business
4,PLANET,,Rooftop solar's dark side,4.5 million households in the U.S. have solar ...,https://www.npr.org/2024/07/12/1197961036/roof...,2024-07-12T22:28:19+00:00,positive,Business


In [4]:
# Print a one-stop overview of the frame; per the output below this covers
# shape, column dtypes, missing values per column and the first rows.
functions.show_basic_info(mediastack_df)


DataFrame Shape: (3500, 8)
Number of Rows: 3500
Number of Columns: 8

Data Types of Columns:
Source          object
Author          object
Title           object
Description     object
URL             object
Published At    object
Sentiment       object
Type            object
dtype: object

Missing Values per Column:
Source            0
Author          988
Title             0
Description       0
URL               0
Published At      0
Sentiment         0
Type              0
dtype: int64

First 5 Rows of Data:
          Source          Author  \
0        stgnews  Bridger Palmer   
1  Zimbabwe Mail  Staff Reporter   
2      4-traders             NaN   
3      4-traders             NaN   
4         PLANET             NaN   

                                               Title  \
0  Pine View High teacher wins Best in State awar...   
1  Businesses Face Financial Strain Amid Liquidit...   
2  Musk donates to super pac working to elect Tru...   
4                          Rooftop solar's 

In [5]:
# Column dtypes only (the same listing also appears in the basic-info overview above).
functions.show_data_types(mediastack_df)

Data Types of Columns:
Source          object
Author          object
Title           object
Description     object
URL             object
Published At    object
Sentiment       object
Type            object
dtype: object


In [6]:
# Missing values per column; per the output below only 'Author' has gaps (988 rows).
functions.show_missing_values(mediastack_df)


Missing Values in Columns:
Source            0
Author          988
Title             0
Description       0
URL               0
Published At      0
Sentiment         0
Type              0
dtype: int64


In [7]:
# Report how many rows are duplicated.
# NOTE(review): 737 duplicate rows are reported below, but none of the cleaning
# steps visible in this notebook drop them — confirm whether deduplication is intended.
functions.check_for_duplicates(mediastack_df)


There are 737 duplicate rows in the DataFrame.


### Data Cleaning

1. Standardize column names (rename to snake_case and reorder)
2. Replace empty values in the 'author' column with 'Unknown'
3. Standardize publishing date format


1. Rename and reorder columns

In [8]:
# Raw CSV header -> snake_case column name. Note that 'Type' becomes 'category'.
rename_dict = dict([
    ('Source', 'source'),
    ('Author', 'author'),
    ('Title', 'title'),
    ('Description', 'description'),
    ('URL', 'url'),
    ('Published At', 'published_at'),
    ('Sentiment', 'sentiment'),
    ('Type', 'category'),
])

# Apply the mapping through the shared helper and keep the renamed frame.
mediastack_df = functions.rename_columns(mediastack_df, rename_dict)

In [9]:
# Target column layout for the cleaned frame.
new_order = [
    'author',
    'title',
    'description',
    'url',
    'source',
    'category',
    'published_at',
    'sentiment',
]
# Select every row with the columns in the order listed above.
mediastack_df = mediastack_df.loc[:, new_order]

2. Replace empty values in the 'author' column with 'Unknown'

In [10]:
def replace_empty_values(df, column_name, default_value):
    """
    Return a copy of ``df`` in which empty or missing values of one column
    are replaced with a default value.

    A value counts as empty when it is missing (``None`` / ``NaN``) or is the
    empty string ``''``. The input DataFrame is left untouched, so re-running
    the calling cell always starts from the same data.

    Parameters:
    - df (pd.DataFrame): The DataFrame to read from (not modified).
    - column_name (str): The name of the column to check for empty values.
    - default_value (str): The value to replace empty or missing values with.

    Returns:
    - pd.DataFrame: A new DataFrame with the empty values replaced.

    Raises:
    - KeyError: If ``column_name`` is not a column of ``df``.
    """
    result = df.copy()
    column = result[column_name]
    # Detect missing values explicitly instead of relying on ``None`` inside a
    # ``replace`` list being matched against NaN.
    is_empty = column.isna() | column.eq('')
    result[column_name] = column.mask(is_empty, default_value)
    return result


In [11]:
# Fill the missing 'author' entries with the placeholder 'Unknown'.
mediastack_df = replace_empty_values(mediastack_df, 'author', 'Unknown')

3. Standardize publishing date format

In [12]:

def standardize_date_format(df, column_name):
    """
    Return a copy of ``df`` with the given column reformatted as
    'YYYY-MM-DD' strings.

    Values are parsed with ``pd.to_datetime(..., errors='coerce')``, so entries
    that cannot be parsed become missing values instead of raising. The input
    DataFrame is left untouched.

    Parameters:
    - df (pd.DataFrame): The DataFrame to read from (not modified).
    - column_name (str): The name of the column containing the date to standardize.

    Returns:
    - pd.DataFrame: A new DataFrame with the standardized date format.

    Raises:
    - KeyError: If ``column_name`` is not a column of ``df``.
    """
    result = df.copy()
    # NOTE(review): assumes all timestamps share one UTC offset (the visible
    # sample rows are all +00:00) — confirm for the full dataset.
    parsed = pd.to_datetime(result[column_name], errors='coerce')
    result[column_name] = parsed.dt.strftime('%Y-%m-%d')
    return result


In [13]:
# Reduce the ISO timestamps in 'published_at' to plain 'YYYY-MM-DD' dates.
mediastack_df = standardize_date_format(mediastack_df, 'published_at')

# Preview the cleaned frame.
mediastack_df.head()

Unnamed: 0,author,title,description,url,source,category,published_at,sentiment
0,Bridger Palmer,Pine View High teacher wins Best in State awar...,"ST. GEORGE — Kaitlyn Larson, a first-year teac...",https://www.stgeorgeutah.com/news/archive/2024...,stgnews,Business,2024-07-12,positive
1,Staff Reporter,Businesses Face Financial Strain Amid Liquidit...,"Harare, Zimbabwe – Local businesses are grappl...",https://www.thezimbabwemail.com/business/busin...,Zimbabwe Mail,Business,2024-07-12,neutral
2,Unknown,Musk donates to super pac working to elect Tru...,(marketscreener.com) Billionaire Elon Musk has...,https://www.marketscreener.com/business-leader...,4-traders,Business,2024-07-12,positive
3,Unknown,US FTC issues warning to franchisors over unfa...,(marketscreener.com) A U.S. trade regulator on...,https://www.marketscreener.com/quote/stock/MCD...,4-traders,Business,2024-07-12,negative
4,Unknown,Rooftop solar's dark side,4.5 million households in the U.S. have solar ...,https://www.npr.org/2024/07/12/1197961036/roof...,PLANET,Business,2024-07-12,positive


### Value counts

# Distribution of the sentiment labels.
# The frame has no 'content_type' column (the original 'Type' column was renamed
# to 'category'), so only 'sentiment' is counted here; 'category' counts are
# shown in the next cell.
functions.show_column_value_counts(mediastack_df, 'sentiment')

In [15]:
# Distribution of article categories; per the output below each of the 7 categories has 500 rows.
functions.show_column_value_counts(mediastack_df, 'category')


Value counts for column category:
category
Business         500
Entertainment    500
General          500
Health           500
Science          500
Sports           500
Technology       500
Name: count, dtype: int64
