## Check Mediastack Dataset

#### Metadata

In [13]:
import pandas as pd
import re

In [14]:
import sys
sys.path.append('../utils')
import functions

In [None]:
# Load the raw news sentiment CSV; the path is relative to the notebook's working directory.
mediastack_df = pd.read_csv('../data/raw/news_sentiment_analysis.csv')

# Preview the first rows to eyeball the columns and sample values.
mediastack_df.head()

In [None]:
# Project helper from the local `functions` module; presumably reports an overview of the frame — TODO confirm against the helper.
functions.show_basic_info(mediastack_df)

In [None]:
# Project helper from the local `functions` module; presumably reports each column's dtype — TODO confirm against the helper.
functions.show_data_types(mediastack_df)

In [None]:
# Project helper from the local `functions` module; presumably reports missing values per column — TODO confirm against the helper.
functions.show_missing_values(mediastack_df)

In [None]:
# Project helper from the local `functions` module; presumably reports duplicated rows — TODO confirm against the helper.
functions.check_for_duplicates(mediastack_df)

### Data Cleaning

1. Standardize column names
2. Replace empty values in the 'author' column with 'Unknown'
3. Standardize publishing date format


1. Standardize column names

In [20]:
# Mapping from the current column labels to lowercase snake_case names.
# Note that 'Type' is renamed to 'content_type' rather than simply lowercased.
rename_dict = {
    'Source': 'source',
    'Author': 'author',
    'Title': 'title',
    'Description': 'description',
    'URL': 'url',
    'Published At': 'published_at',
    'Sentiment': 'sentiment',
    'Type': 'content_type'
}

# Apply the mapping through the project helper and rebind the result
# (presumably a wrapper around DataFrame.rename — verify against the helper).
mediastack_df = functions.rename_columns(mediastack_df, rename_dict)

2. Replace empty values in the 'author' column with 'Unknown'

In [21]:
def replace_empty_values(df, column_name, default_value):
    """
    Replaces empty or missing values in the specified column with a default value.

    A value counts as empty when it is missing (NaN, None, pd.NA) or when it is
    a string that is empty or contains only whitespace. Non-string, non-missing
    values are left untouched.

    Parameters:
    - df (pd.DataFrame): The DataFrame to modify. The column is overwritten on
      this object, so the caller's frame is changed as well.
    - column_name (str): The name of the column to check for empty values.
    - default_value (str): The value to replace empty or missing values with.

    Returns:
    - pd.DataFrame: The same DataFrame object, with empty values replaced.

    Raises:
    - KeyError: If column_name is not a column of df.
    """
    column = df[column_name]

    # Detect missing values explicitly: values read from CSV arrive as NaN,
    # and relying on `replace(None, ...)` to match NaN is implicit behaviour.
    is_missing = column.isna()

    # Strings such as '' or '   ' carry no information either. Using map with
    # an isinstance guard keeps this safe for columns holding non-string values.
    is_blank_text = column.map(
        lambda value: isinstance(value, str) and not value.strip()
    ).astype(bool)

    df[column_name] = column.mask(is_missing | is_blank_text, default_value)
    return df


In [22]:
# Apply the replacement to the 'author' column, using 'Unknown' as the placeholder.
mediastack_df = replace_empty_values(mediastack_df, 'author', 'Unknown')

3. Standardize publishing date format

In [23]:

def standardize_date_format(df, column_name):
    """
    Rewrites a date column as 'YYYY-MM-DD' strings.

    The column is first parsed into datetimes; values that cannot be parsed
    are coerced to missing instead of raising. The parsed values are then
    formatted as text and written back to the same column of `df`.

    Parameters:
    - df (pd.DataFrame): The DataFrame whose column is overwritten.
    - column_name (str): The name of the column holding the dates.

    Returns:
    - pd.DataFrame: The same DataFrame object, with the column reformatted.
    """
    # errors='coerce' keeps one bad value from aborting the whole conversion.
    parsed_dates = pd.to_datetime(df[column_name], errors='coerce')
    formatted_dates = parsed_dates.dt.strftime('%Y-%m-%d')

    df[column_name] = formatted_dates
    return df


In [None]:
# Reformat 'published_at' as 'YYYY-MM-DD' strings.
mediastack_df = standardize_date_format(mediastack_df, 'published_at')

# Preview the cleaned frame.
mediastack_df.head()