## Check Mediastack Dataset

#### Metadata

In [1]:
import pandas as pd
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

In [2]:
# Make the project's shared helper module importable from this notebook.
# '../utils' is a relative path, so it is resolved against the kernel's working directory.
import sys
sys.path.append('../utils')
import functions

In [3]:
# Load the raw dataset from CSV and preview the first rows.
mediastack_df = pd.read_csv('../data/raw/news_sentiment_analysis.csv')

mediastack_df.head()

Unnamed: 0,Source,Author,Title,Description,URL,Published At,Sentiment,Type
0,stgnews,Bridger Palmer,Pine View High teacher wins Best in State awar...,"ST. GEORGE — Kaitlyn Larson, a first-year teac...",https://www.stgeorgeutah.com/news/archive/2024...,2024-07-12T23:45:25+00:00,positive,Business
1,Zimbabwe Mail,Staff Reporter,Businesses Face Financial Strain Amid Liquidit...,"Harare, Zimbabwe – Local businesses are grappl...",https://www.thezimbabwemail.com/business/busin...,2024-07-12T22:59:42+00:00,neutral,Business
2,4-traders,,Musk donates to super pac working to elect Tru...,(marketscreener.com) Billionaire Elon Musk has...,https://www.marketscreener.com/business-leader...,2024-07-12T22:52:55+00:00,positive,Business
3,4-traders,,US FTC issues warning to franchisors over unfa...,(marketscreener.com) A U.S. trade regulator on...,https://www.marketscreener.com/quote/stock/MCD...,2024-07-12T22:41:01+00:00,negative,Business
4,PLANET,,Rooftop solar's dark side,4.5 million households in the U.S. have solar ...,https://www.npr.org/2024/07/12/1197961036/roof...,2024-07-12T22:28:19+00:00,positive,Business


#### Dataset overview

In [4]:
# One-shot overview of the raw frame: prints shape, dtypes, missing values per column and the first rows.
functions.show_basic_info(mediastack_df)


DataFrame Shape: (3500, 8)
Number of Rows: 3500
Number of Columns: 8

Data Types of Columns:
Source          object
Author          object
Title           object
Description     object
URL             object
Published At    object
Sentiment       object
Type            object
dtype: object

Missing Values per Column:
Source            0
Author          988
Title             0
Description       0
URL               0
Published At      0
Sentiment         0
Type              0
dtype: int64

First 5 Rows of Data:
          Source          Author  \
0        stgnews  Bridger Palmer   
1  Zimbabwe Mail  Staff Reporter   
2      4-traders             NaN   
3      4-traders             NaN   
4         PLANET             NaN   

                                               Title  \
0  Pine View High teacher wins Best in State awar...   
1  Businesses Face Financial Strain Amid Liquidit...   
2  Musk donates to super pac working to elect Tru...   
4                          Rooftop solar's 

In [5]:
# Print the dtype of every column (every column is 'object' in the raw data).
functions.show_data_types(mediastack_df)

Data Types of Columns:
Source          object
Author          object
Title           object
Description     object
URL             object
Published At    object
Sentiment       object
Type            object
dtype: object


In [6]:
# Print the number of missing values per column (only 'Author' has gaps in the raw data).
functions.show_missing_values(mediastack_df)


Missing Values in Columns:
Source            0
Author          988
Title             0
Description       0
URL               0
Published At      0
Sentiment         0
Type              0
dtype: int64


In [7]:
# Print how many rows of the raw frame are duplicates.
# NOTE(review): the duplicates are only reported here; none of the cleaning steps in this
# notebook drops them - confirm whether deduplication is meant to happen elsewhere.
functions.check_for_duplicates(mediastack_df)


There are 737 duplicate rows in the DataFrame.


#### Check earliest and latest date in the dataset

In [8]:
# Parse the timestamps before taking min/max: comparing the raw strings is only
# correct while every value shares the same format and UTC offset.
# utc=True puts all values on one timeline; unparseable values become NaT and are
# ignored by min()/max().
published_at_parsed = pd.to_datetime(mediastack_df['Published At'], errors='coerce', utc=True)
earliest_date = published_at_parsed.min()
latest_date = published_at_parsed.max()

# isoformat() keeps the ISO 8601 layout used by the raw column.
print(f"Earliest date: {earliest_date.isoformat()}")
print(f"Latest date: {latest_date.isoformat()}")

Earliest date: 2024-07-11T17:01:00+00:00
Latest date: 2024-07-18T19:55:52+00:00


### **Data Cleaning**

1. Rename and reorder columns
2. Replace empty values in the 'author' column with 'Unknown'
3. Standardize publishing date format
4. Convert categories to lowercase
5. Clean source names
6. Remove non-English rows


1. Rename and reorder columns

In [9]:
# Mapping from the raw CSV headers to the snake_case names used for the rest of the notebook.
# Note that 'Type' is not just lowercased: it becomes 'category'.
rename_dict = {
    'Source': 'source',
    'Author': 'author',
    'Title': 'title',
    'Description': 'description',
    'URL': 'url',
    'Published At': 'published_at',
    'Sentiment': 'sentiment',
    'Type': 'category'
}

# Apply the mapping through the shared project helper.
mediastack_df = functions.rename_columns(mediastack_df, rename_dict)

In [10]:
# Target column order for the cleaned frame.
new_order = ['author', 'title', 'description', 'url', 'source', 'category', 'published_at', 'sentiment']
# Selecting a list of columns yields a slice of the original frame; take an explicit
# copy so the later cells that assign columns (author, published_at, clean_source)
# write to an independent frame instead of triggering SettingWithCopyWarning.
mediastack_df = mediastack_df[new_order].copy()

2. Replace empty values in the 'author' column with 'Unknown'

In [11]:
def replace_empty_values(df, column_name, default_value):
    """
    Replaces empty or missing values in the specified column with a default value.

    A value counts as empty/missing when it is the empty string ('') or anything
    pandas treats as missing (None, NaN, NaT, pd.NA). All other values are kept.

    Parameters:
    - df (pd.DataFrame): The DataFrame to modify. The column is overwritten in place.
    - column_name (str): The name of the column to check for empty values.
    - default_value (str): The value to replace empty or missing values with.

    Returns:
    - pd.DataFrame: The same DataFrame object, with empty values replaced.

    Raises:
    - KeyError: If column_name is not a column of df.
    """
    column = df[column_name]
    # Detect missing values explicitly with isna() rather than relying on a `None`
    # entry in a replace() list happening to match NaN as well.
    is_empty = column.isna() | (column == '')
    df[column_name] = column.mask(is_empty, default_value)
    return df

In [12]:
# Fill the missing 'author' entries (988 rows in the overview above) with the placeholder 'Unknown'.
mediastack_df = replace_empty_values(mediastack_df, 'author', 'Unknown')

3. Standardize publishing date format

In [13]:
def standardize_date_format(df, column_name, date_format='%Y-%m-%d', utc=False):
    """
    Standardizes the date format in the specified column (default: 'YYYY-MM-DD').

    The column is parsed with pd.to_datetime and written back as formatted strings.
    Values that cannot be parsed become missing (NaN) instead of raising.

    Parameters:
    - df (pd.DataFrame): The DataFrame to modify. The column is overwritten in place.
    - column_name (str): The name of the column containing the date to standardize.
    - date_format (str): strftime pattern used for the output. Defaults to '%Y-%m-%d'.
    - utc (bool): If True, convert every timestamp to UTC before formatting. Use this
      when the column mixes several UTC offsets; note that the calendar date is then
      the UTC date, which can differ from the local date in the raw value.
      Defaults to False (the date is taken as written in the raw value).

    Returns:
    - pd.DataFrame: The same DataFrame object, with the standardized date format.

    Raises:
    - KeyError: If column_name is not a column of df.
    """
    # errors='coerce' turns unparseable entries into NaT, which strftime renders as NaN.
    parsed = pd.to_datetime(df[column_name], errors='coerce', utc=utc)
    df[column_name] = parsed.dt.strftime(date_format)
    return df

In [14]:
# Reduce 'published_at' from a full timestamp to a plain 'YYYY-MM-DD' string, then preview the result.
mediastack_df = standardize_date_format(mediastack_df, 'published_at')

mediastack_df.head()

Unnamed: 0,author,title,description,url,source,category,published_at,sentiment
0,Bridger Palmer,Pine View High teacher wins Best in State awar...,"ST. GEORGE — Kaitlyn Larson, a first-year teac...",https://www.stgeorgeutah.com/news/archive/2024...,stgnews,Business,2024-07-12,positive
1,Staff Reporter,Businesses Face Financial Strain Amid Liquidit...,"Harare, Zimbabwe – Local businesses are grappl...",https://www.thezimbabwemail.com/business/busin...,Zimbabwe Mail,Business,2024-07-12,neutral
2,Unknown,Musk donates to super pac working to elect Tru...,(marketscreener.com) Billionaire Elon Musk has...,https://www.marketscreener.com/business-leader...,4-traders,Business,2024-07-12,positive
3,Unknown,US FTC issues warning to franchisors over unfa...,(marketscreener.com) A U.S. trade regulator on...,https://www.marketscreener.com/quote/stock/MCD...,4-traders,Business,2024-07-12,negative
4,Unknown,Rooftop solar's dark side,4.5 million households in the U.S. have solar ...,https://www.npr.org/2024/07/12/1197961036/roof...,PLANET,Business,2024-07-12,positive


4. Convert categories to lowercase

In [15]:
# Keep the returned frame explicitly, like the other cleaning steps, instead of
# relying on the helper having changed mediastack_df in place.
mediastack_df = functions.convert_strings_to_lowercase(mediastack_df, 'category')
mediastack_df

Unnamed: 0,author,title,description,url,source,category,published_at,sentiment
0,Bridger Palmer,Pine View High teacher wins Best in State awar...,"ST. GEORGE — Kaitlyn Larson, a first-year teac...",https://www.stgeorgeutah.com/news/archive/2024...,stgnews,business,2024-07-12,positive
1,Staff Reporter,Businesses Face Financial Strain Amid Liquidit...,"Harare, Zimbabwe – Local businesses are grappl...",https://www.thezimbabwemail.com/business/busin...,Zimbabwe Mail,business,2024-07-12,neutral
2,Unknown,Musk donates to super pac working to elect Tru...,(marketscreener.com) Billionaire Elon Musk has...,https://www.marketscreener.com/business-leader...,4-traders,business,2024-07-12,positive
3,Unknown,US FTC issues warning to franchisors over unfa...,(marketscreener.com) A U.S. trade regulator on...,https://www.marketscreener.com/quote/stock/MCD...,4-traders,business,2024-07-12,negative
4,Unknown,Rooftop solar's dark side,4.5 million households in the U.S. have solar ...,https://www.npr.org/2024/07/12/1197961036/roof...,PLANET,business,2024-07-12,positive
...,...,...,...,...,...,...,...,...
3495,MarketBeat News,"Arrow Electronics, Inc. (NYSE:ARW) Shares Purc...",QRG Capital Management Inc. increased its stak...,https://www.etfdailynews.com/2024/07/18/arrow-...,etfdailynews,technology,2024-07-18,positive
3496,MarketBeat News,"3,120 Shares in NICE Ltd. (NASDAQ:NICE) Bought...",QRG Capital Management Inc. bought a new posit...,https://www.etfdailynews.com/2024/07/18/3120-s...,etfdailynews,technology,2024-07-18,positive
3497,MarketBeat News,"QRG Capital Management Inc. Has $857,000 Stock...",QRG Capital Management Inc. boosted its stake ...,https://www.etfdailynews.com/2024/07/18/qrg-ca...,etfdailynews,technology,2024-07-18,positive
3498,Unknown,Biotechnology Market: Surging Investments and ...,"WESTFORD, Mass., July 18, 2024 /PRNewswire/ --...",https://www.finanznachrichten.de/nachrichten-2...,finanznachrichten,technology,2024-07-18,neutral


5. Clean source names

In [16]:
# Store the cleaned name in a new 'clean_source' column; the original 'source' column is kept.
# In the preview the cleaned names come out title-cased (e.g. 'stgnews' -> 'Stgnews').
mediastack_df['clean_source'] = mediastack_df['source'].apply(functions.clean_source_names)
mediastack_df.head()

Unnamed: 0,author,title,description,url,source,category,published_at,sentiment,clean_source
0,Bridger Palmer,Pine View High teacher wins Best in State awar...,"ST. GEORGE — Kaitlyn Larson, a first-year teac...",https://www.stgeorgeutah.com/news/archive/2024...,stgnews,business,2024-07-12,positive,Stgnews
1,Staff Reporter,Businesses Face Financial Strain Amid Liquidit...,"Harare, Zimbabwe – Local businesses are grappl...",https://www.thezimbabwemail.com/business/busin...,Zimbabwe Mail,business,2024-07-12,neutral,Zimbabwe Mail
2,Unknown,Musk donates to super pac working to elect Tru...,(marketscreener.com) Billionaire Elon Musk has...,https://www.marketscreener.com/business-leader...,4-traders,business,2024-07-12,positive,4-Traders
3,Unknown,US FTC issues warning to franchisors over unfa...,(marketscreener.com) A U.S. trade regulator on...,https://www.marketscreener.com/quote/stock/MCD...,4-traders,business,2024-07-12,negative,4-Traders
4,Unknown,Rooftop solar's dark side,4.5 million households in the U.S. have solar ...,https://www.npr.org/2024/07/12/1197961036/roof...,PLANET,business,2024-07-12,positive,Planet


6. Remove non-English rows

In [17]:
# Set once, before the first detect() call below.
DetectorFactory.seed = 0    # seed for consistency in language detection

def detect_languages_and_filter(df, text_column, target_language='en', detector=None):
    """
    Detects the language of each value in a text column and keeps only the rows
    written in the target language (English by default).

    The input DataFrame is not modified: no helper column is added to it.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        text_column (str): The name of the column with the text data.
        target_language (str): Language code to keep, as returned by the detector.
            Defaults to 'en'.
        detector (callable, optional): Function mapping a string to a language code.
            Defaults to langdetect's `detect`.

    Returns:
        pd.DataFrame: A new DataFrame containing only the rows detected as
        `target_language`, with the same columns as `df`.

    Raises:
        KeyError: If `text_column` is not a column of `df`.
    """
    if detector is None:
        detector = detect

    def _detect_or_unknown(text):
        # Missing / non-string values (e.g. NaN) cannot be analysed; label them
        # 'unknown' instead of letting the detector raise a TypeError.
        if not isinstance(text, str):
            return 'unknown'
        try:
            return detector(text)
        except LangDetectException:
            # Raised by langdetect when it cannot detect a language for the text.
            return 'unknown'

    # Positional boolean mask: True for the rows to keep (those in the target language).
    keep_row = [_detect_or_unknown(text) == target_language for text in df[text_column]]

    # .copy() returns an independent frame, as the original drop() did.
    return df[keep_row].copy()

# Keep only the rows whose 'description' is detected as English.
filtered_df = detect_languages_and_filter(mediastack_df, 'description')

In [None]:
# Show the English-only frame produced by the language filter.
display(filtered_df)

#### Value counts

In [None]:
# Summarise the cleaned, English-only frame by category and by cleaned source name.
print('Value counts for category column:')
functions.show_column_value_counts(filtered_df, 'category')
# NOTE(review): the label below announces value counts, but the call uses
# show_column_values rather than show_column_value_counts as on the line above -
# confirm which helper is intended.
print('Value counts for clean_source column:')
functions.show_column_values(filtered_df, 'clean_source')