# Media and Source Classifier 

First we classify the media as it is an important part of our study. 

There are two main classifications: traditional and non-traditional.

We then further divide them into sub-categories (`article` for traditional; `social_media`, `video`, `image` and `discussion` for non-traditional) to possibly gain more insight.

In [1]:
import pandas as pd
import re

# Read the extracted post URLs and discard rows that have no URL at all
url_file_path = '{add directory}/post_id_with_url_extracted.csv'
url_df = pd.read_csv(url_file_path).dropna(subset=['url'])

# Host portion of each URL: whatever sits between "http(s)://" and the next "/".
# URLs that do not match the pattern end up with NaN in 'domain'.
url_df['domain'] = url_df['url'].str.extract(r'https?://([^/]+)', expand=False)

# Define the classification function with specific cases for social media, video, and image domains
def classify_url_with_gfycat(url):
    """Classify a URL into one of five media sub-categories.

    Known domains are compared against the URL's hostname (exact match or
    sub-domain) instead of being searched for anywhere in the URL, so hosts
    such as ``vox.com`` or ``netflix.com`` are no longer mistaken for
    ``x.com``. File extensions are only looked for after the hostname and
    must be complete, so ``.mov`` does not match ``www.movieweb.com`` or
    ``.movies``. Matching is case-insensitive.

    Args:
        url: The URL to classify, as a string.

    Returns:
        ``'social_media'``, ``'video'``, ``'image'``, ``'discussion'`` or
        ``'article'`` (the fallback when no other rule matches).
    """
    social_media_domains = ['x.com', 'twitter.com', 'facebook.com', 'fb.com', 'instagram.com', 'tiktok.com']
    video_domains = ['youtube.com', 'youtu.be', 'vimeo.com', 'dailymotion.com', 'gfycat.com', 'v.redd.it']
    image_domains = ['i.reddit.com', 'imgur.com', 'redd.it', 'gyazo.com', 'flickr.com', 'tinypic.com']
    video_extensions = ['mp4', 'avi', 'mov', 'wmv', 'flv', 'mkv']
    image_extensions = ['jpg', 'jpeg', 'png', 'gif']

    lowered = url.strip().lower()

    # Split into "host" and "everything after the host". The scheme is
    # optional so that scheme-less and relative URLs are still handled
    # (a relative URL such as "/r/..." simply yields an empty host).
    split = re.match(r'(?:[a-z][a-z0-9+.\-]*://|//)?([^/?#]*)(.*)', lowered, re.DOTALL)
    host, remainder = split.group(1), split.group(2)
    # Drop "user:password@" and ":port" so only the hostname is compared.
    host = host.rsplit('@', 1)[-1].split(':', 1)[0].rstrip('.')

    def on_domain(domains):
        # True when the host is one of the domains or a sub-domain of one.
        return any(host == d or host.endswith('.' + d) for d in domains)

    def has_extension(extensions):
        # The extension must not be followed by another letter or digit, so
        # ".gif" matches "a.gif" and "a.gif?x=1" but not "a.gifts".
        pattern = r'\.(?:' + '|'.join(extensions) + r')(?![a-z0-9])'
        return re.search(pattern, remainder) is not None

    if on_domain(social_media_domains):
        return 'social_media'
    # The original listed '.watch' among the video "extensions"; as a
    # substring it matched hosts under the .watch TLD (e.g. fb.watch).
    # That behaviour is kept, but expressed as a hostname rule.
    if on_domain(video_domains) or host.endswith('.watch') or has_extension(video_extensions):
        return 'video'
    if on_domain(image_domains) or has_extension(image_extensions):
        return 'image'
    # 'forum' and '/r/' remain loose substring heuristics, as before.
    if on_domain(['reddit.com']) or 'forum' in lowered or '/r/' in lowered:
        return 'discussion'
    return 'article'

# Keep only rows whose 'url' is present and is an actual string
url_df = url_df.dropna(subset=['url'])
url_is_text = url_df['url'].map(lambda value: isinstance(value, str))
url_df = url_df[url_is_text]

# Sub-category comes straight from the URL classifier
url_df['sub_category'] = url_df['url'].map(classify_url_with_gfycat)

# Only 'article' counts as traditional media; every other sub-category is nontraditional
url_df['category'] = url_df['sub_category'].map(
    lambda label: 'nontraditional' if label != 'article' else 'traditional'
)



Now that we have the domain, we can extract the source from it. Only the most frequent sources are mapped explicitly, as that is enough for this analysis; every other domain falls back to its first label.

In [2]:
# normalization function to ensure consistent formatting using the domain column

# Known mappings based on observed patterns: registered domain (or a specific
# sub-domain) -> canonical source name. Built once instead of on every call.
# Keys are written WITHOUT a leading "www.", because that prefix is stripped
# from the domain before the lookup happens.
_SPECIFIC_SOURCE_MAPPINGS = {
    'reddit.com': 'reddit',
    'redd.it': 'reddit',
    'thejc.com': 'thejewishchronicle',
    'jpost.com': 'thejerusalempost',  # also covers m.jpost.com
    'bbc.com': 'bbc',
    'cnn.com': 'cnn',
    'nbcnews.com': 'nbcnews',
    'bloomberg.com': 'bloomberg',
    'ynetnews.com': 'ynetnews',
    'reuters.com': 'reuters',
    'dailynews.com': 'dailynews',
    'thestar.com': 'thestar',
    'theguardian.com': 'theguardian',
    'thehindu.com': 'thehindu',
    'pravda.com.ua': 'pravda',
    'navalnews.com': 'navalnews',
    'islamabadpost.com.pk': 'islamabadpost',
    'apnews.com': 'associatedpress',
    '1lurer.am': 'lurer',
    'iranintl.com': 'iraninternational',
    'aje.io': 'aljazeera',
    'allisrael.com': 'allisraelnews',
    'timesofisrael.com': 'thetimesofisrael',
    'elpais.com': 'elpais',
    'nytimes.com': 'thenewyorktimes',
    'straitstimes.com': 'thestraitstimes',
    'armradio.am': 'armenpress',
    'i24news.tv': 'i24news',
    'youtu.be': 'youtube',
    'bhaskar.com': 'bhaskar',
    # The next entry used to map to the literal string 'com'; it now maps to
    # the outlet label that the later suffix-correction step derived from it.
    'kiis.com.ua': 'kiis',
    'sky.com': 'skynews',
    'skynews.com.au': 'skynews',
    'france24.com': 'france24',
    'kyodonews.net': 'kyodonews',
    'peacekeeping.un.org': 'un',
    'news.un.org': 'un',
    'indiatimes.com': 'indiatimes',
    'joins.com': 'joins',
    '9news.com.au': '9news',  # was 'com'
    'aa.com.tr': 'aa',  # was 'com'
    'wikipedia.org': 'wikipedia',
    'thejewishindependent.com.au': 'thejewishindependent',  # was 'com'
    'imgur.com': 'imgur',
    'al-monitor.com': 'almonitor',
    'washingtonpost.com': 'thewashingtonpost',
    'yahoo.com': 'yahoonews',
    'jam-news.net': 'jamnews',
    'united24media.com': 'unitedmedia',
    'elhayat-life.com': 'elhayatlife',
    'cebudailynews.inquirer.net': 'inquirer',
    'images.dawn.com': 'dawn',
    'the-afc.com': 'theafc',
    'mongabay.com': 'mongabay',
    'abc.net.au': 'abc',
    'abcnews.go.com': 'abc',
    'abc57.com': 'abc',  # key was 'www.abc57.com', which could never match
    'abc13.com': 'abc',
    'huffpost.com': 'huffingtonpost',
    'news18.com': 'news18',
    'nbcrightnow.com': 'nbcnews',
    'nbcnewyork.com': 'nbcnews',
    'iol.co.za': 'iol',
    'actblue.com': 'actblue',
    'twitter.com': 'x',
    'openstreetmap.org': 'openstreetmap',
    'ynet.co.il': 'ynet',
    'rbc.ua': 'rbc',
    'cp24.com': 'cp24',
    # Key was 'www.cbsnews.com' (unreachable) with value 'cbc'. The value the
    # fallback has been producing for this domain, 'cbsnews', is kept.
    # NOTE(review): confirm 'cbc' was a typo and not an intended grouping.
    'cbsnews.com': 'cbsnews',
    'fb.watch': 'facebook',
    'usni.org': 'usni'
}


def normalize_source_from_domain(domain):
    """Turn a hostname into a short, consistent source name.

    The domain is lower-cased, a leading ``www``/``en``/``m``-style prefix
    is removed, and the result is looked up in ``_SPECIFIC_SOURCE_MAPPINGS``.
    A mapping applies when the domain equals the key or is a sub-domain of
    it; the longest matching key wins. Keys are no longer matched as raw
    substrings, so ``kaspersky.com`` is not mistaken for ``sky.com``.

    Args:
        domain: Hostname string, or a missing value (``NaN``/``None``).

    Returns:
        The mapped source name, or the first label of the domain when no
        mapping applies. A missing ``domain`` is returned unchanged, so the
        caller can fill it, instead of being turned into the string ``'nan'``.
    """
    if pd.isna(domain):
        return domain

    domain = str(domain).lower()
    # Drop "user@" and ":port" remnants so only the hostname is compared.
    domain = domain.rsplit('@', 1)[-1].split(':', 1)[0]

    # Remove lingering prefixes
    domain = re.sub(r'^(www\d*|english|en|m|m-en|new)\.', '', domain)

    # Try progressively shorter label suffixes: a.b.c -> b.c -> c.
    labels = domain.split('.')
    for start in range(len(labels)):
        candidate = '.'.join(labels[start:])
        if candidate in _SPECIFIC_SOURCE_MAPPINGS:
            return _SPECIFIC_SOURCE_MAPPINGS[candidate]

    # Otherwise, use the first part of the domain as the source
    return labels[0]

# Apply this function to the domain column. Rows whose URL had no
# "http(s)://host" part carry a NaN domain; na_action='ignore' passes those
# through as NaN instead of handing them to the function, where str() would
# turn them into the literal source 'nan' and the fill below would never apply.
url_df['source'] = url_df['domain'].map(normalize_source_from_domain, na_action='ignore')

# Fill NaN values in the 'source' column with 'reddit' since all NaN are reddit
url_df['source'] = url_df['source'].fillna('reddit')


Some sources were mislabelled with a generic domain label (e.g. `com`, `org`, `net`) instead of the outlet's name, so we add specific logic to recover the proper source from the domain.

In [3]:
# function to correct specific misclassified source 
def fix_source_from_domain(row):
    """Replace a suffix-like source (e.g. ``'com'``) with the label before it.

    When the row's source is one of the generic labels in ``suffixes``, the
    domain is split on dots and the label immediately before the first such
    generic label is returned (``kiis.com.ua`` -> ``kiis``).

    Differences from the previous version:
      * labels must equal a suffix; ``endswith`` let ``monitor`` match ``or``;
      * a suffix in first position has no label before it, so the source is
        left unchanged instead of wrapping around to the last label
        (``mod.gov.il`` used to give ``il``);
      * a leading ``www`` label is ignored so it is never returned as a source.

    Args:
        row: Mapping-like row with ``'source'`` and ``'domain'`` entries.

    Returns:
        The corrected source, or ``row['source']`` unchanged when no
        correction applies.
    """
    source = str(row['source']).lower()
    domain = str(row['domain']).lower()

    # List of specific suffixes that are misclassified as sources
    suffixes = ['europa', 'mod', 'com', 'ac', 'net', 'org', 'or']

    # Check if the source is one of the suffixes
    if source in suffixes:
        labels = domain.split('.')
        if re.fullmatch(r'www\d*', labels[0]):
            labels = labels[1:]
        # Start at 1: index 0 has no preceding label to return.
        for position in range(1, len(labels)):
            if labels[position] in suffixes:
                return labels[position - 1]  # Return the word before the suffix

    # If the source is correct, return it as is
    return row['source']

# Run the row-wise correction, then overwrite 'source' with the result
corrected_source = url_df.apply(fix_source_from_domain, axis=1)
url_df['source'] = corrected_source


In [5]:
# Write the classified posts (domain, sub_category, category, source) to CSV without the index
classified_output_path = '/{add directory}/posts_media_classified_with_sources.csv'
url_df.to_csv(classified_output_path, index=False)
