In [1]:
import pandas as pd
import os

In [2]:
# Input locations; both files are read as JSON Lines (one record per line).
classified_videos_path = 'data/climate_videos_classified.jsonl'
refined_filtered_videos_path = 'data/refine_filtered_videos.jsonl'

# Load each file into its own DataFrame.
climate_videos_classified_df = pd.read_json(
    classified_videos_path,
    lines=True,
)
climate_videos_refined_filtered_df = pd.read_json(
    refined_filtered_videos_path,
    lines=True,
)

In [3]:
# Store the category names of every video as a real list. `dict.keys()` alone
# returns a live view bound to the source dict, which is not what the
# `_list` column name promises.
climate_videos_classified_df['classification_categories_list'] = (
    climate_videos_classified_df['classification_categories']
    .apply(lambda categories: list(categories.keys()))
)

In [4]:

# Attach each video's classification categories, matched on display_id.
# Any 'classification_categories' column left over from a previous run of this
# cell is dropped first, so re-running does not create
# 'classification_categories_x' / '_y' suffix columns and break later cells.
# how='left' keeps every refined video; rows without a match get NaN.
climate_videos_refined_filtered_df = (
    climate_videos_refined_filtered_df
    .drop(columns=['classification_categories'], errors='ignore')
    .merge(
        climate_videos_classified_df[['display_id', 'classification_categories']],
        on='display_id',
        how='left',
    )
)

## Get unique categories, filter them

In [23]:
# Flatten the per-video category containers into one entry each (presumably one
# per dict key) and keep the distinct values. Despite the variable name, the
# result is not ranked by frequency.
most_common_categories = pd.unique(
    climate_videos_classified_df['classification_categories'].explode()
)
print(most_common_categories.size)

1065


In [37]:
# Keep only string entries that start with '/'; this drops NaN and any other
# value that is not a category path.
most_common_categories = list(
    filter(
        lambda value: isinstance(value, str) and value.startswith('/'),
        most_common_categories,
    )
)
print(len(most_common_categories))

1063


In [38]:
# Write the cleaned category names to a text file, one per line.
with open('categories.txt', 'w') as file:
    file.writelines(f'{category}\n' for category in most_common_categories)

In [54]:
# Substrings that flag a category as related to climate change or
# environmental issues.
keywords = [
    "Climate Change", "Global Warming", "Renewable Energy", "Environment",
    "Ecology", "Green Living", "Sustainability", "Energy Efficiency",
    "Alternative Energy", "Emissions", "Carbon", "Solar", "Wind Power",
    "Hydroelectric", "Geothermal", "Bioenergy", "Environmental Conservation",
    "Air Quality", "Pollution", "Recycling", "Water Conservation",
    "Deforestation", "Biodiversity", "Ocean Conservation", "Greenhouse", "Eco",
]

# Keep every category whose path contains at least one keyword. The match is a
# case-sensitive substring test, so short keywords such as "Eco" also hit
# unrelated paths (e.g. "Economy"); those are removed by hand afterwards.
climate_related_categories = [
    category_path
    for category_path in most_common_categories
    if any(keyword in category_path for keyword in keywords)
]

climate_related_categories


['/People & Society/Social Issues & Advocacy/Green Living & Environmental Issues',
 '/Science/Ecology & Environment/Other',
 '/Science/Ecology & Environment/Climate Change & Global Warming',
 '/Business & Industrial/Energy & Utilities/Renewable & Alternative Energy',
 '/Shopping/Green & Eco-Friendly Shopping',
 '/News/Business News/Economy News',
 '/People & Society/Social Sciences/Economics',
 '/Travel & Transportation/Specialty Travel/Ecotourism',
 '/Health/Health Conditions/Allergies/Environmental Allergies']

Manually remove the non-relevant ones:

In [55]:
# Categories matched by the keyword search that are not about climate or the
# environment (false positives of the "Eco" / "Environment" substrings).
irrelevant_categories = {
    '/News/Business News/Economy News',
    '/People & Society/Social Sciences/Economics',
    '/Health/Health Conditions/Allergies/Environmental Allergies',
}

# Filter instead of calling list.remove(): the cell can then be re-run without
# raising ValueError once the entries are already gone.
climate_related_categories = [
    category
    for category in climate_related_categories
    if category not in irrelevant_categories
]


In [56]:
# Display the categories that remain after the manual clean-up.
climate_related_categories

['/People & Society/Social Issues & Advocacy/Green Living & Environmental Issues',
 '/Science/Ecology & Environment/Other',
 '/Science/Ecology & Environment/Climate Change & Global Warming',
 '/Business & Industrial/Energy & Utilities/Renewable & Alternative Energy',
 '/Shopping/Green & Eco-Friendly Shopping',
 '/Travel & Transportation/Specialty Travel/Ecotourism']

## Create & export dataframe with double filtered videos

In [66]:
# Keep the refined videos that carry at least one of the selected
# climate-related categories.
# The categories were attached with a left merge, so a video without a match
# holds NaN instead of a dict; such rows are treated as "no category" rather
# than crashing on `.keys()`.
climate_category_set = set(climate_related_categories)
has_climate_category = (
    climate_videos_refined_filtered_df['classification_categories']
    .apply(
        lambda categories: isinstance(categories, dict)
        and not climate_category_set.isdisjoint(categories)
    )
)
df = climate_videos_refined_filtered_df[has_climate_category]
len(df)

70231

In [67]:
# Export the double-filtered videos as JSON Lines (one record per line).
df.to_json(
    path_or_buf='climate_videos_v3.jsonl',
    orient='records',
    lines=True,
)

## Visualize categories

In [None]:
# Rank categories by how many rows of the classified frame carry them.
# Plotting the bare list of category names fails because it holds no numeric
# data, so the frequencies are computed from the exploded category column.
# value_counts() sorts in descending order and ignores NaN.
category_counts = (
    climate_videos_classified_df['classification_categories']
    .explode()
    .value_counts()
)
# Restrict the counts to the category names kept in most_common_categories.
category_counts = category_counts[category_counts.index.isin(most_common_categories)]
top_30_categories = category_counts.head(30)

# barh draws the first row at the bottom; reverse so the most frequent
# category ends up at the top of the chart.
top_30_categories.iloc[::-1].plot(kind='barh', figsize=(10, 10), title='Top 30 Categories')


In [None]:
# Videos having at least one category whose name contains
# 'Climate Change & Global Warming'.
climate_change_df = climate_videos_refined_filtered_df.loc[
    lambda videos: videos['classification_categories'].apply(
        lambda categories: any(
            'Climate Change & Global Warming' in name
            for name in categories.keys()
        )
    )
]


Look for videos whose 'classification_categories' keys contain a given substring but that are not in df (the note originally named 'Renewable & Alternative'; the cell further below currently checks 'Accidents & Disasters').

In [None]:
# NOTE: this rebinds `df`, the name an earlier cell used for the exported
# double-filtered frame.
# Keep videos with at least one category name containing 'Green Living' or
# 'Climate Change & Global Warming'.
df = climate_videos_refined_filtered_df.loc[
    lambda videos: videos['classification_categories'].apply(
        lambda categories: any(
            'Green Living' in name or 'Climate Change & Global Warming' in name
            for name in categories.keys()
        )
    )
]




In [None]:
# Videos tagged with an 'Accidents & Disasters' category that are not already
# part of `df`. The variable name mentions "renewable/alternative", but the
# substring actually tested here is 'Accidents & Disasters'.
renewable_alternative_videos = climate_videos_refined_filtered_df.loc[
    lambda videos: (
        videos['classification_categories'].apply(
            lambda categories: any(
                'Accidents & Disasters' in name for name in categories.keys()
            )
        )
        & ~videos['display_id'].isin(df['display_id'])
    )
]

In [None]:
# Number of rows selected above.
renewable_alternative_videos.shape[0]

3620

In [None]:
# Inspect the description of one specific video; 45179 is a hard-coded index
# label of the frame, not a position.
renewable_alternative_videos['description'].loc[45179]


"http://www.Suspicious0bservers.org\xa0\nhttp://www.SpaceWeatherNews.com\nhttp://www.MagneticReversal.org\nhttp://www.ObservatoryProject.com\nhttp://www.EarthChanges.org\n\nSolar Alerts on Twitter: https://twitter.com/TheRealS0s\nTHE DISASTER PREDICTION APP: http://kck.st/1RO4K82\n\nGood Videos/Articles:\nThe Sun is Going to Sleep: http://www.youtube.com/watch?v=7whL9jvdL5s\nTop 6 Climate Change Problems: http://www.youtube.com/watch?v=4Ew05sRDAcU\nPause on Pausing the Pause: http://www.youtube.com/watch?v=CZH46p7MUlw\nSun Series: http://www.youtube.com/playlist?list=PLHSoxioQtwZcJj_9clLz7Bggso7qg2PDj\nIPCC History: http://www.suspicious0bservers.org/selections-from-the-1st-ipcc-full-report-wg1/\n\nToday's Featured Links:\nLa Nina? https://weather.com/news/climate/news/la-nina-noaa-update-september\n\nOriginal music by NEMES1S\n\nWORLD WEATHER:\nTY WindMap: https://www.windyty.com\nEarth WindMap: http://earth.nullschool.net/#current/wind/isobaric/850hPa/orthographic=-345.32,51.43,481\n

In [None]:
# Show ten randomly chosen titles. A fixed random_state makes the sample
# identical on every run, so any index label picked from this output for
# closer inspection stays valid after a kernel restart.
renewable_alternative_videos['title'].sample(10, random_state=42)

73213    Drone footage shows panorama of Shenzhen lands...
35063                Sharks Discovered Living In A Volcano
82282     Snow Storm Disrupts Travel, Makes Life Difficult
94379                   The Aftermath of Natural Disasters
31478                 Flooding in Texas view from airplane
42978    Rush Limbaugh EVACUATES Florida After Calling ...
80637    Rasamayi Balakishan Vs Ponnam Prabhakar | One ...
74132    This day in history May 17th: natural disaster...
45179    Volcanos, Quakes, Typhoon MagStorm Watch | S0 ...
60614               Floods in western India kill 20 people
Name: title, dtype: object