In [1]:
# 1.
from datetime import datetime as dt, timezone
import pandas as pd
from numpy import argmax
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from statsmodels.tsa.stattools import grangercausalitytests
import json

In [2]:
articles = pd.read_csv('data/sentiment_article_data_new.csv', sep=',', encoding='ISO-8859-1')

articles.head()

Unnamed: 0,index_0,state,index_1,abstract,web_url,headline,keywords,pub_date,byline,word_count,text,tokenized_text,vader,vader_discrete,sentiment
0,0,Alabama,0,The U.S. had an opportunity to solve the clima...,https://www.nytimes.com/2018/08/31/podcasts/th...,{'main': 'When We Almost Stopped Climate Chang...,"[{'name': 'subject', 'value': 'Global Warming'...",2018-08-31T09:59:54+0000,"{'original': None, 'person': [], 'organization...",228,When We Almost Stopped Climate Change The U.S....,almost stopped climate change u opportunity so...,-0.6378,NEGATIVE,NEGATIVE
1,1,Alabama,1,Scientists cannot say for sure what causes alg...,https://www.nytimes.com/2018/08/29/science/lak...,{'main': 'Algae Bloom in Lake Superior Raises ...,"[{'name': 'subject', 'value': 'Lakes', 'rank':...",2018-08-29T17:34:54+0000,"{'original': 'By Christine Hauser', 'person': ...",951,In 19 years of piloting his boat around Lake S...,19 year piloting boat around lake superior jod...,0.9519,POSITIVE,POSITIVE
2,2,Alabama,2,Federer struggled in the nighttime heat and hu...,https://www.nytimes.com/2018/09/04/climate/rog...,{'main': 'Roger Federer Is Tough to Beat. Glob...,"[{'name': 'subject', 'value': 'Global Warming'...",2018-09-04T22:16:22+0000,"{'original': 'By Kendra Pierre-Louis', 'person...",591,Want climate news in your inbox? Sign up here ...,want climate news inbox sign climate fwd email...,-0.3612,NEGATIVE,NEGATIVE
3,3,Alabama,3,A dozen artistic responses to one of the great...,https://www.nytimes.com/2018/08/22/t-magazine/...,"{'main': '12 Artists On: Climate Change', 'kic...","[{'name': 'subject', 'value': 'Art', 'rank': 1...",2018-08-22T15:02:32+0000,"{'original': 'By ZoÃ« Lescaze', 'person': [{'f...",2304,"Mel Chin\n\nBorn in Houston, Tex., in 1951.\n\...",mel chin born houston tex 1951 new york city c...,0.7579,POSITIVE,NEGATIVE
4,4,Alabama,4,This week world leaders are meeting in New Yor...,https://www.nytimes.com/2018/09/26/climate/cli...,"{'main': 'Climate Week, and Telling Stories Wi...","[{'name': 'subject', 'value': 'Global Warming'...",2018-09-26T17:49:12+0000,"{'original': None, 'person': [], 'organization...",923,If youâve ever wondered what itâs like to ...,youve ever wondered like work isolated researc...,0.7964,POSITIVE,POSITIVE


In [3]:
climate_data = pd.read_csv('data/climate_data.csv', sep=',', encoding='ISO-8859-1')

climate_data.head()

Unnamed: 0,date,state,max_avg_temperature,min_avg_temperature,mean_temperature,max_temperature,min_temperature,num_frost_days,num_trop_nights,max_1day_precipitation,max_5day_cumulative_precipitation,precipitation
0,1950-01,USA,2.46,-7.38,-3.15,12.87,-19.08,21.57,0.21,17.24,40.6,90.09
1,1950-01,Alabama,20.03,12.03,15.29,25.26,0.16,0.81,0.24,24.25,41.76,99.14
2,1950-01,Alaska,-11.85,-19.05,-15.64,-2.19,-30.29,30.41,0.0,12.64,36.8,53.15
3,1950-01,Arizona,11.32,-1.62,3.74,20.46,-12.0,17.77,0.0,7.63,24.52,26.25
4,1950-01,Arkansas,13.78,4.84,8.75,25.18,-5.77,10.71,0.68,48.4,129.4,299.96


In [4]:
def get_formatted_dates(data_slice, date_col, format: str):
    return [
        dt.strptime(date_str.split('T')[0], format)
        for date_str in data_slice[date_col]
    ]

articles['parsed_date'] = get_formatted_dates(
    articles,
    'pub_date',
    '%Y-%m-%d'
)

climate_data['parsed_date'] = get_formatted_dates(
    climate_data,
    'date',
    '%Y-%m'
)

In [6]:
core_cols = [
	'state',
	'parsed_date',
]

article_cols = [
	'vader',
	'vader_discrete',
	'sentiment',
]

climate_cols = [
	'max_avg_temperature',
	'min_avg_temperature',
	'mean_temperature',
	'max_temperature',
	'min_temperature',
	'num_frost_days',
	'num_trop_nights',
	'max_1day_precipitation',
	'max_5day_cumulative_precipitation',
	'precipitation',
]

articles = articles[core_cols + article_cols]
print(articles.head(10))

print(articles['parsed_date'].max())
print(articles['parsed_date'].min())

     state parsed_date   vader vader_discrete sentiment
0  Alabama  2018-08-31 -0.6378       NEGATIVE  NEGATIVE
1  Alabama  2018-08-29  0.9519       POSITIVE  POSITIVE
2  Alabama  2018-09-04 -0.3612       NEGATIVE  NEGATIVE
3  Alabama  2018-08-22  0.7579       POSITIVE  NEGATIVE
4  Alabama  2018-09-26  0.7964       POSITIVE  POSITIVE
5  Alabama  2018-08-30 -0.8074       NEGATIVE  POSITIVE
6  Alabama  2018-09-21  0.9284       POSITIVE  POSITIVE
7  Alabama  2018-09-30 -0.7650       NEGATIVE  NEGATIVE
8  Alabama  2018-09-26  0.0997       POSITIVE  POSITIVE
9  Alabama  2018-09-27  0.7670       POSITIVE  NEGATIVE
2020-09-30 00:00:00
2018-08-22 00:00:00


In [67]:
import warnings
import math
from statsmodels.tools.sm_exceptions import InfeasibleTestError

warnings.filterwarnings('ignore')

def run_grangers_causality(
	data_slice: pd.DataFrame,
	climate_cols: list,
	article_cols: list,
	max_lags: int = 2
):
	p_values = [[0] * len(article_cols) for _ in range(len(climate_cols))]
	for i, c_col in enumerate(climate_cols):
		for j, a_col in enumerate(article_cols):
			test_slice = data_slice[[a_col, c_col]].dropna()
			test_slice = test_slice[test_slice[a_col] != float('inf')]
			test_slice = test_slice[test_slice[c_col] != float('inf')]
			best_p_value = 1.0
			try:
				results = grangercausalitytests(
					test_slice,
					maxlag=max_lags,
					verbose=False
				)
				best_p_value = min(
					[results[i][0]['ssr_chi2test'][1] for i in range(1, max_lags + 1)]
				)
				best_p_value = round(best_p_value, 4)
			except (InfeasibleTestError, ValueError):
				best_p_value = 1.0
				
			p_values[i][j] = best_p_value
	return pd.DataFrame(p_values, index=climate_cols, columns=article_cols)

ALPHA = 0.05
LOWER_BOUND = 0.005

def analyze_state(state: str, output_file)-> None:

	state_article_data = articles[articles['state'] == state].sort_values('parsed_date')
	polarity_to_float = lambda polarity: 1.0 if polarity == 'POSITIVE' else -1.0
	state_article_data['vader_discrete'] = state_article_data['vader_discrete'].apply(polarity_to_float)
	state_article_data['sentiment'] = state_article_data['sentiment'].apply(polarity_to_float)
	state_climate_data = climate_data[climate_data['state'] == state].sort_values('parsed_date')

	merged_data = pd.merge_asof(
		state_article_data,
		state_climate_data,
		on='parsed_date',
	)

	climate_diff_cols = []
	for col in climate_cols:
		diff_col = col + '_diff'
		merged_data[diff_col] = merged_data[col].diff()
		climate_diff_cols.append(diff_col)

	article_diff_cols = []
	for col in article_cols:
		diff_col = col + '_diff'
		merged_data[diff_col] = merged_data[col].diff()
		article_diff_cols.append(diff_col)

	results = run_grangers_causality(
		merged_data,
		climate_diff_cols,
		article_diff_cols,
	)

	print(state, file=output_file)
	count = 0
	for predictor in results.index:
		for observed in results.columns:
			p_value = results.loc[predictor, observed]
			if observed != 'vader_diff':
				continue
			if p_value < ALPHA:
				print(f'\t{predictor} forecasts {observed}, p value: {p_value} ', file=output_file)
				count += 1
	if count == 0:
		print('\tno significant forecasting relationships identified', file=output_file)

with open('forecast_results.txt', 'w') as output_file:
	for state in articles['state'].unique():
		analyze_state(state, output_file=output_file)

In [18]:
import pandas as pd
results = {}
with open('forecast_results.txt', 'r') as file:
    current_state = None
    for line in file:
        line = line.strip()
        if 'forecasts' in line or 'no significant forecasting relationships identified' in line:
            if current_state not in results:
                results[current_state] = {'Climate Data': [], 'P Values': [], 'Count': 0}

            if 'no significant forecasting relationships identified' in line:
                continue
            else:
                parts = line.split(', p value: ')
                predictor, _ = parts[0].split(' forecasts ')
                p_value = float(parts[1])
                results[current_state]['Climate Data'].append(predictor)
                results[current_state]['P Values'].append(p_value)
                results[current_state]['Count'] += 1
        else:
            current_state = line

results = pd.DataFrame.from_dict(results, orient='index')
results.reset_index(inplace=True)
results.rename(columns={'index': 'State'}, inplace=True)
results.to_csv('/Users/ElenaPerego/Desktop/Columbia/Natural Language Processing/qmss-nlp-climate/data/results.csv')
