In [1]:
%matplotlib inline
import pandas as pd
import os
import matplotlib as plt
import seaborn as sns
import numpy as np
import statsmodels.formula.api as sm



In [2]:
# Custom CSS styling for the notebook's HTML output.

# HTML is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated in recent IPython releases.
from IPython.display import HTML

csspath1 = r'C:\COURSERA\PYCON2015_TUTORIALS\Brandon Rhodes - Pandas From The Ground Up - PyCon 2015\style-table.css'
csspath2 = r'C:\COURSERA\PYCON2015_TUTORIALS\Brandon Rhodes - Pandas From The Ground Up - PyCon 2015\style-notebook.css'

# Read each stylesheet inside a context manager so the file handle is
# closed as soon as its contents are loaded, then concatenate them in
# the same order as before (table styles first, notebook styles second).
css_parts = []
for csspath in (csspath1, csspath2):
    with open(csspath) as css_file:
        css_parts.append(css_file.read())
css = ''.join(css_parts)

# Keep this as the last expression of the cell so Jupyter renders it.
HTML('<style>{}</style>'.format(css))

In [13]:
# Read in data

# The path is assembled with os.path.join rather than written as
# 'DATA\climate_change.csv': '\c' is an invalid escape sequence in a
# normal string literal (it triggers a warning on modern Python) and a
# backslash separator only resolves on Windows.
cc = pd.read_csv(os.path.join('DATA', 'climate_change.csv'))

# Last expression of the cell: display (rows, columns) as a sanity check.
cc.shape

(308, 11)

In [14]:
# Preview the first row to check the column names as loaded.
cc.head(n=1)

Unnamed: 0,Year,Month,MEI,CO2,CH4,N2O,CFC-11,CFC-12,TSI,Aerosols,Temp
0,1983,5,2.556,345.96,1638.59,303.677,191.324,350.113,1366.1024,0.0863,0.109


In [15]:
# Rename the CFC-11 and CFC-12 columns: a hyphen is an operator in the
# statsmodels formula syntax, so the original names cannot be used
# directly as terms in a formula string.
#
# The renamed frame is re-bound to `cc` instead of mutating it with
# inplace=True, so the cell is a plain "input -> output" step.
cc = cc.rename(columns={'CFC-11': 'CFC11', 'CFC-12': 'CFC12'})

In [16]:
# Train/test split by calendar year: every observation up to and
# including 2006 goes to the training set; all remaining rows form the
# testing set.

in_training_period = cc['Year'] <= 2006
train = cc.loc[in_training_period]
test = cc.loc[~in_training_period]

# Show the (rows, columns) of both subsets.
train.shape, test.shape

((284, 11), (24, 11))

In [17]:
# Full model: ordinary least squares regression of Temp on MEI, CO2,
# CH4, N2O, CFC11, CFC12, TSI and Aerosols, fitted on the training set
# only. Year and Month are deliberately left out of the predictors.

full_formula = "Temp ~ MEI + CO2 + CH4 + N2O + CFC11 + CFC12 + TSI + Aerosols"
full_model = sm.ols(formula=full_formula, data=train)
linreg1 = full_model.fit()

print(linreg1.summary2())

                 Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.744    
Dependent Variable: Temp             AIC:                -542.2023
Date:               2016-04-30 10:17 BIC:                -509.3615
No. Observations:   284              Log-Likelihood:     280.10   
Df Model:           8                F-statistic:        103.6    
Df Residuals:       275              Prob (F-statistic): 1.94e-78 
R-squared:          0.751            Scale:              0.0084110
------------------------------------------------------------------
                Coef.   Std.Err.    t    P>|t|    [0.025   0.975] 
------------------------------------------------------------------
Intercept     -124.5943  19.8868 -6.2652 0.0000 -163.7440 -85.4446
MEI              0.0642   0.0065  9.9232 0.0000    0.0515   0.0769
CO2              0.0065   0.0023  2.8264 0.0051    0.0020   0.0110
CH4              0.0001   0.0005  0.2405 0.8101   -0.0009   0.0011
N2O          

In [18]:
# Which variables are significant in the model?
# A variable counts as significant only if its p-value is below 0.05.

is_significant = linreg1.pvalues.lt(0.05)
is_significant.sort_values()

CH4          False
N2O          False
Intercept     True
MEI           True
CO2           True
CFC11         True
CFC12         True
TSI           True
Aerosols      True
dtype: bool

In [19]:
# Pairwise Pearson correlations between all columns of the training set.

train.corr(method='pearson')

Unnamed: 0,Year,Month,MEI,CO2,CH4,N2O,CFC11,CFC12,TSI,Aerosols,Temp
Year,1.0,-0.027942,-0.036988,0.982749,0.915659,0.993845,0.569106,0.897012,0.170302,-0.345247,0.786797
Month,-0.027942,1.0,0.000885,-0.106732,0.018569,0.013632,-0.013111,0.000675,-0.034606,0.01489,-0.099857
MEI,-0.036988,0.000885,1.0,-0.041147,-0.033419,-0.05082,0.069,0.008286,-0.154492,0.340238,0.172471
CO2,0.982749,-0.106732,-0.041147,1.0,0.87728,0.97672,0.51406,0.85269,0.177429,-0.356155,0.788529
CH4,0.915659,0.018569,-0.033419,0.87728,1.0,0.899839,0.779904,0.963616,0.245528,-0.267809,0.703255
N2O,0.993845,0.013632,-0.05082,0.97672,0.899839,1.0,0.522477,0.867931,0.199757,-0.337055,0.778639
CFC11,0.569106,-0.013111,0.069,0.51406,0.779904,0.522477,1.0,0.868985,0.272046,-0.043921,0.40771
CFC12,0.897012,0.000675,0.008286,0.85269,0.963616,0.867931,0.868985,1.0,0.255303,-0.225131,0.687558
TSI,0.170302,-0.034606,-0.154492,0.177429,0.245528,0.199757,0.272046,0.255303,1.0,0.052117,0.243383
Aerosols,-0.345247,0.01489,0.340238,-0.356155,-0.267809,-0.337055,-0.043921,-0.225131,0.052117,1.0,-0.384914


In [20]:
# Which independent variables is N2O highly correlated with (absolute correlation greater than 0.7)?

def high_cor(var, df, limit, exclude=('Year', 'Month', 'Temp')):
    """Print the correlation of ``var`` with the other columns of ``df``.

    For every column of ``df`` that is neither ``var`` itself nor listed
    in ``exclude``, the Pearson correlation coefficient with ``var`` is
    computed via ``np.corrcoef`` and one line is printed in the form::

        <var> with <column> ::: <coefficient> ::: <TRUE|FALSE>

    where the flag is TRUE when the absolute value of the coefficient is
    strictly greater than ``limit``.

    Parameters
    ----------
    var : str
        Name of the column to compare against all the others.
    df : pandas.DataFrame
        Frame holding ``var`` and the columns to compare it with.
    limit : float
        Threshold on the absolute correlation.
    exclude : iterable of str, optional
        Column names to skip in addition to ``var``. Defaults to
        ``('Year', 'Month', 'Temp')``, the columns that were previously
        hard-coded, so existing calls behave exactly as before.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    skipped = set(exclude) | {var}
    # Loop-invariant: look the reference column up only once.
    reference = df[var]
    for variable in df.columns:
        if variable in skipped:
            continue
        # corrcoef returns the 2x2 correlation matrix; the off-diagonal
        # entry is the coefficient between the two series.
        cor = np.corrcoef(reference, df[variable])[0, 1]
        print('{0} with {1} ::: {2} ::: {3}'.format(var,
                                                    variable,
                                                    cor,
                                                    str(abs(cor) > limit).upper()))
            
# Report N2O's correlations on the training set, flagging |r| > 0.7.
high_cor(var='N2O', df=train, limit=0.7)

N2O with MEI ::: -0.05081977547232952 ::: FALSE
N2O with CO2 ::: 0.9767198182174774 ::: TRUE
N2O with CH4 ::: 0.8998386437662852 ::: TRUE
N2O with CFC11 ::: 0.5224773187940129 ::: FALSE
N2O with CFC12 ::: 0.8679307757085132 ::: TRUE
N2O with TSI ::: 0.1997566794275066 ::: FALSE
N2O with Aerosols ::: -0.33705457070654865 ::: FALSE


In [21]:
# Which independent variables is CFC11 highly correlated with
# (absolute correlation greater than 0.7)?

high_cor(var='CFC11', df=train, limit=0.7)

CFC11 with MEI ::: 0.06900043872431204 ::: FALSE
CFC11 with CO2 ::: 0.514059748009346 ::: FALSE
CFC11 with CH4 ::: 0.7799040200233145 ::: TRUE
CFC11 with N2O ::: 0.5224773187940129 ::: FALSE
CFC11 with CFC12 ::: 0.8689851827830869 ::: TRUE
CFC11 with TSI ::: 0.2720459602966328 ::: FALSE
CFC11 with Aerosols ::: -0.04392119814776429 ::: FALSE


In [22]:
# Because the correlations above are so high, fit a reduced model that
# keeps N2O together with MEI, TSI and Aerosols as the independent
# variables. As before, only the training set is used for fitting.
#
# The quantity of interest is the N2O coefficient in this reduced model.

reduced_formula = "Temp ~ MEI + N2O + TSI + Aerosols"
reduced_model = sm.ols(formula=reduced_formula, data=train)
linreg2 = reduced_model.fit()

print(linreg2.summary2())

                 Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.722    
Dependent Variable: Temp             AIC:                -523.2892
Date:               2016-04-30 10:19 BIC:                -505.0444
No. Observations:   284              Log-Likelihood:     266.64   
Df Model:           4                F-statistic:        184.9    
Df Residuals:       279              Prob (F-statistic): 3.52e-77 
R-squared:          0.726            Scale:              0.0091145
------------------------------------------------------------------
                Coef.   Std.Err.    t    P>|t|    [0.025   0.975] 
------------------------------------------------------------------
Intercept     -116.2269  20.2230 -5.7473 0.0000 -156.0360 -76.4178
MEI              0.0642   0.0067  9.6494 0.0000    0.0511   0.0773
N2O              0.0253   0.0013 19.3069 0.0000    0.0227   0.0279
TSI              0.0795   0.0149  5.3437 0.0000    0.0502   0.1088
Aerosols     