In [1]:
import pandas as pd
import matplotlib.pyplot as plot
from sklearn import linear_model,datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import r2_score

In [2]:
def read_file1(datafile, metadatafile):
    df_actualdata = pd.read_csv(datafile,index_col=False,header=2)
    df_actualdata = df_actualdata[df_actualdata.columns[:-1]]
    df_actualdata= df_actualdata.fillna(0)
    
    df_metadata = pd.read_csv(metadatafile,header=0)
    df_metadata = df_metadata[['Country Code', 'Region', 'IncomeGroup']]
    df_metadata = df_metadata.dropna()
    
    df_actual_metadata_merged = df_actualdata.merge(df_metadata,on=['Country Code'])
    
    return df_actual_metadata_merged

In [3]:
def read_file2(datafile, metadatafile):
    df_actualdata = pd.read_csv(datafile,index_col=False,header=4)
    df_actualdata = df_actualdata[df_actualdata.columns[:-1]]
    df_actualdata= df_actualdata.fillna(0)
    
    df_metadata = pd.read_csv(metadatafile,header=0)
    df_metadata = df_metadata[['Country Code', 'Region', 'IncomeGroup']]
    df_metadata = df_metadata.dropna()
    
    df_actual_metadata_merged = df_actualdata.merge(df_metadata,on=['Country Code'])
    
    return df_actual_metadata_merged

In [20]:
co2_emission = read_file1("/BusinessIntelligence/Datasets/CO2Emissions/co2emissions_data_worldbank.csv","/BusinessIntelligence/Datasets/CO2Emissions/metadata_country_co2emission.csv")
gdp = read_file1("/BusinessIntelligence/Datasets/GDP/gdp_data_worldbank.csv","/BusinessIntelligence/Datasets/GDP/metadata_country_gdp.csv")
population = read_file1("/BusinessIntelligence/Datasets/Population/population_data_worldbank.csv","/BusinessIntelligence/Datasets/Population/metadata_country_population.csv")


In [9]:
co2_emission = co2_emission.drop(['Indicator Name','Indicator Code','2015','2016','2017','2018'],axis=1)
gdp = gdp.drop(['Country Name','Indicator Name','Indicator Code','2015','2016','2017','Region','IncomeGroup'],axis=1)
population = population.drop(['Country Name','Indicator Name','Indicator Code','2015','2016','2017','Region','IncomeGroup'],axis=1)


In [10]:
co2_emission_melted = pd.melt(co2_emission,id_vars=['Country Name','Country Code','Region','IncomeGroup'],
                             var_name='year', value_name='co2_emission')

gdp_melt = pd.melt(gdp,id_vars=['Country Code'],
                             var_name='year', value_name='gdp')
                                
population_melt = pd.melt(population,id_vars=['Country Code'],
                             var_name='year', value_name='population')



In [11]:
co2_emission_gdp_population = co2_emission_melted.merge(
    gdp_melt, on = ['Country Code','year']).merge(population_melt,on = ['Country Code','year'])

co2_emission_gdp_population = co2_emission_gdp_population[(co2_emission_gdp_population != 0).all(1)]


In [12]:
co2_emission_gdp_population.to_excel('CO2Emission_GDP_Population.xlsx', index=False)

In [13]:
df_co2_emission_gdp_population = co2_emission_gdp_population.drop(['Country Name','Country Code', 
                                                                   'Region', 'IncomeGroup'], axis=1)


df_co2_emission_gdp_population.corr()


Unnamed: 0,co2_emission,gdp,population
co2_emission,1.0,0.763989,0.613156
gdp,0.763989,1.0,0.318468
population,0.613156,0.318468,1.0


In [14]:
co2_emission_gdp_population['year'] = co2_emission_gdp_population['year'].astype('int')

df = co2_emission_gdp_population.drop(['Country Name','Country Code',
                                       'Region', 'IncomeGroup'], axis=1)

train =  df[co2_emission_gdp_population['year'] <= 2011]
test  =  df[co2_emission_gdp_population['year'] > 2011]

train_x = train.drop(['year','co2_emission'],axis=1)
test_x  = test.drop(['year','co2_emission'],axis=1)

train_y =  train['co2_emission']
test_y  =  test['co2_emission']

In [15]:
regr = linear_model.LinearRegression()
regr.fit(train_x,train_y)

y_pred = regr.predict(test_x)

linear_test_score = r2_score(test_y,y_pred)

print('Linear Regression Test Score:%.2f' % linear_test_score )


Linear Regression Test Score:0.78


In [16]:
rf = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=0)
rf.fit(train_x,train_y)

predicted_test = rf.predict(test_x)

test_score = r2_score(test_y,predicted_test)

print('Random Forest Test Score:%.2f' % test_score )



Random Forest Test Score:0.87


In [17]:
test['predicted_co2_emmissions'] = predicted_test
test.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,year,co2_emission,gdp,population,predicted_co2_emmissions
11284,2012,1349.456,2534637000.0,102577.0,2545.41138
11285,2012,10755.311,19907320000.0,30696958.0,51865.629962
11286,2012,33399.036,128052900000.0,25096150.0,195370.147308
11287,2012,4910.113,12319780000.0,2900401.0,6906.457136
11288,2012,487.711,3164615000.0,82431.0,548.832556
11289,2012,176386.367,374590600000.0,8900453.0,56613.111512
11290,2012,192356.152,545982400000.0,42096739.0,286723.690754
11291,2012,5694.851,10619320000.0,2881922.0,6724.163232
11293,2012,524.381,1211412000.0,96777.0,541.491222
11294,2012,388126.281,1543411000000.0,22742475.0,399488.106466


In [18]:
co2_emission_gdp_population['year'] = co2_emission_gdp_population['year'].astype('int')
d_co2 =  co2_emission_gdp_population[co2_emission_gdp_population['year']>2011]
test['Country Name'] = d_co2['Country Name']
test.to_excel("TestData.xlsx",index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
dt = DecisionTreeRegressor(random_state=0, criterion="mae")
dt_fit = dt.fit(train_x,train_y)

dt_predicted_test = dt_fit.predict(test_x)
dt_test_score = r2_score(test_y,dt_predicted_test)
print('Decision Tree Test Score:%.2f' % dt_test_score )



Decision Tree Test Score:0.81
