In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
df = pd.read_csv("co2_mm_mlo.txt", sep = r"\s*", skiprows = 72, 
                 header = None, 
                 names = ['Year', 'Month', 'Inferred_Date', 
                            'CO2_average_ppm', 'Interpolated_CO2_ppm', 'seasonal_trend',
                            'days'])

  """
  yield pat.split(line.strip())
  yield pat.split(line.strip())


In [3]:
df.tail()

Unnamed: 0,Year,Month,Inferred_Date,CO2_average_ppm,Interpolated_CO2_ppm,seasonal_trend,days
730,2019,1,2019.042,410.83,410.83,410.53,27
731,2019,2,2019.125,411.75,411.75,410.95,27
732,2019,3,2019.208,411.97,411.97,410.48,28
733,2019,4,2019.292,413.32,413.32,410.49,26
734,2019,5,2019.375,414.66,414.66,411.26,28


In [4]:
three_cols = df.iloc[:, [0, 1, 3]]

In [5]:
three_cols.head()

Unnamed: 0,Year,Month,CO2_average_ppm
0,1958,3,315.71
1,1958,4,317.45
2,1958,5,317.5
3,1958,6,-99.99
4,1958,7,315.86


In [6]:
three_cols.iloc[0,:]

Year               1958.00
Month                 3.00
CO2_average_ppm     315.71
Name: 0, dtype: float64

In [7]:

for i in range(three_cols.shape[0]):
    vals = three_cols.iloc[i,:]
    if vals['CO2_average_ppm'] == -99.99: 
        three_cols.iloc[i, 2] = np.nan
        
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [8]:
three_cols.head()

Unnamed: 0,Year,Month,CO2_average_ppm
0,1958,3,315.71
1,1958,4,317.45
2,1958,5,317.5
3,1958,6,
4,1958,7,315.86


In [9]:
CO2_measurements = three_cols.copy()

In [10]:
CO2_measurements['Day'] = 15

In [11]:
CO2_measurements.head()

Unnamed: 0,Year,Month,CO2_average_ppm,Day
0,1958,3,315.71,15
1,1958,4,317.45,15
2,1958,5,317.5,15
3,1958,6,,15
4,1958,7,315.86,15


In [12]:
CO2_measurements.dtypes

Year                 int64
Month                int64
CO2_average_ppm    float64
Day                  int64
dtype: object

In [13]:
dt.date(int(CO2_measurements.iloc[0, :]['Year']), 
        int(CO2_measurements.iloc[0, :]['Month']), 
        int(CO2_measurements.iloc[0, :]['Day']))

datetime.date(1958, 3, 15)

In [14]:
CO2_measurements.apply(lambda x: x['Year'], axis = 1).head()

0    1958.0
1    1958.0
2    1958.0
3    1958.0
4    1958.0
dtype: float64

In [15]:
CO2_measurements['Date'] = CO2_measurements.apply(lambda x: dt.date(year = int(x['Year']), 
                                                                    month = int(x['Month']), 
                                                                    day = int(x['Day'])), 
                                                 axis = 1)

In [16]:
CO2_measurements.head()

Unnamed: 0,Year,Month,CO2_average_ppm,Day,Date
0,1958,3,315.71,15,1958-03-15
1,1958,4,317.45,15,1958-04-15
2,1958,5,317.5,15,1958-05-15
3,1958,6,,15,1958-06-15
4,1958,7,315.86,15,1958-07-15


In [17]:
CO2_measurements = CO2_measurements.drop('Day', axis = 1)

In [18]:
yearly_CO2_mean = CO2_measurements.groupby(by = 'Year').mean()['CO2_average_ppm']

We would like to select `CO_average_ppm`, `seasonal_trend` and `Date` in order to start building a model. 

In [19]:
four_cols = df.iloc[:, [0, 1, 3, 5]].copy()
four_cols['Day'] = 15
four_cols['Date'] = four_cols.apply(lambda x: dt.date(year = int(x['Year']), 
                                                                    month = int(x['Month']), 
                                                                    day = int(x['Day'])), 
                                                 axis = 1)
four_cols = four_cols[['Date','CO2_average_ppm','seasonal_trend']]
four_cols_array = np.array(four_cols)
four_cols_array

array([[datetime.date(1958, 3, 15), 315.71, 314.62],
       [datetime.date(1958, 4, 15), 317.45, 315.29],
       [datetime.date(1958, 5, 15), 317.5, 314.71],
       ...,
       [datetime.date(2019, 3, 15), 411.97, 410.48],
       [datetime.date(2019, 4, 15), 413.32, 410.49],
       [datetime.date(2019, 5, 15), 414.66, 411.26]], dtype=object)

In [20]:
df2 = df.copy()
df2.CO2_average_ppm = np.where(df2.CO2_average_ppm == -99.99, np.nan, df2.CO2_average_ppm)
df2.head()

Unnamed: 0,Year,Month,Inferred_Date,CO2_average_ppm,Interpolated_CO2_ppm,seasonal_trend,days
0,1958,3,1958.208,315.71,315.71,314.62,-1
1,1958,4,1958.292,317.45,317.45,315.29,-1
2,1958,5,1958.375,317.5,317.5,314.71,-1
3,1958,6,1958.458,,317.1,314.85,-1
4,1958,7,1958.542,315.86,315.86,314.98,-1
