# Preprocessing

In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('GlobalTemperatures.csv', parse_dates=['dt'])
data.head()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3192 entries, 0 to 3191
Data columns (total 9 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   dt                                         3192 non-null   datetime64[ns]
 1   LandAverageTemperature                     3180 non-null   float64       
 2   LandAverageTemperatureUncertainty          3180 non-null   float64       
 3   LandMaxTemperature                         1992 non-null   float64       
 4   LandMaxTemperatureUncertainty              1992 non-null   float64       
 5   LandMinTemperature                         1992 non-null   float64       
 6   LandMinTemperatureUncertainty              1992 non-null   float64       
 7   LandAndOceanAverageTemperature             1992 non-null   float64       
 8   LandAndOceanAverageTemperatureUncertainty  1992 non-null   float64       
dtypes: datetime64[ns](1)

### Wypełnienie brakujących danych średnią wartością

In [9]:
data = data.apply(lambda x: x.fillna(x.mean()), axis = 0)

TypeError: unsupported operand type(s) for +: 'Timestamp' and 'Timestamp'

In [11]:
data.head()

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574,,,,,,
1,1750-02-01,3.083,3.702,,,,,,
2,1750-03-01,5.626,3.076,,,,,,
3,1750-04-01,8.49,2.451,,,,,,
4,1750-05-01,11.573,2.072,,,,,,


#### Sprawdzenie efektów wypełnienia brakujących danych

- wypłenienie brakujących danych przebiegło efektywnie 
- nastąpiły zmiany w wartościach opisu statystycznego danych
- nastąpiły zmiany w histogramach

In [12]:
data.describe()

Unnamed: 0,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
count,3180.0,3180.0,1992.0,1992.0,1992.0,1992.0,1992.0,1992.0
mean,8.374731,0.938468,14.350601,0.479782,2.743595,0.431849,15.212566,0.128532
std,4.38131,1.09644,4.309579,0.583203,4.155835,0.445838,1.274093,0.073587
min,-2.08,0.034,5.9,0.044,-5.407,0.045,12.475,0.042
25%,4.312,0.18675,10.212,0.142,-1.3345,0.155,14.047,0.063
50%,8.6105,0.392,14.76,0.252,2.9495,0.279,15.251,0.122
75%,12.54825,1.41925,18.4515,0.539,6.77875,0.45825,16.39625,0.151
max,19.021,7.88,21.32,4.373,9.715,3.498,17.611,0.457


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3192 entries, 0 to 3191
Data columns (total 9 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   dt                                         3192 non-null   datetime64[ns]
 1   LandAverageTemperature                     3180 non-null   float64       
 2   LandAverageTemperatureUncertainty          3180 non-null   float64       
 3   LandMaxTemperature                         1992 non-null   float64       
 4   LandMaxTemperatureUncertainty              1992 non-null   float64       
 5   LandMinTemperature                         1992 non-null   float64       
 6   LandMinTemperatureUncertainty              1992 non-null   float64       
 7   LandAndOceanAverageTemperature             1992 non-null   float64       
 8   LandAndOceanAverageTemperatureUncertainty  1992 non-null   float64       
dtypes: datetime64[ns](1)

In [14]:
data.isna().sum()

dt                                              0
LandAverageTemperature                         12
LandAverageTemperatureUncertainty              12
LandMaxTemperature                           1200
LandMaxTemperatureUncertainty                1200
LandMinTemperature                           1200
LandMinTemperatureUncertainty                1200
LandAndOceanAverageTemperature               1200
LandAndOceanAverageTemperatureUncertainty    1200
dtype: int64

### Kodowanie 

W naszym przypadku nie ma danych kategorycznych, dlatego nie ma potrzeby stosowania kodowania ich na dane numeryczne.

### Normalizacja 

Ze względu na to, że wartości cech (z wykluczeniem daty pomiaru) sięgają skali ujemnej, odpowiednim zabiegiem dla naszego zestawu danych będzie normalizacja do przedziału od -1 do 1 celem ujednolicenia wszystkich atrybutów.

In [15]:
def normalize(data):
    normData = 2 * ((data - data.min())/(data.max() - data.min())) - 1
    normData["dt"] = data["dt"]
    return normData

In [16]:
data = normalize(data)
%store data
data.head()

Stored 'data' (DataFrame)


Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,-0.515284,-0.097629,,,,,,
1,1750-02-01,-0.510639,-0.065001,,,,,,
2,1750-03-01,-0.269608,-0.224573,,,,,,
3,1750-04-01,0.001848,-0.38389,,,,,,
4,1750-05-01,0.294062,-0.4805,,,,,,


In [None]:
data.describe()

## Analiza danych po wstępnym przetworzeniu danych

In [None]:
data.hist(figsize = (25, 25), alpha = 0.8)

In [None]:
# pip install ipynb
from ipynb.fs.full.analysis import makeThreePlot

In [None]:
tempByYear = data.groupby(data.dt.dt.year).mean()

In [None]:
makeThreePlot(tempByYear, 'Land temperature between 1750 and 2015', 'year')

In [None]:
tempByMonth = data.groupby(data.dt.dt.month).mean()

In [None]:
makeThreePlot(tempByMonth, 'Land temperature per month', 'month')