In [1]:
import pandas as pd
import numpy as np

In [2]:
# Raw integer observations used to build the demo Series.
data = [
    1531,
    4684,
    8644,
    3232,
    65654,
    9779,
]

In [3]:
# Wrap the raw list in a pandas Series labelled 'numbers'.
data_series = pd.Series(data=data, name='numbers')

In [4]:
# Display the Series: default integer index, name 'numbers', dtype int64.
data_series

0     1531
1     4684
2     8644
3     3232
4    65654
5     9779
Name: numbers, dtype: int64

In [5]:
# Six consecutive calendar days starting on 5 March 2015.
dates = pd.date_range(start='20150305', periods=6, freq='D')

In [6]:
# Display the generated DatetimeIndex.
dates

DatetimeIndex(['2015-03-05', '2015-03-06', '2015-03-07', '2015-03-08',
               '2015-03-09', '2015-03-10'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# Swap the default integer index for six daily timestamps beginning 8 May 2016.
data_series.index = pd.date_range(start='20160508', periods=6, freq='D')

In [8]:
# Label-based lookup of a single day on the DatetimeIndex.
data_series.loc['2016-05-10']

8644

In [9]:
# Partial-string selection: every entry whose timestamp falls in 2016.
data_series.loc['2016']

2016-05-08     1531
2016-05-09     4684
2016-05-10     8644
2016-05-11     3232
2016-05-12    65654
2016-05-13     9779
Freq: D, Name: numbers, dtype: int64

In [10]:
# Third element by integer position. The index holds timestamps, so a bare
# data_series[2] relies on the deprecated positional fallback of
# Series.__getitem__; .iloc states the positional intent explicitly.
data_series.iloc[2]

8644

In [11]:
#data conversion

In [12]:
# Inspect the element type before converting.
data_series.dtype

dtype('int64')

In [13]:
# Convert the integer values to 64-bit floats. The np.float alias was removed
# from NumPy (it was only ever the builtin float), so name the concrete dtype.
data_series = data_series.astype(np.float64)

In [14]:
# Confirm the element type after the conversion.
data_series.dtype

dtype('float64')

In [15]:
# Fixing invalid (missing) data

In [16]:
# Blank out the entries at positions 1 and 2 to simulate missing data.
# np.nan is the canonical spelling (the np.NaN alias no longer exists in
# NumPy 2.x), and .iloc makes the positional slice explicit.
data_series.iloc[1:3] = np.nan

In [17]:
# Display the Series; the second and third entries now show NaN.
data_series

2016-05-08     1531.0
2016-05-09        NaN
2016-05-10        NaN
2016-05-11     3232.0
2016-05-12    65654.0
2016-05-13     9779.0
Freq: D, Name: numbers, dtype: float64

In [18]:
# Substitute 0 for every missing value and rebind the name to the result.
data_series = data_series.fillna(value=0)

In [19]:
data_series  # the former NaN entries now read 0.0

2016-05-08     1531.0
2016-05-09        0.0
2016-05-10        0.0
2016-05-11     3232.0
2016-05-12    65654.0
2016-05-13     9779.0
Freq: D, Name: numbers, dtype: float64

In [20]:
# Cycling readings; None marks a missing measurement.
cycling_data = [
    10.7,
    3.5,
    None,
    None,
    2.4,
    1.02,
    4,
    6,
]

In [21]:
# Build a two-column frame by pairing the list with the Series row by row.
# zip() stops at the shorter input, so cycling readings beyond
# len(data_series) are not included.
df = pd.DataFrame(
    [(cycled, number) for cycled, number in zip(cycling_data, data_series)]
)

In [22]:
# Display the frame; rows and columns carry default integer labels.
df

Unnamed: 0,0,1
0,10.7,1531.0
1,3.5,0.0
2,,0.0
3,,3232.0
4,2.4,65654.0
5,1.02,9779.0


In [23]:
# Pair the list with the Series (zip() truncates to the shorter input), then
# attach a daily DatetimeIndex and the column labels X and Y.
df = pd.DataFrame(
    data=list(zip(cycling_data, data_series)),
    index=pd.date_range(start='20150605', periods=6, freq='D'),
    columns=['X', 'Y'],
)

In [24]:
# Display the frame with its date index and X / Y column labels.
df

Unnamed: 0,X,Y
2015-06-05,10.7,1531.0
2015-06-06,3.5,0.0
2015-06-07,,0.0
2015-06-08,,3232.0
2015-06-09,2.4,65654.0
2015-06-10,1.02,9779.0


In [25]:
# Preview the frame with missing values replaced by 0. The result is only
# displayed, not assigned, so df itself keeps its NaNs.
df.fillna(value=0)

Unnamed: 0,X,Y
2015-06-05,10.7,1531.0
2015-06-06,3.5,0.0
2015-06-07,0.0,0.0
2015-06-08,0.0,3232.0
2015-06-09,2.4,65654.0
2015-06-10,1.02,9779.0


In [26]:
# Label-based (.loc) and position-based (.iloc) indexing

In [27]:
# Second row selected by integer position; the result is a Series of that row's X and Y.
df.iloc[1]

X    3.5
Y    0.0
Name: 2015-06-06 00:00:00, dtype: float64

In [28]:
# Partial-string selection on the date index: every row dated in 2015.
df.loc['2015']

Unnamed: 0,X,Y
2015-06-05,10.7,1531.0
2015-06-06,3.5,0.0
2015-06-07,,0.0
2015-06-08,,3232.0
2015-06-09,2.4,65654.0
2015-06-10,1.02,9779.0


In [29]:
# Select column X by label using bracket syntax.
df['X']

2015-06-05    10.70
2015-06-06     3.50
2015-06-07      NaN
2015-06-08      NaN
2015-06-09     2.40
2015-06-10     1.02
Freq: D, Name: X, dtype: float64

In [30]:
# Select column X using attribute access.
df.X

2015-06-05    10.70
2015-06-06     3.50
2015-06-07      NaN
2015-06-08      NaN
2015-06-09     2.40
2015-06-10     1.02
Freq: D, Name: X, dtype: float64

In [31]:
df.iloc[:,0]  # all rows of the first column by position, i.e. X

2015-06-05    10.70
2015-06-06     3.50
2015-06-07      NaN
2015-06-08      NaN
2015-06-09     2.40
2015-06-10     1.02
Freq: D, Name: X, dtype: float64

In [32]:
# Location of the Iris CSV. NOTE(review): this is an absolute, machine-specific
# path; adjust it (or make it relative to a data directory) before running elsewhere.
IRIS_CSV_PATH = 'c:/Users/kais/Desktop/INTEL DATA/data/Iris_Data.csv'
iris_df = pd.read_csv(IRIS_CSV_PATH)

In [33]:
# Peek at the first five rows of the freshly loaded frame.
iris_df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [34]:
# Add a derived column: element-wise product of sepal length and sepal width.
iris_df['sepal_area'] = iris_df['sepal_length'] * iris_df['sepal_width']

In [35]:
# First five rows again, now including the sepal_area column.
iris_df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area
0,5.1,3.5,1.4,0.2,Iris-setosa,17.85
1,4.9,3.0,1.4,0.2,Iris-setosa,14.7
2,4.7,3.2,1.3,0.2,Iris-setosa,15.04
3,4.6,3.1,1.5,0.2,Iris-setosa,14.26
4,5.0,3.6,1.4,0.2,Iris-setosa,18.0


In [36]:
# Deriving a new column by transforming an existing DataFrame column

In [37]:
# Strip the literal 'Iris-' prefix from each species name. The vectorised
# .str accessor replaces the per-row Python lambda and passes missing values
# through instead of raising; regex=False treats the pattern as plain text.
iris_df['abbrev'] = iris_df['species'].str.replace('Iris-', '', regex=False)

In [38]:
# Display the whole frame; the rendered output elides the middle rows.
iris_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area,abbrev
0,5.1,3.5,1.4,0.2,Iris-setosa,17.85,setosa
1,4.9,3.0,1.4,0.2,Iris-setosa,14.70,setosa
2,4.7,3.2,1.3,0.2,Iris-setosa,15.04,setosa
3,4.6,3.1,1.5,0.2,Iris-setosa,14.26,setosa
4,5.0,3.6,1.4,0.2,Iris-setosa,18.00,setosa
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica,20.10,virginica
146,6.3,2.5,5.0,1.9,Iris-virginica,15.75,virginica
147,6.5,3.0,5.2,2.0,Iris-virginica,19.50,virginica
148,6.2,3.4,5.4,2.3,Iris-virginica,21.08,virginica


In [39]:
# Concatenating two DataFrames, i.e. stacking the rows of one under the other

In [40]:
# Stack the first two and the last two rows of iris_df into one small frame;
# the original row labels are kept.
small_data = pd.concat([iris_df.head(2), iris_df.tail(2)], axis=0)

In [41]:
# Display the four-row frame assembled from the first and last two rows of iris_df.
small_data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area,abbrev
0,5.1,3.5,1.4,0.2,Iris-setosa,17.85,setosa
1,4.9,3.0,1.4,0.2,Iris-setosa,14.7,setosa
148,6.2,3.4,5.4,2.3,Iris-virginica,21.08,virginica
149,5.9,3.0,5.1,1.8,Iris-virginica,17.7,virginica


In [42]:
#aggregation functions

In [43]:
# Count the rows that belong to each species.
iris_df.groupby(by='species').size()

species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64

In [44]:
#statistics

In [45]:
# Column-wise mean of the numeric columns. numeric_only=True is required
# because the frame also holds string columns (species, abbrev); recent pandas
# raises TypeError on them instead of silently dropping them.
iris_df.mean(numeric_only=True)

sepal_length     5.843333
sepal_width      3.054000
petal_length     3.758667
petal_width      1.198667
sepal_area      17.806533
dtype: float64

In [46]:
# Column-wise sum. Note that the string columns are not skipped: their values
# are concatenated, which is why the result has dtype object.
iris_df.sum()

sepal_length                                                876.5
sepal_width                                                 458.1
petal_length                                                563.8
petal_width                                                 179.8
species         Iris-setosaIris-setosaIris-setosaIris-setosaIr...
sepal_area                                                2670.98
abbrev          setosasetosasetosasetosasetosasetosasetosaseto...
dtype: object

In [47]:
# Column-wise standard deviation of the numeric columns. numeric_only=True
# keeps the string columns (species, abbrev) out of the reduction, which
# recent pandas would otherwise reject with a TypeError.
iris_df.std(numeric_only=True)

sepal_length    0.828066
sepal_width     0.433594
petal_length    1.764420
petal_width     0.763161
sepal_area      3.368693
dtype: float64

In [48]:
# Column-wise variance of the numeric columns. numeric_only=True keeps the
# string columns (species, abbrev) out of the reduction, which recent pandas
# would otherwise reject with a TypeError.
iris_df.var(numeric_only=True)

sepal_length     0.685694
sepal_width      0.188004
petal_length     3.113179
petal_width      0.582414
sepal_area      11.348090
dtype: float64

In [49]:
# Column-wise median of the numeric columns. numeric_only=True keeps the
# string columns (species, abbrev) out of the reduction, which recent pandas
# would otherwise reject with a TypeError.
iris_df.median(numeric_only=True)

sepal_length     5.80
sepal_width      3.00
petal_length     4.35
petal_width      1.30
sepal_area      17.66
dtype: float64

In [50]:
# Most frequent value(s) of every column; columns with fewer modes than the
# longest one are padded with NaN in the extra rows.
iris_df.mode()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area,abbrev
0,5.0,3.0,1.5,0.2,Iris-setosa,15.66,setosa
1,,,,,Iris-versicolor,,versicolor
2,,,,,Iris-virginica,,virginica


In [51]:
# Median of a single column, returned as a scalar.
iris_df['petal_length'].median()

4.35

In [52]:
# 0th quantile (i.e. the minimum) of each numeric column. numeric_only=True is
# needed because quantiles are undefined for the string columns (species,
# abbrev) and recent pandas no longer drops them implicitly.
iris_df.quantile(0, numeric_only=True)

sepal_length     4.3
sepal_width      2.0
petal_length     1.0
petal_width      0.1
sepal_area      10.0
Name: 0, dtype: float64

In [53]:
# Summary table (count, mean, std, min, quartiles, max) for the numeric columns.
iris_df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,sepal_area
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667,17.806533
std,0.828066,0.433594,1.76442,0.763161,3.368693
min,4.3,2.0,1.0,0.1,10.0
25%,5.1,2.8,1.6,0.3,15.645
50%,5.8,3.0,4.35,1.3,17.66
75%,6.4,3.3,5.1,1.8,20.325
max,7.9,4.4,6.9,2.5,30.02


In [62]:
# Draw five distinct rows at random; the fixed seed makes the draw repeatable.
sample = iris_df.sample(n=5, replace=False, random_state=42)

In [63]:
# Display the five sampled rows.
sample

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area,abbrev
73,6.1,2.8,4.7,1.2,Iris-versicolor,17.08,versicolor
18,5.7,3.8,1.7,0.3,Iris-setosa,21.66,setosa
118,7.7,2.6,6.9,2.3,Iris-virginica,20.02,virginica
78,6.0,2.9,4.5,1.5,Iris-versicolor,17.4,versicolor
76,6.8,2.8,4.8,1.4,Iris-versicolor,19.04,versicolor


In [64]:
# Display the whole frame; the rendered output elides the middle rows.
iris_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area,abbrev
0,5.1,3.5,1.4,0.2,Iris-setosa,17.85,setosa
1,4.9,3.0,1.4,0.2,Iris-setosa,14.70,setosa
2,4.7,3.2,1.3,0.2,Iris-setosa,15.04,setosa
3,4.6,3.1,1.5,0.2,Iris-setosa,14.26,setosa
4,5.0,3.6,1.4,0.2,Iris-setosa,18.00,setosa
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica,20.10,virginica
146,6.3,2.5,5.0,1.9,Iris-virginica,15.75,virginica
147,6.5,3.0,5.2,2.0,Iris-virginica,19.50,virginica
148,6.2,3.4,5.4,2.3,Iris-virginica,21.08,virginica
