# Pandas Serisi Oluşturmak

In [5]:
import pandas as pd

In [3]:
pd.Series([1,2,3,4])

0    1
1    2
2    3
3    4
dtype: int64

In [4]:
seri = pd.Series([1,2,3,4])

In [5]:
type(seri)

pandas.core.series.Series

In [6]:
seri.axes

[RangeIndex(start=0, stop=4, step=1)]

In [7]:
seri.dtype

dtype('int64')

In [8]:
seri.size

4

In [9]:
seri.ndim

1

In [11]:
seri.values

array([1, 2, 3, 4])

In [19]:
seri.head()

0    1
1    2
2    3
3    4
dtype: int64

In [18]:
seri.head(3)

0    1
1    2
2    3
dtype: int64

In [21]:
seri.tail(2)

2    3
3    4
dtype: int64

In [22]:
# index isimlendirilmesi

In [23]:
pd.Series([1,2,5,6,8,85,4,87])

0     1
1     2
2     5
3     6
4     8
5    85
6     4
7    87
dtype: int64

In [27]:
pd.Series([1,2,5,6,8,85,4,87], index = [1,2,3,4,5,6,7,8])

1     1
2     2
3     5
4     6
5     8
6    85
7     4
8    87
dtype: int64

In [33]:
seri = pd.Series([1,2,5,6,8,85,4,87], index = ["a","b","c","d","e","f","y","m"])

In [36]:
seri["a"]

1

In [38]:
seri["a":"f"]

a     1
b     2
c     5
d     6
e     8
f    85
dtype: int64

In [39]:
#sozluk üzerinden liste olusturmak

In [7]:
sozluk = {"reg" : 10, "log" : 11, "cart": 12}

In [8]:
seri = pd.Series(sozluk)

In [9]:
seri

reg     10
log     11
cart    12
dtype: int64

In [10]:
# iki seriyi birlestirerek seri olusturma

In [11]:
pd.concat([seri , seri])

reg     10
log     11
cart    12
reg     10
log     11
cart    12
dtype: int64

# Eleman İşlemleri

In [12]:
import numpy as np
a = np.array([1,2,3,4,5])
seri = pd.Series(a)
seri

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [13]:
seri[0]

1

In [53]:
seri[0:3]

0    1
1    2
2    3
dtype: int64

In [56]:
seri = pd.Series([121,200,150,99], index = ["reg","loj","cart","rf"])

In [57]:
seri

reg     121
loj     200
cart    150
rf       99
dtype: int64

In [58]:
seri.index

Index(['reg', 'loj', 'cart', 'rf'], dtype='object')

In [59]:
seri.keys

<bound method Series.keys of reg     121
loj     200
cart    150
rf       99
dtype: int64>

In [61]:
list(seri.items())

[('reg', 121), ('loj', 200), ('cart', 150), ('rf', 99)]

In [62]:
seri.values

array([121, 200, 150,  99])

In [63]:
#eleman sorgulama

In [64]:
"reg" in seri

True

In [65]:
"a" in seri

False

In [66]:
seri["reg"]

121

In [67]:
#fancy eleman

In [68]:
seri[["rf", "reg"]]

rf      99
reg    121
dtype: int64

In [72]:
seri["reg"] = 130
seri["reg"]

130

In [75]:
seri["reg":"loj"]

reg    130
loj    200
dtype: int64

# Pandas DataFrame Oluşturma

In [77]:
#numpy sabit veri tipli veri yapısı olduğu için kategorik ve fix değişiklikte kolay olmuyor.
#veri analizlerinde ileri seviyelerde veri manipülasyonda ve veri analizlerinde numpy kullanışlı değildir. 
# Teorik yapılar için numpy
# Analitik anlamda Pandas, makine öğrenmesi modellerine vereceğimiz veri setleri olarak düşünebiliriz.

In [76]:
import pandas as pd

In [79]:
l = [1,2,3,4]
l

[1, 2, 3, 4]

In [80]:
pd.DataFrame(l, columns = ["degisken_ismi"])

Unnamed: 0,degisken_ismi
0,1
1,2
2,3
3,4


In [82]:
import numpy as np
m = np.arange(1,10).reshape((3,3))
m

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [83]:
pd.DataFrame(m, columns = ["var1", "var2","var3"])

Unnamed: 0,var1,var2,var3
0,1,2,3
1,4,5,6
2,7,8,9


In [84]:
#df isimlendirme

In [86]:
df = pd.DataFrame(m, columns = ["var1", "var2","var3"])
df.head()

Unnamed: 0,var1,var2,var3
0,1,2,3
1,4,5,6
2,7,8,9


In [87]:
df.columns = ("deg1", "deg2", "deg3")
df

Unnamed: 0,deg1,deg2,deg3
0,1,2,3
1,4,5,6
2,7,8,9


In [88]:
type(df)

pandas.core.frame.DataFrame

In [89]:
df.axes

[RangeIndex(start=0, stop=3, step=1),
 Index(['deg1', 'deg2', 'deg3'], dtype='object')]

In [90]:
df.shape

(3, 3)

In [91]:
df.ndim

2

In [92]:
df.size

9

In [93]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [94]:
type(df.values)

numpy.ndarray

In [95]:
df.head()

Unnamed: 0,deg1,deg2,deg3
0,1,2,3
1,4,5,6
2,7,8,9


In [97]:
df.tail(1)

Unnamed: 0,deg1,deg2,deg3
2,7,8,9


In [98]:
a = np.array([1,2,3,4,5])

In [99]:
pd.DataFrame(a, columns = ["deg1"])

Unnamed: 0,deg1
0,1
1,2
2,3
3,4
4,5


In [100]:
pd.DataFrame(a, columns = ["deg1"], index = [1,2,3,4,5])

Unnamed: 0,deg1
1,1
2,2
3,3
4,4
5,5


# Eleman İşlemleri

In [101]:
import numpy as np

In [134]:
s1 = np.random.randint(10, size = 5)
s2 = np.random.randint(10, size = 5)
s3 = np.random.randint(10, size = 5)

In [135]:
sozluk = {"var1": s1, "var2": s2, "var3": s3}

In [136]:
sozluk

{'var1': array([6, 0, 6, 8, 6]),
 'var2': array([3, 3, 5, 7, 0]),
 'var3': array([8, 2, 8, 6, 7])}

In [137]:
df = pd.DataFrame(sozluk)

In [138]:
df

Unnamed: 0,var1,var2,var3
0,6,3,8
1,0,3,2
2,6,5,8
3,8,7,6
4,6,0,7


In [139]:
df[0:1]

Unnamed: 0,var1,var2,var3
0,6,3,8


In [140]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [141]:
df.index = ["a","b","c","d","e"]

In [142]:
df

Unnamed: 0,var1,var2,var3
a,6,3,8
b,0,3,2
c,6,5,8
d,8,7,6
e,6,0,7


In [143]:
df[0:1]

Unnamed: 0,var1,var2,var3
a,6,3,8


In [144]:
df["a":"c"]

Unnamed: 0,var1,var2,var3
a,6,3,8
b,0,3,2
c,6,5,8


In [145]:
#silme

In [146]:
#sadece bu dataframe içinden silinir ana dosyadan silinmez
df.drop("a", axis = 0)

Unnamed: 0,var1,var2,var3
b,0,3,2
c,6,5,8
d,8,7,6
e,6,0,7


In [147]:
df

Unnamed: 0,var1,var2,var3
a,6,3,8
b,0,3,2
c,6,5,8
d,8,7,6
e,6,0,7


In [149]:
#inplace argümanı ile kalıcı olarak silinir
df.drop("a", axis = 0, inplace = True)

In [150]:
df

Unnamed: 0,var1,var2,var3
b,0,3,2
c,6,5,8
d,8,7,6
e,6,0,7


In [151]:
#fancy 

In [152]:
l = ["c","e"]

In [153]:
df.drop(l, axis = 0)

Unnamed: 0,var1,var2,var3
b,0,3,2
d,8,7,6


In [154]:
#degiskenler icin

In [155]:
df

Unnamed: 0,var1,var2,var3
b,0,3,2
c,6,5,8
d,8,7,6
e,6,0,7


In [156]:
"var1" in df

True

In [157]:
l = ["var1", "var4", "var2"]

In [158]:
for i in l:
    print(i in df)

True
False
True


In [159]:
df

Unnamed: 0,var1,var2,var3
b,0,3,2
c,6,5,8
d,8,7,6
e,6,0,7


In [161]:
df["var1"]

b    0
c    6
d    8
e    6
Name: var1, dtype: int64

In [176]:
df["var4"] = df["var1"] / df["var2"]

In [177]:
df

Unnamed: 0,var1,var2,var3,var4
b,0,3,2,0.0
c,6,5,8,1.2
d,8,7,6,1.142857
e,6,0,7,inf


In [175]:
#degisken silme

In [180]:
df.drop("var4", axis = 1)

Unnamed: 0,var1,var2,var3
b,0,3,2
c,6,5,8
d,8,7,6
e,6,0,7


In [168]:
df

Unnamed: 0,var1,var2,var3,var4
b,0,3,2,0.0
c,6,5,8,1.2
d,8,7,6,1.142857
e,6,0,7,inf


In [169]:
df.drop("var4", axis = 1, inplace = True)

In [170]:
df

Unnamed: 0,var1,var2,var3
b,0,3,2
c,6,5,8
d,8,7,6
e,6,0,7


In [171]:
l = ["var1","var2"]

In [172]:
df.drop(l, axis = 1)

Unnamed: 0,var3
b,2
c,8
d,6
e,7


# Gözlem ve Değişken Seçimi: loc & iloc

In [183]:
?np.random.randint

[0;31mDocstring:[0m
randint(low, high=None, size=None, dtype=int)

Return random integers from `low` (inclusive) to `high` (exclusive).

Return random integers from the "discrete uniform" distribution of
the specified dtype in the "half-open" interval [`low`, `high`). If
`high` is None (the default), then results are from [0, `low`).

.. note::
    New code should use the ``integers`` method of a ``default_rng()``
    instance instead; please see the :ref:`random-quick-start`.

Parameters
----------
low : int or array-like of ints
    Lowest (signed) integers to be drawn from the distribution (unless
    ``high=None``, in which case this parameter is one above the
    *highest* such integer).
high : int or array-like of ints, optional
    If provided, one above the largest (signed) integer to be drawn
    from the distribution (see above for behavior if ``high=None``).
    If array-like, must contain integer values
size : int or tuple of ints, optional
    Output shape.  If the given

In [182]:
import numpy as np
import pandas as pd
m = np.random.randint(1,30, size=(10,3))
df = pd.DataFrame(m, columns =["var1","var2","var3"])
df

Unnamed: 0,var1,var2,var3
0,5,9,9
1,17,8,9
2,28,2,26
3,2,28,13
4,24,21,27
5,14,12,21
6,22,23,9
7,22,19,21
8,28,16,10
9,29,27,25


In [184]:
#loc: tanımlandığı şekli ile seçim yapmak için kullanılır.

In [185]:
df.loc[0:3]

Unnamed: 0,var1,var2,var3
0,5,9,9
1,17,8,9
2,28,2,26
3,2,28,13


In [188]:
#iloc: alışık olduğumuz indeksleme mantığı ile seçim yapar. 'e kadar seçim yaparız

In [189]:
df.iloc[0:3]

Unnamed: 0,var1,var2,var3
0,5,9,9
1,17,8,9
2,28,2,26


In [190]:
df.iloc[0,0]

5

In [191]:
df.iloc[:3,:2]

Unnamed: 0,var1,var2
0,5,9
1,17,8
2,28,2


In [194]:
df.loc[0:3,"var3"]
# indeksleme

0     9
1     9
2    26
3    13
Name: var3, dtype: int64

In [200]:
df.iloc[0:3,0:3]

Unnamed: 0,var1,var2,var3
0,5,9,9
1,17,8,9
2,28,2,26


In [201]:
df.iloc[0:3,0:3]["var3"]

0     9
1     9
2    26
Name: var3, dtype: int64

In [202]:
df["var1"]

0     5
1    17
2    28
3     2
4    24
5    14
6    22
7    22
8    28
9    29
Name: var1, dtype: int64

In [203]:
df[0:2]

Unnamed: 0,var1,var2,var3
0,5,9,9
1,17,8,9


In [204]:
df[0:2]["var1"]

0     5
1    17
Name: var1, dtype: int64

In [207]:
#fancy yardımıyla
df[0:2][["var1","var2"]]

Unnamed: 0,var1,var2
0,5,9
1,17,8


In [208]:
df

Unnamed: 0,var1,var2,var3
0,5,9,9
1,17,8,9
2,28,2,26
3,2,28,13
4,24,21,27
5,14,12,21
6,22,23,9
7,22,19,21
8,28,16,10
9,29,27,25


In [209]:
df.var1 

0     5
1    17
2    28
3     2
4    24
5    14
6    22
7    22
8    28
9    29
Name: var1, dtype: int64

In [210]:
df[df.var1 > 15 ]

Unnamed: 0,var1,var2,var3
1,17,8,9
2,28,2,26
4,24,21,27
6,22,23,9
7,22,19,21
8,28,16,10
9,29,27,25


In [211]:
df[df.var1 > 15 ]["var1"]

1    17
2    28
4    24
6    22
7    22
8    28
9    29
Name: var1, dtype: int64

In [212]:
df[df.var1 > 15 ]["var2"]

1     8
2     2
4    21
6    23
7    19
8    16
9    27
Name: var2, dtype: int64

In [215]:
df[(df.var1 > 15) & (df.var3 < 20)]

Unnamed: 0,var1,var2,var3
1,17,8,9
6,22,23,9
8,28,16,10


In [222]:
df.loc[(df.var1 > 15),["var1", "var2"]]

Unnamed: 0,var1,var2
1,17,8
2,28,2
4,24,21
6,22,23
7,22,19
8,28,16
9,29,27


In [219]:
(df.var1 > 15),["var1"]

(0    False
 1     True
 2     True
 3    False
 4     True
 5    False
 6     True
 7     True
 8     True
 9     True
 Name: var1, dtype: bool,
 ['var1'])

In [228]:
df[(df.var1 > 15)] [["var1", "var2"]]

Unnamed: 0,var1,var2
1,17,8
2,28,2
4,24,21
6,22,23
7,22,19
8,28,16
9,29,27


In [17]:
import numpy as np
import pandas as pd
m = np.random.randint(1,30, size = (5,3))
df = pd.DataFrame(m , columns = ["var1", "var2", "var3"])
df

Unnamed: 0,var1,var2,var3
0,6,26,21
1,10,29,16
2,7,14,26
3,27,4,5
4,10,5,27


In [18]:
df2 = df + 99

In [19]:
df2

Unnamed: 0,var1,var2,var3
0,105,125,120
1,109,128,115
2,106,113,125
3,126,103,104
4,109,104,126


In [21]:
pd.concat([df,df2])

Unnamed: 0,var1,var2,var3
0,6,26,21
1,10,29,16
2,7,14,26
3,27,4,5
4,10,5,27
0,105,125,120
1,109,128,115
2,106,113,125
3,126,103,104
4,109,104,126


In [24]:
?pd.concat

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mobjs[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mIterable[0m[0;34m[[0m[0mForwardRef[0m[0;34m([0m[0;34m'NDFrame'[0m[0;34m)[0m[0;34m][0m[0;34m,[0m [0mMapping[0m[0;34m[[0m[0mUnion[0m[0;34m[[0m[0mHashable[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m[0;34m,[0m [0mForwardRef[0m[0;34m([0m[0;34m'NDFrame'[0m[0;34m)[0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mjoin[0m[0;34m=[0m[0;34m'outer'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mignore_index[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkeys[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlevels[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnames[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m

In [25]:
pd.concat([df,df2], ignore_index = True)

Unnamed: 0,var1,var2,var3
0,6,26,21
1,10,29,16
2,7,14,26
3,27,4,5
4,10,5,27
5,105,125,120
6,109,128,115
7,106,113,125
8,126,103,104
9,109,104,126


In [27]:
df.columns

Index(['var1', 'var2', 'var3'], dtype='object')

In [30]:
df2.columns = ["var1", "var2", "deg3"]

In [31]:
df2

Unnamed: 0,var1,var2,deg3
0,105,125,120
1,109,128,115
2,106,113,125
3,126,103,104
4,109,104,126


In [33]:
df

Unnamed: 0,var1,var2,var3
0,6,26,21
1,10,29,16
2,7,14,26
3,27,4,5
4,10,5,27


In [34]:
pd.concat([df,df2])

Unnamed: 0,var1,var2,var3,deg3
0,6,26,21.0,
1,10,29,16.0,
2,7,14,26.0,
3,27,4,5.0,
4,10,5,27.0,
0,105,125,,120.0
1,109,128,,115.0
2,106,113,,125.0
3,126,103,,104.0
4,109,104,,126.0


In [35]:
pd.concat([df,df2], join = "inner")

Unnamed: 0,var1,var2
0,6,26
1,10,29
2,7,14
3,27,4
4,10,5
0,105,125
1,109,128
2,106,113
3,126,103
4,109,104


# İleri Birleştirme İşlemleri

In [41]:
import pandas as pd

In [42]:
#birebir birleştirme

In [62]:
df1 = pd.DataFrame({"calisanlar" : ["Ali", "Veli", "Yusuf", "Behram"],
                   "departman" : ["Muhasebe", "Marketing", "Marketing", "Veri Bilimi"]})
df1

Unnamed: 0,calisanlar,departman
0,Ali,Muhasebe
1,Veli,Marketing
2,Yusuf,Marketing
3,Behram,Veri Bilimi


In [64]:
df2 = pd.DataFrame({"calisanlar" : ["Yusuf", "Ali", "Veli", "Behram"],
                   "ilk_giris" : ["2018", "2019", "2016", "2021"]})
df2

Unnamed: 0,calisanlar,ilk_giris
0,Yusuf,2018
1,Ali,2019
2,Veli,2016
3,Behram,2021


In [65]:
pd.merge(df1,df2)

Unnamed: 0,calisanlar,departman,ilk_giris
0,Ali,Muhasebe,2019
1,Veli,Marketing,2016
2,Yusuf,Marketing,2018
3,Behram,Veri Bilimi,2021


In [66]:
pd.merge(df1,df2, on = "calisanlar")

Unnamed: 0,calisanlar,departman,ilk_giris
0,Ali,Muhasebe,2019
1,Veli,Marketing,2016
2,Yusuf,Marketing,2018
3,Behram,Veri Bilimi,2021


In [53]:
#coktan teke

In [67]:
df3 = pd.merge(df1,df2)

In [68]:
df3

Unnamed: 0,calisanlar,departman,ilk_giris
0,Ali,Muhasebe,2019
1,Veli,Marketing,2016
2,Yusuf,Marketing,2018
3,Behram,Veri Bilimi,2021


In [69]:
df4 = pd.DataFrame({"mudur" : ["Bayram", "Behnaz", "Reis"],
                   "departman" : ["Muhasebe", "Marketing",  "Veri Bilimi"]})
df4

Unnamed: 0,mudur,departman
0,Bayram,Muhasebe
1,Behnaz,Marketing
2,Reis,Veri Bilimi


In [70]:
pd.merge(df3,df4)

Unnamed: 0,calisanlar,departman,ilk_giris,mudur
0,Ali,Muhasebe,2019,Bayram
1,Veli,Marketing,2016,Behnaz
2,Yusuf,Marketing,2018,Behnaz
3,Behram,Veri Bilimi,2021,Reis


In [71]:
#coktan coka

In [72]:
df5 = pd.DataFrame({"yetenekler" : ["Matematik", "Excel", "Swift", "Python", "R"],
                   "departman" : ["Muhasebe", "Marketing", "IOS Developer" ,"Veri Bilimi", "Veri Bilimi"]})
df5

Unnamed: 0,yetenekler,departman
0,Matematik,Muhasebe
1,Excel,Marketing
2,Swift,IOS Developer
3,Python,Veri Bilimi
4,R,Veri Bilimi


In [73]:
pd.merge(df1,df5)

Unnamed: 0,calisanlar,departman,yetenekler
0,Ali,Muhasebe,Matematik
1,Veli,Marketing,Excel
2,Yusuf,Marketing,Excel
3,Behram,Veri Bilimi,Python
4,Behram,Veri Bilimi,R


# Toplulaştırma ve Gruplama (Aggregation & Grouping)

Basit toplulaştırma fonksiyonları:

- count()
- first()
- last()
- mean()
- median()
- min()
- max()
- std()
- var()
- sum()

In [74]:
import seaborn as sns

In [78]:
?sns.load_dataset

[0;31mSignature:[0m [0msns[0m[0;34m.[0m[0mload_dataset[0m[0;34m([0m[0mname[0m[0;34m,[0m [0mcache[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mdata_home[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0;34m**[0m[0mkws[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Load an example dataset from the online repository (requires internet).

This function provides quick access to a small number of example datasets
that are useful for documenting seaborn or generating reproducible examples
for bug reports. It is not necessary for normal usage.

Note that some of the datasets have a small amount of preprocessing applied
to define a proper ordering for categorical variables.

Use :func:`get_dataset_names` to see a list of available datasets.

Parameters
----------
name : str
    Name of the dataset (``{name}.csv`` on
    https://github.com/mwaskom/seaborn-data).
cache : boolean, optional
    If True, try to load from the local cache first, and save to the cache
 

In [79]:
df = sns.load_dataset("planets")
df

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.300000,7.10,77.40,2006
1,Radial Velocity,1,874.774000,2.21,56.95,2008
2,Radial Velocity,1,763.000000,2.60,19.84,2011
3,Radial Velocity,1,326.030000,19.40,110.62,2007
4,Radial Velocity,1,516.220000,10.50,119.47,2009
...,...,...,...,...,...,...
1030,Transit,1,3.941507,,172.00,2006
1031,Transit,1,2.615864,,148.00,2007
1032,Transit,1,3.191524,,174.00,2007
1033,Transit,1,4.125083,,293.00,2008


In [80]:
df.head()


Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [81]:
df.shape

(1035, 6)

In [82]:
df.mean()

number               1.785507
orbital_period    2002.917596
mass                 2.638161
distance           264.069282
year              2009.070531
dtype: float64

In [83]:
df["mass"].mean()

2.6381605847953233

In [84]:
df["mass"].var()

14.58183312700122

In [85]:
df["mass"].sum()

1353.37638

In [86]:
df["mass"].min()

0.0036

In [87]:
df.describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,1035.0,992.0,513.0,808.0,1035.0
mean,1.785507,2002.917596,2.638161,264.069282,2009.070531
std,1.240976,26014.728304,3.818617,733.116493,3.972567
min,1.0,0.090706,0.0036,1.35,1989.0
25%,1.0,5.44254,0.229,32.56,2007.0
50%,1.0,39.9795,1.26,55.25,2010.0
75%,2.0,526.005,3.04,178.5,2012.0
max,7.0,730000.0,25.0,8500.0,2014.0


In [88]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number,1035.0,1.785507,1.240976,1.0,1.0,1.0,2.0,7.0
orbital_period,992.0,2002.917596,26014.728304,0.090706,5.44254,39.9795,526.005,730000.0
mass,513.0,2.638161,3.818617,0.0036,0.229,1.26,3.04,25.0
distance,808.0,264.069282,733.116493,1.35,32.56,55.25,178.5,8500.0
year,1035.0,2009.070531,3.972567,1989.0,2007.0,2010.0,2012.0,2014.0


In [89]:
df.dropna().describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number,498.0,1.73494,1.17572,1.0,1.0,1.0,2.0,6.0
orbital_period,498.0,835.778671,1469.128259,1.3283,38.27225,357.0,999.6,17337.5
mass,498.0,2.50932,3.636274,0.0036,0.2125,1.245,2.8675,25.0
distance,498.0,52.068213,46.596041,1.35,24.4975,39.94,59.3325,354.0
year,498.0,2007.37751,4.167284,1989.0,2005.0,2009.0,2011.0,2014.0


# Gruplama İşlemleri (Grouping)

In [90]:
df = pd.DataFrame({"gruplar" : ["A", "B","C","A","B","C"],
                "veri" : [10,11,52,14,57,43]}, columns = ["gruplar", "veri"])
df

Unnamed: 0,gruplar,veri
0,A,10
1,B,11
2,C,52
3,A,14
4,B,57
5,C,43


In [91]:
df.groupby("gruplar")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f9569a54760>

In [92]:
df.groupby("gruplar").mean()

Unnamed: 0_level_0,veri
gruplar,Unnamed: 1_level_1
A,12.0
B,34.0
C,47.5


In [93]:
df.groupby("gruplar").sum()

Unnamed: 0_level_0,veri
gruplar,Unnamed: 1_level_1
A,24
B,68
C,95


In [94]:
df = sns.load_dataset("planets")
df.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [95]:
df.groupby("method")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f9569a74dc0>

In [98]:
df.groupby("method")["orbital_period"]

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f9569a54460>

In [99]:
df.groupby("method")["orbital_period"].mean()

method
Astrometry                          631.180000
Eclipse Timing Variations          4751.644444
Imaging                          118247.737500
Microlensing                       3153.571429
Orbital Brightness Modulation         0.709307
Pulsar Timing                      7343.021201
Pulsation Timing Variations        1170.000000
Radial Velocity                     823.354680
Transit                              21.102073
Transit Timing Variations            79.783500
Name: orbital_period, dtype: float64

In [101]:
df.groupby("method")["orbital_period"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Astrometry,2.0,631.18,544.217663,246.36,438.77,631.18,823.59,1016.0
Eclipse Timing Variations,9.0,4751.644444,2499.130945,1916.25,2900.0,4343.5,5767.0,10220.0
Imaging,12.0,118247.7375,213978.177277,4639.15,8343.9,27500.0,94250.0,730000.0
Microlensing,7.0,3153.571429,1113.166333,1825.0,2375.0,3300.0,3550.0,5100.0
Orbital Brightness Modulation,3.0,0.709307,0.725493,0.240104,0.291496,0.342887,0.943908,1.544929
Pulsar Timing,5.0,7343.021201,16313.265573,0.090706,25.262,66.5419,98.2114,36525.0
Pulsation Timing Variations,1.0,1170.0,,1170.0,1170.0,1170.0,1170.0,1170.0
Radial Velocity,553.0,823.35468,1454.92621,0.73654,38.021,360.2,982.0,17337.5
Transit,397.0,21.102073,46.185893,0.355,3.16063,5.714932,16.1457,331.60059
Transit Timing Variations,3.0,79.7835,71.599884,22.3395,39.67525,57.011,108.5055,160.0


In [103]:
import pandas as pd
df = pd.DataFrame({"gruplar" : ["A", "B","C","A","B","C"],
                        "veri1" : [10,11,52,14,57,43],
                        "veri2" : [99,61,32,44,70,22]}, 
                        columns = ["gruplar", "veri1", "veri2"])
df

Unnamed: 0,gruplar,veri1,veri2
0,A,10,99
1,B,11,61
2,C,52,32
3,A,14,44
4,B,57,70
5,C,43,22


In [104]:
#aggregate

In [106]:
df.groupby("gruplar").mean()

Unnamed: 0_level_0,veri1,veri2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,12.0,71.5
B,34.0,65.5
C,47.5,27.0


In [111]:
df.groupby("gruplar").aggregate(["min", np.median, max])

Unnamed: 0_level_0,veri1,veri1,veri1,veri2,veri2,veri2
Unnamed: 0_level_1,min,median,max,min,median,max
gruplar,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,10,12.0,14,44,71.5,99
B,11,34.0,57,61,65.5,70
C,43,47.5,52,22,27.0,32


In [112]:
df.groupby("gruplar").aggregate({"veri1": "min", "veri2": "max"})

Unnamed: 0_level_0,veri1,veri2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,10,99
B,11,70
C,43,32


In [114]:
#filter

In [115]:
import pandas as pd
df = pd.DataFrame({"gruplar" : ["A", "B","C","A","B","C"],
                        "veri1" : [10,11,52,14,57,43],
                        "veri2" : [99,61,32,44,70,22]}, 
                        columns = ["gruplar", "veri1", "veri2"])
df

Unnamed: 0,gruplar,veri1,veri2
0,A,10,99
1,B,11,61
2,C,52,32
3,A,14,44
4,B,57,70
5,C,43,22


In [116]:
def filter_func(x):
    return x["veri1"].std() > 9

In [118]:
df.groupby("gruplar").std()

Unnamed: 0_level_0,veri1,veri2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2.828427,38.890873
B,32.526912,6.363961
C,6.363961,7.071068


In [117]:
df.groupby("gruplar").filter(filter_func)

Unnamed: 0,gruplar,veri1,veri2
1,B,11,61
4,B,57,70


In [None]:
# Transform

In [119]:
import pandas as pd
df = pd.DataFrame({"gruplar" : ["A", "B","C","A","B","C"],
                        "veri1" : [10,11,52,14,57,43],
                        "veri2" : [99,61,32,44,70,22]}, 
                        columns = ["gruplar", "veri1", "veri2"])
df

Unnamed: 0,gruplar,veri1,veri2
0,A,10,99
1,B,11,61
2,C,52,32
3,A,14,44
4,B,57,70
5,C,43,22


In [120]:
df["veri1"]*9

0     90
1     99
2    468
3    126
4    513
5    387
Name: veri1, dtype: int64

In [128]:
df_a = df.iloc[:,1:3]

In [129]:
df_a.mean()

veri1    31.166667
veri2    54.666667
dtype: float64

In [126]:
df_a.transform(lambda x : x - x.mean())

Unnamed: 0,veri1,veri2
0,-21.166667,44.333333
1,-20.166667,6.333333
2,20.833333,-22.666667
3,-17.166667,-10.666667
4,25.833333,15.333333
5,11.833333,-32.666667


In [132]:
df_a.transform(lambda x : x - x.mean() / x.std())

Unnamed: 0,veri1,veri2
0,8.574718,97.051426
1,9.574718,59.051426
2,50.574718,30.051426
3,12.574718,42.051426
4,55.574718,68.051426
5,41.574718,20.051426


In [133]:
#apply

In [134]:
import pandas as pd
df = pd.DataFrame({
                        "veri1" : [10,11,52,14,57,43],
                        "veri2" : [99,61,32,44,70,22]}, 
                        columns = [ "veri1", "veri2"])
df

Unnamed: 0,veri1,veri2
0,10,99
1,11,61
2,52,32
3,14,44
4,57,70
5,43,22


In [135]:
df.apply(np.sum)

veri1    187
veri2    328
dtype: int64

In [136]:
df.apply(np.mean)

veri1    31.166667
veri2    54.666667
dtype: float64

In [137]:
import pandas as pd
import seaborn as sns
titanic = sns.load_dataset("titanic")
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [139]:
#hayatta kalmak 1, ölmek 0 ile gösterilmektedir.
titanic.groupby("sex")["survived"].mean()

sex
female    0.742038
male      0.188908
Name: survived, dtype: float64

In [140]:
titanic.groupby("sex")[["survived"]].mean()

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [146]:
titanic.groupby(["sex", "class"])[["survived"]].aggregate("mean").unstack()

Unnamed: 0_level_0,survived,survived,survived
class,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [147]:
#pivot ile table

In [151]:
titanic.pivot_table("survived" , index = "sex", columns = "class")

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [152]:
titanic.pivot_table("survived" , index = "who", columns = "class")

class,First,Second,Third
who,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
child,0.833333,1.0,0.431034
man,0.352941,0.080808,0.119122
woman,0.978022,0.909091,0.491228


In [153]:
titanic.age.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64

In [159]:
age = pd.cut(titanic["age"], [0,18,90])
age.head(10)

0    (18.0, 90.0]
1    (18.0, 90.0]
2    (18.0, 90.0]
3    (18.0, 90.0]
4    (18.0, 90.0]
5             NaN
6    (18.0, 90.0]
7     (0.0, 18.0]
8    (18.0, 90.0]
9     (0.0, 18.0]
Name: age, dtype: category
Categories (2, interval[int64]): [(0, 18] < (18, 90]]

In [158]:
titanic.pivot_table("survived", ["sex", age], "class")

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 20]",0.928571,1.0,0.510638
female,"(20, 90]",0.971831,0.896552,0.418182
male,"(0, 20]",0.571429,0.526316,0.197368
male,"(20, 90]",0.382979,0.0625,0.129944


# Dış kaynaklı Veri okumak

In [162]:
pd.read_csv("reading_data/ornekcsv.csv", sep = ";")

Unnamed: 0,a,b,c
0,78,12,1.0
1,78,12,2.0
2,78,324,3.0
3,7,2,4.0
4,88,23,5.0
5,6,2,
6,56,11,6.0
7,7,12,7.0
8,56,21,7.0
9,346,2,8.0


In [163]:
#txt okumak
pd.read_csv("reading_data/duz_metin.txt")

Unnamed: 0,1 2
0,2 2
1,3 2
2,4 2
3,5 2
4,6 2
5,7 2
6,8 2
7,9 2
8,10 2


In [167]:
#excel okumak
df = pd.read_excel("reading_data/ornekx.xlsx")
df

Unnamed: 0,a,b,c
0,78,12,1.0
1,78,12,2.0
2,78,324,3.0
3,7,2,4.0
4,88,23,5.0
5,6,2,
6,56,11,6.0
7,7,12,7.0
8,56,21,7.0
9,346,2,8.0


In [168]:
df.head()

Unnamed: 0,a,b,c
0,78,12,1.0
1,78,12,2.0
2,78,324,3.0
3,7,2,4.0
4,88,23,5.0
