# Pandas Serisi Oluşturmak

In [1]:
import pandas as pd

In [3]:
pd.Series([1,2,3,4,5]) # numpy arraylerinden farkı indekslerle birlikte verilmesi

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [4]:
seri = pd.Series([1,2,3,4,5])

In [5]:
type(seri)

pandas.core.series.Series

In [7]:
seri.axes #Serinin indeks bilgilerini erişir

[RangeIndex(start=0, stop=5, step=1)]

In [10]:
seri.dtype

dtype('int64')

In [9]:
seri.size

5

In [11]:
seri.ndim

1

In [12]:
seri.values #numpy gibi vektör olarak erişmek istersem

array([1, 2, 3, 4, 5], dtype=int64)

In [13]:
seri.head(3) #serinin ilk 3 gözlemine erişmek için

0    1
1    2
2    3
dtype: int64

In [14]:
seri.tail(3) #serinin elemanlarına sondan bakmak istersek örn son 3 elaman

2    3
3    4
4    5
dtype: int64

In [15]:
#indeks isimlendirmesi

In [16]:
pd.Series([11,75,36,45,85])

0    11
1    75
2    36
3    45
4    85
dtype: int64

In [17]:
#indeks isimlendirmelerini kendim belirlemek istiyorum dersek

In [18]:
pd.Series([11,75,36,45,85], index = [1,3,5,7,9])

1    11
3    75
5    36
7    45
9    85
dtype: int64

In [19]:
#indeks isimlendirmesini int değil de string olarak vermek için

In [20]:
pd.Series([11,75,36,45,85], index = ["a","b","c","d","e"])

a    11
b    75
c    36
d    45
e    85
dtype: int64

In [21]:
seri = pd.Series([11,75,36,45,85], index = ["a","b","c","d","e"])

In [22]:
seri["a"]

11

In [23]:
seri["a":"c"]

a    11
b    75
c    36
dtype: int64

In [24]:
#Sozluk üzerinden liste olusturmak

In [25]:
sozluk = {"reg" : 10, "log": 11, "cart": 12}

In [26]:
seri = pd.Series(sozluk)

In [27]:
seri

reg     10
log     11
cart    12
dtype: int64

In [33]:
#iki seriyi birlestirerek seri olusturma

In [34]:
a = pd.Series([1,2,3])
b = pd.Series([4,5,6])

In [36]:
pd.concat([a,b])

0    1
1    2
2    3
0    4
1    5
2    6
dtype: int64

# Eleman İşlemleri

In [37]:
import numpy as np
a = np.array([1,2,53,486,54])
seri = pd.Series(a)
seri

0      1
1      2
2     53
3    486
4     54
dtype: int32

In [38]:
seri[0]

1

In [40]:
seri[0:3]

0     1
1     2
2    53
dtype: int32

In [41]:
seri = pd.Series([11,14,15,19], index = ["cey","bu","hüs","ar"])

In [42]:
seri

cey    11
bu     14
hüs    15
ar     19
dtype: int64

In [43]:
seri.index

Index(['cey', 'bu', 'hüs', 'ar'], dtype='object')

In [44]:
seri.keys

<bound method Series.keys of cey    11
bu     14
hüs    15
ar     19
dtype: int64>

In [49]:
list(seri.items()) # key-value şeklinde gelmesini istiyorsak

[('cey', 11), ('bu', 14), ('hüs', 15), ('ar', 19)]

In [50]:
tuple(seri.items())

(('cey', 11), ('bu', 14), ('hüs', 15), ('ar', 19))

In [51]:
seri.values

array([11, 14, 15, 19], dtype=int64)

In [52]:
#eleman sorgulama

In [54]:
"cey" in seri # serinin içinde mi?

True

In [55]:
"reg" in seri

False

In [56]:
#fancy eleman

In [59]:
seri[["cey", "bu"]]

cey    11
bu     14
dtype: int64

In [60]:
seri["hüs"] = 19

In [61]:
seri

cey    11
bu     14
hüs    19
ar     19
dtype: int64

In [62]:
#Slice ile fancy

In [65]:
seri["cey":"ar"]

cey    11
bu     14
hüs    19
ar     19
dtype: int64

# Pandas DataFrame Oluşturma

In [7]:
import pandas as pd

In [8]:
l = [11,75,46,635,63]

In [9]:
l

[11, 75, 46, 635, 63]

In [10]:
pd.DataFrame(l, columns = ["degisken_ismi"])

Unnamed: 0,degisken_ismi
0,11
1,75
2,46
3,635
4,63


In [11]:
import numpy as np
m = np.arange(1,10).reshape((3,3))
m

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [12]:
pd.DataFrame(m, columns = ["var1","var2","var3"])

Unnamed: 0,var1,var2,var3
0,1,2,3
1,4,5,6
2,7,8,9


In [13]:
#df isimlendirme

In [14]:
df = pd.DataFrame(m, columns = ["var1","var2","var3"])
df.head()

Unnamed: 0,var1,var2,var3
0,1,2,3
1,4,5,6
2,7,8,9


In [15]:
df.columns # degisken isimlerini getirir

Index(['var1', 'var2', 'var3'], dtype='object')

In [16]:
df.columns = ("deg1","deg2","deg3")

In [17]:
df

Unnamed: 0,deg1,deg2,deg3
0,1,2,3
1,4,5,6
2,7,8,9


In [18]:
df.axes

[RangeIndex(start=0, stop=3, step=1),
 Index(['deg1', 'deg2', 'deg3'], dtype='object')]

In [19]:
df.ndim

2

In [20]:
df.size

9

In [21]:
df.shape

(3, 3)

In [22]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [23]:
type(df.values)

numpy.ndarray

In [24]:
df.head()

Unnamed: 0,deg1,deg2,deg3
0,1,2,3
1,4,5,6
2,7,8,9


In [25]:
df.tail(1)

Unnamed: 0,deg1,deg2,deg3
2,7,8,9


In [26]:
a = np.array([1,2,3,4,5])
a

array([1, 2, 3, 4, 5])

In [27]:
pd.DataFrame(a, columns = ["deg1"])

Unnamed: 0,deg1
0,1
1,2
2,3
3,4
4,5


In [28]:
b = pd.Series([1,2,3,4,5,6])
b

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64

In [29]:
pd.DataFrame(b, columns = ["degisken"])

Unnamed: 0,degisken
0,1
1,2
2,3
3,4
4,5
5,6


# DataFrame Eleman Islemleri 

In [30]:
import numpy as np
s1 = np.random.randint(10, size = 5)
s2 = np.random.randint(10, size = 5)
s3 = np.random.randint(10, size = 5)

In [31]:
pd.DataFrame(s1, columns = ["s1"])

Unnamed: 0,s1
0,5
1,0
2,7
3,9
4,1


In [32]:
sozluk = {"var1": s1, "var2": s2, "var3":s3} #Sozluk üzerinden df oluşturma

In [33]:
sozluk

{'var1': array([5, 0, 7, 9, 1]),
 'var2': array([4, 4, 3, 5, 3]),
 'var3': array([4, 8, 8, 0, 0])}

In [48]:
df = pd.DataFrame(sozluk)
df

Unnamed: 0,var1,var2,var3
0,5,4,4
1,0,4,8
2,7,3,8
3,9,5,0
4,1,3,0


In [49]:
df[0:1]

Unnamed: 0,var1,var2,var3
0,5,4,4


In [50]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [51]:
df.index = ["a","b","c","d","e"]

In [52]:
df

Unnamed: 0,var1,var2,var3
a,5,4,4
b,0,4,8
c,7,3,8
d,9,5,0
e,1,3,0


In [53]:
df["c":"e"]

Unnamed: 0,var1,var2,var3
c,7,3,8
d,9,5,0
e,1,3,0


In [54]:
#Silme 

In [55]:
df.drop("a", axis = 0) # axis 0 yapıyoruz ki satırı almak istediğimiz bilgisini pandasa vermiş olalım

Unnamed: 0,var1,var2,var3
b,0,4,8
c,7,3,8
d,9,5,0
e,1,3,0


In [56]:
df

Unnamed: 0,var1,var2,var3
a,5,4,4
b,0,4,8
c,7,3,8
d,9,5,0
e,1,3,0


In [57]:
df.drop("a", axis = 0, inplace = True) # inplace: df üzerinden yapılan işlemin kalıcı olmasını sağlar

In [58]:
df

Unnamed: 0,var1,var2,var3
b,0,4,8
c,7,3,8
d,9,5,0
e,1,3,0


In [59]:
#fancy ile silme ("c" ve "e" satırını)

In [60]:
l = ["c","e"]

In [61]:
df.drop(l, axis = 0)

Unnamed: 0,var1,var2,var3
b,0,4,8
d,9,5,0


In [62]:
df

Unnamed: 0,var1,var2,var3
b,0,4,8
c,7,3,8
d,9,5,0
e,1,3,0


In [63]:
#degiskenler için

In [64]:
"var1" in df

True

In [65]:
l = ["var1", "var4", "var2"]

In [66]:
for i in l:
    print(i in df)

True
False
True


In [67]:
df

Unnamed: 0,var1,var2,var3
b,0,4,8
c,7,3,8
d,9,5,0
e,1,3,0


In [74]:
df["var1"]

b    0
c    7
d    9
e    1
Name: var1, dtype: int32

In [73]:
df["var4"] = df["var1"] * df["var2"]

In [75]:
df

Unnamed: 0,var1,var2,var3,var4
b,0,4,8,0
c,7,3,8,21
d,9,5,0,45
e,1,3,0,3


In [76]:
#degisken silme

In [77]:
df.drop("var4", axis = 1)

Unnamed: 0,var1,var2,var3
b,0,4,8
c,7,3,8
d,9,5,0
e,1,3,0


In [78]:
df

Unnamed: 0,var1,var2,var3,var4
b,0,4,8,0
c,7,3,8,21
d,9,5,0,45
e,1,3,0,3


In [79]:
df.drop("var4", axis = 1, inplace = True)

In [80]:
df

Unnamed: 0,var1,var2,var3
b,0,4,8
c,7,3,8
d,9,5,0
e,1,3,0


In [81]:
# fancy ile silme

In [82]:
l = ["var1","var2"]

In [83]:
df.drop(l, axis = 1)

Unnamed: 0,var3
b,8
c,8
d,0
e,0


# Gözlem ve Değişken Seçimi: loc & iloc

In [2]:
import numpy as np
import pandas as pd
m = np.random.randint(1,30, size = (10,3))
df = pd.DataFrame(m, columns = ["var1","var2","var3"])
df

Unnamed: 0,var1,var2,var3
0,20,21,2
1,15,15,13
2,21,4,16
3,26,29,10
4,10,3,7
5,1,25,28
6,16,23,4
7,2,27,18
8,18,27,10
9,9,17,12


In [3]:
#loc: tanımlandığı şekliyle seçim yapmak için kullanılır.

In [4]:
df.loc[0:3]

Unnamed: 0,var1,var2,var3
0,20,21,2
1,15,15,13
2,21,4,16
3,26,29,10


In [5]:
#iloc: alışık olduğumuz indeksleme mantığıyla seçim yapar.

In [6]:
df.iloc[0:3]

Unnamed: 0,var1,var2,var3
0,20,21,2
1,15,15,13
2,21,4,16


In [7]:
df.iloc[:3,:2]

Unnamed: 0,var1,var2
0,20,21
1,15,15
2,21,4


In [8]:
df.loc[0:3, "var3"]

0     2
1    13
2    16
3    10
Name: var3, dtype: int32

In [11]:
#eğer değişken ya da gözlem birimleri yani satırlarla ilgili mutlak
#bir değer işaretlemesi yapacaksak "loc" kullanmamız gerekiyor.

#eğer ben bunlara takılmıyorum alışık olduğum bi indeksli seçim yaklaşımım
#var dersek "iloc" kullanıcaz.

#özetle eğer verilen kurallara göre seçim yapılma işlemi varsa gözlem ya da
#değisken isimlendirmeleri açısından "loc" yapısı kullanır.
#eğer verilen isimlendirmelerden bağımsız ben bunlara takılmıyorum klasik
#index mantığıyla seçim yapmak istiyorum dersek de "iloc" kullanılır

In [10]:
df.iloc[0:,]["var3"]

0     2
1    13
2    16
3    10
4     7
5    28
6     4
7    18
8    10
9    12
Name: var3, dtype: int32

In [12]:
df

Unnamed: 0,var1,var2,var3
0,20,21,2
1,15,15,13
2,21,4,16
3,26,29,10
4,10,3,7
5,1,25,28
6,16,23,4
7,2,27,18
8,18,27,10
9,9,17,12


In [13]:
df["var1"]

0    20
1    15
2    21
3    26
4    10
5     1
6    16
7     2
8    18
9     9
Name: var1, dtype: int32

In [16]:
df[0:2] [["var1","var2"]]

Unnamed: 0,var1,var2
0,20,21
1,15,15


In [19]:
df[df.var1 > 15]["var1"]

0    20
2    21
3    26
6    16
8    18
Name: var1, dtype: int32

In [32]:
df[(df.var1>15) & (df.var2<1)]

Unnamed: 0,var1,var2,var3


In [37]:
df.loc[(df.var1>15), ["var1","var2"]]

Unnamed: 0,var1,var2
0,20,21
2,21,4
3,26,29
6,16,23
8,18,27


In [41]:
df[(df.var1>15)] [["var1","var2"]] #loc kullanmadak yapmak istersek

Unnamed: 0,var1,var2
0,20,21
2,21,4
3,26,29
6,16,23
8,18,27


# Birleştirme (Join) İşlemleri

In [2]:
import numpy as np
import pandas as pd
m = np.random.randint(1,30, size = (5,3))
df1 = pd.DataFrame(m, columns = ["var1","var2","var3"])
df1

Unnamed: 0,var1,var2,var3
0,19,26,2
1,29,23,16
2,12,29,20
3,2,25,15
4,19,19,11


In [3]:
df2 = df1 + 99

In [4]:
df2

Unnamed: 0,var1,var2,var3
0,118,125,101
1,128,122,115
2,111,128,119
3,101,124,114
4,118,118,110


In [5]:
pd.concat([df1,df2])

Unnamed: 0,var1,var2,var3
0,19,26,2
1,29,23,16
2,12,29,20
3,2,25,15
4,19,19,11
0,118,125,101
1,128,122,115
2,111,128,119
3,101,124,114
4,118,118,110


In [6]:
pd.concat([df1, df2], ignore_index = True)

Unnamed: 0,var1,var2,var3
0,19,26,2
1,29,23,16
2,12,29,20
3,2,25,15
4,19,19,11
5,118,125,101
6,128,122,115
7,111,128,119
8,101,124,114
9,118,118,110


In [7]:
df1.columns

Index(['var1', 'var2', 'var3'], dtype='object')

In [8]:
df2.columns = ["var1","var2","deg3"]

In [9]:
df2.columns

Index(['var1', 'var2', 'deg3'], dtype='object')

In [10]:
pd.concat([df1,df2])

Unnamed: 0,var1,var2,var3,deg3
0,19,26,2.0,
1,29,23,16.0,
2,12,29,20.0,
3,2,25,15.0,
4,19,19,11.0,
0,118,125,,101.0
1,128,122,,115.0
2,111,128,,119.0
3,101,124,,114.0
4,118,118,,110.0


In [11]:
pd.concat([df1,df2], join = "inner") #kesisimlerini getirdi

Unnamed: 0,var1,var2
0,19,26
1,29,23
2,12,29
3,2,25
4,19,19
0,118,125
1,128,122
2,111,128
3,101,124
4,118,118


In [21]:
pd.concat([df1, df2], join_axes = [df2.columns], ignore_index: True)

SyntaxError: positional argument follows keyword argument (267727474.py, line 1)

# İleri Birleştirme İşlemleri

In [22]:
import pandas as pd

In [23]:
#birebir birleştirme

In [45]:
df1 = pd.DataFrame({'calisanlar': ['Ceyda','Bugra','Huseyin','Can'],
                   'grup':['Muhasebe','Muhendislik','Muhendislik','İK']})
df1

Unnamed: 0,calisanlar,grup
0,Ceyda,Muhasebe
1,Bugra,Muhendislik
2,Huseyin,Muhendislik
3,Can,İK


In [46]:
df2 = pd.DataFrame({'calisanlar': ['Ceyda','Bugra','Huseyin','Can'],
                   'ilk_giris': [2010,2009,2014,2019]})
df2

Unnamed: 0,calisanlar,ilk_giris
0,Ceyda,2010
1,Bugra,2009
2,Huseyin,2014
3,Can,2019


In [47]:
pd.merge(df1, df2)

Unnamed: 0,calisanlar,grup,ilk_giris
0,Ceyda,Muhasebe,2010
1,Bugra,Muhendislik,2009
2,Huseyin,Muhendislik,2014
3,Can,İK,2019


In [48]:
pd.merge(df1, df2, on = 'calisanlar')

Unnamed: 0,calisanlar,grup,ilk_giris
0,Ceyda,Muhasebe,2010
1,Bugra,Muhendislik,2009
2,Huseyin,Muhendislik,2014
3,Can,İK,2019


In [49]:
#çoktan teke

In [50]:
df3 = pd.merge(df1, df2)
df3

Unnamed: 0,calisanlar,grup,ilk_giris
0,Ceyda,Muhasebe,2010
1,Bugra,Muhendislik,2009
2,Huseyin,Muhendislik,2014
3,Can,İK,2019


In [51]:
df4 = pd.DataFrame({'grup':['Mühendislik','Tıp','İK'],
                   'mudur': ['Ceyda','Bugra','Huseyin']})
df4

Unnamed: 0,grup,mudur
0,Mühendislik,Ceyda
1,Tıp,Bugra
2,İK,Huseyin


In [52]:
pd.merge(df3, df4)

Unnamed: 0,calisanlar,grup,ilk_giris,mudur
0,Can,İK,2019,Huseyin


In [53]:
#çoktan çoka

In [54]:
df5 = pd.DataFrame({'grup': ['Muhasebe','Muhasebe','Muhendislik','IK','İK','Muhendislik'],
                   'yetenekler': ['matematik','excel','kodlama','excel','yönetim','linux']})
df5

Unnamed: 0,grup,yetenekler
0,Muhasebe,matematik
1,Muhasebe,excel
2,Muhendislik,kodlama
3,IK,excel
4,İK,yönetim
5,Muhendislik,linux


In [55]:
df1

Unnamed: 0,calisanlar,grup
0,Ceyda,Muhasebe
1,Bugra,Muhendislik
2,Huseyin,Muhendislik
3,Can,İK


In [56]:
pd.merge(df1, df5)

Unnamed: 0,calisanlar,grup,yetenekler
0,Ceyda,Muhasebe,matematik
1,Ceyda,Muhasebe,excel
2,Bugra,Muhendislik,kodlama
3,Bugra,Muhendislik,linux
4,Huseyin,Muhendislik,kodlama
5,Huseyin,Muhendislik,linux
6,Can,İK,yönetim


# **Toplulaştırma ve Gruplama (Aggregation & Grouping)**

**Basit toplulaştırma fonksiyonlar:**

*count()

*first()

*last()

*mean()

*median()

*min()

*max()

*std()

*var()

*sum()

In [57]:
import seaborn as sns

In [59]:
df = sns.load_dataset("planets")
df

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.300000,7.10,77.40,2006
1,Radial Velocity,1,874.774000,2.21,56.95,2008
2,Radial Velocity,1,763.000000,2.60,19.84,2011
3,Radial Velocity,1,326.030000,19.40,110.62,2007
4,Radial Velocity,1,516.220000,10.50,119.47,2009
...,...,...,...,...,...,...
1030,Transit,1,3.941507,,172.00,2006
1031,Transit,1,2.615864,,148.00,2007
1032,Transit,1,3.191524,,174.00,2007
1033,Transit,1,4.125083,,293.00,2008


In [60]:
df.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [61]:
df.shape

(1035, 6)

In [62]:
df.count()

method            1035
number            1035
orbital_period     992
mass               513
distance           808
year              1035
dtype: int64

In [64]:
df["mass"].mean()

2.6381605847953233

In [65]:
df["mass"].count()

513

In [67]:
df["mass"].max()

25.0

In [68]:
df["mass"].min()

0.0036

In [69]:
df["mass"].std()

3.8186166509616046

In [70]:
df["mass"].sum()

1353.37638

In [71]:
df["mass"].var()

14.58183312700122

In [None]:
#describe: tek tek ele almak yerine bir arada hepsini görmek istiyorsak

In [72]:
df.describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,1035.0,992.0,513.0,808.0,1035.0
mean,1.785507,2002.917596,2.638161,264.069282,2009.070531
std,1.240976,26014.728304,3.818617,733.116493,3.972567
min,1.0,0.090706,0.0036,1.35,1989.0
25%,1.0,5.44254,0.229,32.56,2007.0
50%,1.0,39.9795,1.26,55.25,2010.0
75%,2.0,526.005,3.04,178.5,2012.0
max,7.0,730000.0,25.0,8500.0,2014.0


In [73]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number,1035.0,1.785507,1.240976,1.0,1.0,1.0,2.0,7.0
orbital_period,992.0,2002.917596,26014.728304,0.090706,5.44254,39.9795,526.005,730000.0
mass,513.0,2.638161,3.818617,0.0036,0.229,1.26,3.04,25.0
distance,808.0,264.069282,733.116493,1.35,32.56,55.25,178.5,8500.0
year,1035.0,2009.070531,3.972567,1989.0,2007.0,2010.0,2012.0,2014.0


In [75]:
#dropna(): eksik değerleri ele almadan hesapla

#T: Transpozunu al demek

In [76]:
df.dropna().describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number,498.0,1.73494,1.17572,1.0,1.0,1.0,2.0,6.0
orbital_period,498.0,835.778671,1469.128259,1.3283,38.27225,357.0,999.6,17337.5
mass,498.0,2.50932,3.636274,0.0036,0.2125,1.245,2.8675,25.0
distance,498.0,52.068213,46.596041,1.35,24.4975,39.94,59.3325,354.0
year,498.0,2007.37751,4.167284,1989.0,2005.0,2009.0,2011.0,2014.0


# **Gruplama İşlemleri**

In [79]:
df = pd.DataFrame({'gruplar': ['A','B','C','A','B','C'],
                  'veri': ['11','17','23','55','43','77']},columns = ['gruplar','veri'])
df

Unnamed: 0,gruplar,veri
0,A,11
1,B,17
2,C,23
3,A,55
4,B,43
5,C,77


In [80]:
df.groupby("gruplar")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001FD1D4D1220>

In [81]:
df.groupby("gruplar").mean()

Unnamed: 0_level_0,veri
gruplar,Unnamed: 1_level_1
A,577.5
B,871.5
C,1188.5


In [82]:
df.groupby("gruplar").sum()

Unnamed: 0_level_0,veri
gruplar,Unnamed: 1_level_1
A,1155
B,1743
C,2377


In [83]:
df = sns.load_dataset("planets")
df.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [84]:
df.groupby("method")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001FD1D4B2DC0>

In [86]:
df.groupby("method")["orbital_period"].sum()

method
Astrometry                       1.262360e+03
Eclipse Timing Variations        4.276480e+04
Imaging                          1.418973e+06
Microlensing                     2.207500e+04
Orbital Brightness Modulation    2.127920e+00
Pulsar Timing                    3.671511e+04
Pulsation Timing Variations      1.170000e+03
Radial Velocity                  4.553151e+05
Transit                          8.377523e+03
Transit Timing Variations        2.393505e+02
Name: orbital_period, dtype: float64

In [87]:
df.groupby("method")["mass"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Astrometry,0.0,,,,,,,
Eclipse Timing Variations,2.0,5.125,1.308148,4.2,4.6625,5.125,5.5875,6.05
Imaging,0.0,,,,,,,
Microlensing,0.0,,,,,,,
Orbital Brightness Modulation,0.0,,,,,,,
Pulsar Timing,0.0,,,,,,,
Pulsation Timing Variations,0.0,,,,,,,
Radial Velocity,510.0,2.630699,3.825883,0.0036,0.22525,1.26,3.0,25.0
Transit,1.0,1.47,,1.47,1.47,1.47,1.47,1.47
Transit Timing Variations,0.0,,,,,,,


# **İleri Toplulaştırma İşlemleri (Aggregate, filter, transform, apply)**

In [90]:
import pandas as pd
df = pd.DataFrame({'gruplar': ['A','B','C','A','B','C'],
                  'degisken1': [10,23,33,22,11,99],
                  'degisken2': [100,456,856,368,456,369]},
                 columns = ['gruplar','degisken1','degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,456
2,C,33,856
3,A,22,368
4,B,11,456
5,C,99,369


In [91]:
#aggregate

In [92]:
df.groupby("gruplar").mean()

Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,16.0,234.0
B,17.0,456.0
C,66.0,612.5


In [94]:
df.groupby("gruplar").aggregate([min, np.median,max])

Unnamed: 0_level_0,degisken1,degisken1,degisken1,degisken2,degisken2,degisken2
Unnamed: 0_level_1,min,median,max,min,median,max
gruplar,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,10,16.0,22,100,234.0,368
B,11,17.0,23,456,456.0,456
C,33,66.0,99,369,612.5,856


In [95]:
df.groupby("gruplar").aggregate({"degisken1": "min", "degisken2" : "max"})

Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,10,368
B,11,456
C,33,856


In [96]:
#filter

In [106]:
import pandas as pd
df = pd.DataFrame({'gruplar': ['A','B','C','A','B','C'],
                  'degisken1': [10,23,33,22,11,99],
                  'degisken2': [100,456,856,368,456,369]},
                 columns = ['gruplar','degisken1','degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,456
2,C,33,856
3,A,22,368
4,B,11,456
5,C,99,369


In [107]:
def filter_func(x):
    return x["degisken1"].std() > 9

In [109]:
df.groupby("gruplar").std()

Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,8.485281,189.504617
B,8.485281,0.0
C,46.669048,344.361002


In [108]:
df.groupby("gruplar").filter(filter_func)

Unnamed: 0,gruplar,degisken1,degisken2
2,C,33,856
5,C,99,369


In [3]:
#transform

In [4]:
import pandas as pd
df = pd.DataFrame({'gruplar': ['A','B','C','A','B','C'],
                  'degisken1': [10,23,33,22,11,99],
                  'degisken2': [100,456,856,368,456,369]},
                 columns = ['gruplar','degisken1','degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,456
2,C,33,856
3,A,22,368
4,B,11,456
5,C,99,369


In [4]:
import pandas as pd
df = pd.DataFrame({'gruplar': ['A','B','C','A','B','C'],
                  'degisken1': [10,23,33,22,11,99],
                  'degisken2': [100,456,856,368,456,369]},
                 columns = ['gruplar','degisken1','degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,456
2,C,33,856
3,A,22,368
4,B,11,456
5,C,99,369


In [4]:
import pandas as pd
df = pd.DataFrame({'gruplar': ['A','B','C','A','B','C'],
                  'degisken1': [10,23,33,22,11,99],
                  'degisken2': [100,456,856,368,456,369]},
                 columns = ['gruplar','degisken1','degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,456
2,C,33,856
3,A,22,368
4,B,11,456
5,C,99,369


In [4]:
import pandas as pd
df = pd.DataFrame({'gruplar': ['A','B','C','A','B','C'],
                  'degisken1': [10,23,33,22,11,99],
                  'degisken2': [100,456,856,368,456,369]},
                 columns = ['gruplar','degisken1','degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,456
2,C,33,856
3,A,22,368
4,B,11,456
5,C,99,369


In [4]:
import pandas as pd
df = pd.DataFrame({'gruplar': ['A','B','C','A','B','C'],
                  'degisken1': [10,23,33,22,11,99],
                  'degisken2': [100,456,856,368,456,369]},
                 columns = ['gruplar','degisken1','degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,456
2,C,33,856
3,A,22,368
4,B,11,456
5,C,99,369


In [4]:
import pandas as pd
df = pd.DataFrame({'gruplar': ['A','B','C','A','B','C'],
                  'degisken1': [10,23,33,22,11,99],
                  'degisken2': [100,456,856,368,456,369]},
                 columns = ['gruplar','degisken1','degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,456
2,C,33,856
3,A,22,368
4,B,11,456
5,C,99,369


In [5]:
df["degisken1"]*9

0     90
1    207
2    297
3    198
4     99
5    891
Name: degisken1, dtype: int64

In [7]:
df_a = df.iloc[:, 1:3]

In [10]:
df_a.transform(lambda x: x-x.mean() /x.std()) #lambda: bir fonk isimlendirmesi yapmadan fonk oluşturmaya yarar

Unnamed: 0,degisken1,degisken2
0,9.013055,98.225121
1,22.013055,454.225121
2,32.013055,854.225121
3,21.013055,366.225121
4,10.013055,454.225121
5,98.013055,367.225121


In [14]:
#apply

tıpkı transform fonk ve filter fonk gibi dataframein değişkenlerin üzerinde gezinme yeteneği olan ve toplulaştırma amacıyla kullanılabilcek fonksiyondur.

In [16]:
import pandas as pd
import numpy as np
df = pd.DataFrame({
                  'degisken1': [10,23,33,22,11,99],
                  'degisken2': [100,456,856,368,456,369]},
                 columns = ['degisken1','degisken2'])
df

Unnamed: 0,degisken1,degisken2
0,10,100
1,23,456
2,33,856
3,22,368
4,11,456
5,99,369


In [17]:
df.apply(np.sum)

degisken1     198
degisken2    2605
dtype: int64

In [18]:
df.apply(np.sum)

degisken1     198
degisken2    2605
dtype: int64

In [19]:
df.apply(np.mean)

degisken1     33.000000
degisken2    434.166667
dtype: float64

In [20]:
#aynı işlemi grup bazında yapsaydık nasıl olurdu

In [21]:
import pandas as pd
df = pd.DataFrame({'gruplar': ['A','B','C','A','B','C'],
                  'degisken1': [10,23,33,22,11,99],
                  'degisken2': [100,456,856,368,456,369]},
                 columns = ['gruplar','degisken1','degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,456
2,C,33,856
3,A,22,368
4,B,11,456
5,C,99,369


In [22]:
df.groupby("gruplar").apply(np.sum)

Unnamed: 0_level_0,gruplar,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,AA,32,468
B,BB,34,912
C,CC,132,1225


In [33]:
df.groupby("gruplar").apply(np.mean)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,16.0,234.0
B,17.0,456.0
C,66.0,612.5


# **Pivot Tablolar**

In [None]:
#groupby'ın çok boyutlu versiyonu olarak düşünülebilir

In [34]:
import pandas as pd
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [36]:
titanic.groupby("sex")[["survived"]].mean()

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [None]:
#unstack(): hiyerarşik index yapısını çözmek için kullanılır

In [39]:
titanic.groupby(["sex","class"])[["survived"]].aggregate("mean").unstack()

Unnamed: 0_level_0,survived,survived,survived
class,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [42]:
#unstack() kullanmadan elde edilen çıktı

In [43]:
titanic.groupby(["sex","class"])[["survived"]].aggregate("mean")

Unnamed: 0_level_0,Unnamed: 1_level_0,survived
sex,class,Unnamed: 2_level_1
female,First,0.968085
female,Second,0.921053
female,Third,0.5
male,First,0.368852
male,Second,0.157407
male,Third,0.135447


In [44]:
#pivot ile table

In [45]:
titanic.pivot_table("survived", index = "sex", columns = "class")

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [47]:
titanic.age.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64

In [49]:
age = pd.cut(titanic["age"], [0,18,90])
age.head(10)

0    (18.0, 90.0]
1    (18.0, 90.0]
2    (18.0, 90.0]
3    (18.0, 90.0]
4    (18.0, 90.0]
5             NaN
6    (18.0, 90.0]
7     (0.0, 18.0]
8    (18.0, 90.0]
9     (0.0, 18.0]
Name: age, dtype: category
Categories (2, interval[int64, right]): [(0, 18] < (18, 90]]

In [50]:
titanic.pivot_table("survived", ["sex", age], "class")

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 90]",0.972973,0.9,0.423729
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 90]",0.375,0.071429,0.133663


# **Dış Kaynaklı Veri Okumak**

In [51]:
import pandas as pd

In [59]:
?pd.read_csv

[1;31mSignature:[0m
[0mpd[0m[1;33m.[0m[0mread_csv[0m[1;33m([0m[1;33m
[0m    [0mfilepath_or_buffer[0m[1;33m:[0m [1;34m'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]'[0m[1;33m,[0m[1;33m
[0m    [0msep[0m[1;33m=[0m[1;33m<[0m[0mno_default[0m[1;33m>[0m[1;33m,[0m[1;33m
[0m    [0mdelimiter[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mheader[0m[1;33m=[0m[1;34m'infer'[0m[1;33m,[0m[1;33m
[0m    [0mnames[0m[1;33m=[0m[1;33m<[0m[0mno_default[0m[1;33m>[0m[1;33m,[0m[1;33m
[0m    [0mindex_col[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0musecols[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0msqueeze[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mprefix[0m[1;33m=[0m[1;33m<[0m[0mno_default[0m[1;33m>[0m[1;33m,[0m[1;33m
[0m    [0mmangle_dupe_cols[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mdtype[0m[1;33m:[0m [1;34m'DtypeArg | None'[0m [1;33

In [54]:
#csv okuma
pd.read_csv("ornekcsv.csv", sep = ";")

Unnamed: 0,a,b,c
0,78,12,1.0
1,78,12,2.0
2,78,324,3.0
3,7,2,4.0
4,88,23,5.0
5,6,2,
6,56,11,6.0
7,7,12,7.0
8,56,21,7.0
9,346,2,8.0


In [57]:
#txt okuma
pd.read_csv("duz_metin.txt")

Unnamed: 0,1 2
0,2 2
1,3 2
2,4 2
3,5 2
4,6 2
5,7 2
6,8 2
7,9 2
8,10 2


In [61]:
#excel
pd.read_excel("ornekx.xlsx")

Unnamed: 0,a,b,c
0,78,12,1.0
1,78,12,2.0
2,78,324,3.0
3,7,2,4.0
4,88,23,5.0
5,6,2,
6,56,11,6.0
7,7,12,7.0
8,56,21,7.0
9,346,2,8.0


In [62]:
df = pd.read_excel("ornekx.xlsx")

In [63]:
type(df)

pandas.core.frame.DataFrame

In [64]:
df.head()

Unnamed: 0,a,b,c
0,78,12,1.0
1,78,12,2.0
2,78,324,3.0
3,7,2,4.0
4,88,23,5.0


In [66]:
df.columns = ("A","B","C")
df

Unnamed: 0,A,B,C
0,78,12,1.0
1,78,12,2.0
2,78,324,3.0
3,7,2,4.0
4,88,23,5.0
5,6,2,
6,56,11,6.0
7,7,12,7.0
8,56,21,7.0
9,346,2,8.0
